Diffstat (limited to 'target')
37 files changed, 6879 insertions, 2212 deletions
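The series below adds ARM MTE (Memory Tagging Extension) support to target/arm, including new "tag-memory" and "secure-tag-memory" link properties on the CPU object (see the cpu.c hunk). The board-side code that actually provides that tag memory lives in hw/arm/virt.c and is outside this 'target'-limited diffstat. As a rough illustration only, a board might wire the property along the lines of the sketch below; the function name, the "mte-tag-ram" region name, and the sizing policy are hypothetical, and the value-before-name object_property_set_link() argument order assumed here is the pre-QEMU-5.1 one (later QEMU puts the property name second).

#include "qemu/osdep.h"
#include "exec/memory.h"
#include "qapi/error.h"
#include "qom/object.h"

/*
 * Sketch (assumptions noted above): board-side wiring of the CPU
 * "tag-memory" link property added by the cpu.c hunk below.  MTE stores
 * one 4-bit tag per 16-byte granule, so the backing tag RAM is sized at
 * 1/32 of the normal RAM it covers.
 */
static void wire_mte_tag_memory(Object *cpuobj, uint64_t ram_size)
{
    MemoryRegion *tag_mem = g_new(MemoryRegion, 1);

    /* Allocate RAM to hold the allocation tags. */
    memory_region_init_ram(tag_mem, NULL, "mte-tag-ram",
                           ram_size / 32, &error_fatal);

    /* Connect it to the CPU's "tag-memory" link before realize. */
    object_property_set_link(cpuobj, OBJECT(tag_mem),
                             "tag-memory", &error_abort);
}

The 1/32 ratio follows from TAG_GRANULE being 16 bytes with a 4-bit tag each, which is the same assumption behind the realize-time check in the cpu.c hunk that the DC ZVA block length is at least 2 * TAG_GRANULE (two tag nibbles per byte of tag storage).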
diff --git a/target/arm/Makefile.objs b/target/arm/Makefile.objs index 83febd232c..fa39fd7c83 100644 --- a/target/arm/Makefile.objs +++ b/target/arm/Makefile.objs @@ -86,3 +86,4 @@ obj-$(CONFIG_SOFTMMU) += psci.o obj-$(TARGET_AARCH64) += translate-a64.o helper-a64.o obj-$(TARGET_AARCH64) += translate-sve.o sve_helper.o obj-$(TARGET_AARCH64) += pauth_helper.o +obj-$(TARGET_AARCH64) += mte_helper.o diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 5b7a36b5d7..5050e1843a 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -203,6 +203,9 @@ static void arm_cpu_reset(DeviceState *dev) * Enable TBI0 and TBI1. While the real kernel only enables TBI0, * turning on both here will produce smaller code and otherwise * make no difference to the user-level emulation. + * + * In sve_probe_page, we assume that this is set. + * Do not modify this without other changes. */ env->cp15.tcr_el[1].raw_tcr = (3ULL << 37); #else @@ -1108,7 +1111,7 @@ static void arm_set_pmu(Object *obj, bool value, Error **errp) ARMCPU *cpu = ARM_CPU(obj); if (value) { - if (kvm_enabled() && !kvm_arm_pmu_supported(CPU(cpu))) { + if (kvm_enabled() && !kvm_arm_pmu_supported()) { error_setg(errp, "'pmu' feature not supported by KVM on this host"); return; } @@ -1249,6 +1252,25 @@ void arm_cpu_post_init(Object *obj) if (kvm_enabled()) { kvm_arm_add_vcpu_properties(obj); } + +#ifndef CONFIG_USER_ONLY + if (arm_feature(&cpu->env, ARM_FEATURE_AARCH64) && + cpu_isar_feature(aa64_mte, cpu)) { + object_property_add_link(obj, "tag-memory", + TYPE_MEMORY_REGION, + (Object **)&cpu->tag_memory, + qdev_prop_allow_set_link_before_realize, + OBJ_PROP_LINK_STRONG); + + if (arm_feature(&cpu->env, ARM_FEATURE_EL3)) { + object_property_add_link(obj, "secure-tag-memory", + TYPE_MEMORY_REGION, + (Object **)&cpu->secure_tag_memory, + qdev_prop_allow_set_link_before_realize, + OBJ_PROP_LINK_STRONG); + } + } +#endif } static void arm_cpu_finalizefn(Object *obj) @@ -1738,18 +1760,43 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) #ifndef CONFIG_USER_ONLY MachineState *ms = MACHINE(qdev_get_machine()); unsigned int smp_cpus = ms->smp.cpus; + bool has_secure = cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY); - if (cpu->has_el3 || arm_feature(env, ARM_FEATURE_M_SECURITY)) { - cs->num_ases = 2; + /* + * We must set cs->num_ases to the final value before + * the first call to cpu_address_space_init. + */ + if (cpu->tag_memory != NULL) { + cs->num_ases = 3 + has_secure; + } else { + cs->num_ases = 1 + has_secure; + } + if (has_secure) { if (!cpu->secure_memory) { cpu->secure_memory = cs->memory; } cpu_address_space_init(cs, ARMASIdx_S, "cpu-secure-memory", cpu->secure_memory); - } else { - cs->num_ases = 1; } + + if (cpu->tag_memory != NULL) { + cpu_address_space_init(cs, ARMASIdx_TagNS, "cpu-tag-memory", + cpu->tag_memory); + if (has_secure) { + cpu_address_space_init(cs, ARMASIdx_TagS, "cpu-tag-memory", + cpu->secure_tag_memory); + } + } else if (cpu_isar_feature(aa64_mte, cpu)) { + /* + * Since there is no tag memory, we can't meaningfully support MTE + * to its fullest. To avoid problems later, when we would come to + * use the tag memory, downgrade support to insns only. + */ + cpu->isar.id_aa64pfr1 = + FIELD_DP64(cpu->isar.id_aa64pfr1, ID_AA64PFR1, MTE, 1); + } + cpu_address_space_init(cs, ARMASIdx_NS, "cpu-memory", cs->memory); /* No core_count specified, default to smp_cpus. 
*/ @@ -1758,6 +1805,30 @@ static void arm_cpu_realizefn(DeviceState *dev, Error **errp) } #endif + if (tcg_enabled()) { + int dcz_blocklen = 4 << cpu->dcz_blocksize; + + /* + * We only support DCZ blocklen that fits on one page. + * + * Architectually this is always true. However TARGET_PAGE_SIZE + * is variable and, for compatibility with -machine virt-2.7, + * is only 1KiB, as an artifact of legacy ARMv5 subpage support. + * But even then, while the largest architectural DCZ blocklen + * is 2KiB, no cpu actually uses such a large blocklen. + */ + assert(dcz_blocklen <= TARGET_PAGE_SIZE); + + /* + * We only support DCZ blocksize >= 2*TAG_GRANULE, which is to say + * both nibbles of each byte storing tag data may be written at once. + * Since TAG_GRANULE is 16, this means that blocklen must be >= 32. + */ + if (cpu_isar_feature(aa64_mte, cpu)) { + assert(dcz_blocklen >= 2 * TAG_GRANULE); + } + } + qemu_init_vcpu(cs); cpu_reset(cs); @@ -2169,8 +2240,8 @@ static void arm_cpu_class_init(ObjectClass *oc, void *data) cc->tlb_fill = arm_cpu_tlb_fill; cc->debug_excp_handler = arm_debug_excp_handler; cc->debug_check_watchpoint = arm_debug_check_watchpoint; -#if !defined(CONFIG_USER_ONLY) cc->do_unaligned_access = arm_cpu_do_unaligned_access; +#if !defined(CONFIG_USER_ONLY) cc->do_transaction_failed = arm_cpu_do_transaction_failed; cc->adjust_watchpoint_address = arm_adjust_watchpoint_address; #endif /* CONFIG_TCG && !CONFIG_USER_ONLY */ diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 677584e5da..cf99dcca9f 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -502,6 +502,9 @@ typedef struct CPUARMState { uint64_t pmccfiltr_el0; /* Performance Monitor Filter Register */ uint64_t vpidr_el2; /* Virtualization Processor ID Register */ uint64_t vmpidr_el2; /* Virtualization Multiprocessor ID Register */ + uint64_t tfsr_el[4]; /* tfsre0_el1 is index 0. */ + uint64_t gcr_el1; + uint64_t rgsr_el1; } cp15; struct { @@ -789,6 +792,10 @@ struct ARMCPU { /* MemoryRegion to use for secure physical accesses */ MemoryRegion *secure_memory; + /* MemoryRegion to use for allocation tag accesses */ + MemoryRegion *tag_memory; + MemoryRegion *secure_tag_memory; + /* For v8M, pointer to the IDAU interface provided by board/SoC */ Object *idau; @@ -1282,6 +1289,7 @@ void pmu_init(ARMCPU *cpu); #define PSTATE_SS (1U << 21) #define PSTATE_PAN (1U << 22) #define PSTATE_UAO (1U << 23) +#define PSTATE_TCO (1U << 25) #define PSTATE_V (1U << 28) #define PSTATE_C (1U << 29) #define PSTATE_Z (1U << 30) @@ -2334,7 +2342,7 @@ static inline uint64_t cpreg_to_kvm_id(uint32_t cpregid) * migration or KVM state synchronization. (Typically this is for "registers" * which are actually used as instructions for cache maintenance and so on.) * IO indicates that this register does I/O and therefore its accesses - * need to be surrounded by gen_io_start()/gen_io_end(). In particular, + * need to be marked with gen_io_start() and also end the TB. In particular, * registers which implement clocks or timers require this. 
* RAISES_EXC is for when the read or write hook might raise an exception; * the generated code will synchronize the CPU state before calling the hook @@ -2356,7 +2364,9 @@ static inline uint64_t cpreg_to_kvm_id(uint32_t cpregid) #define ARM_CP_NZCV (ARM_CP_SPECIAL | 0x0300) #define ARM_CP_CURRENTEL (ARM_CP_SPECIAL | 0x0400) #define ARM_CP_DC_ZVA (ARM_CP_SPECIAL | 0x0500) -#define ARM_LAST_SPECIAL ARM_CP_DC_ZVA +#define ARM_CP_DC_GVA (ARM_CP_SPECIAL | 0x0600) +#define ARM_CP_DC_GZVA (ARM_CP_SPECIAL | 0x0700) +#define ARM_LAST_SPECIAL ARM_CP_DC_GZVA #define ARM_CP_FPU 0x1000 #define ARM_CP_SVE 0x2000 #define ARM_CP_NO_GDB 0x4000 @@ -2979,6 +2989,8 @@ typedef enum ARMMMUIdxBit { typedef enum ARMASIdx { ARMASIdx_NS = 0, ARMASIdx_S = 1, + ARMASIdx_TagNS = 2, + ARMASIdx_TagS = 3, } ARMASIdx; /* Return the Exception Level targeted by debug exceptions. */ @@ -3183,10 +3195,10 @@ typedef ARMCPU ArchCPU; * | | | TBFLAG_A32 | | * | | +-----+----------+ TBFLAG_AM32 | * | TBFLAG_ANY | |TBFLAG_M32| | - * | | +-+----------+--------------| - * | | | TBFLAG_A64 | - * +--------------+---------+---------------------------+ - * 31 20 15 0 + * | +-----------+----------+--------------| + * | | TBFLAG_A64 | + * +--------------+-------------------------------------+ + * 31 20 0 * * Unless otherwise noted, these bits are cached in env->hflags. */ @@ -3253,6 +3265,10 @@ FIELD(TBFLAG_A64, BT, 9, 1) FIELD(TBFLAG_A64, BTYPE, 10, 2) /* Not cached. */ FIELD(TBFLAG_A64, TBID, 12, 2) FIELD(TBFLAG_A64, UNPRIV, 14, 1) +FIELD(TBFLAG_A64, ATA, 15, 1) +FIELD(TBFLAG_A64, TCMA, 16, 2) +FIELD(TBFLAG_A64, MTE_ACTIVE, 18, 1) +FIELD(TBFLAG_A64, MTE0_ACTIVE, 19, 1) /** * cpu_mmu_index: @@ -3385,6 +3401,20 @@ static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno) /* Shared between translate-sve.c and sve_helper.c. */ extern const uint64_t pred_esz_masks[4]; +/* Helper for the macros below, validating the argument type. */ +static inline MemTxAttrs *typecheck_memtxattrs(MemTxAttrs *x) +{ + return x; +} + +/* + * Lvalue macros for ARM TLB bits that we must cache in the TCG TLB. + * Using these should be a bit more self-documenting than using the + * generic target bits directly. + */ +#define arm_tlb_bti_gp(x) (typecheck_memtxattrs(x)->target_tlb_bit0) +#define arm_tlb_mte_tagged(x) (typecheck_memtxattrs(x)->target_tlb_bit1) + /* * Naming convention for isar_feature functions: * Functions which test 32-bit ID registers should have _aa32_ in @@ -3814,6 +3844,16 @@ static inline bool isar_feature_aa64_bti(const ARMISARegisters *id) return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, BT) != 0; } +static inline bool isar_feature_aa64_mte_insn_reg(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, MTE) != 0; +} + +static inline bool isar_feature_aa64_mte(const ARMISARegisters *id) +{ + return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, MTE) >= 2; +} + static inline bool isar_feature_aa64_pmu_8_1(const ARMISARegisters *id) { return FIELD_EX64(id->id_aa64dfr0, ID_AA64DFR0, PMUVER) >= 4 && diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 778cecc2e6..a2f4733eed 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -266,7 +266,7 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp) /* Collect the set of vector lengths supported by KVM. 
*/ bitmap_zero(kvm_supported, ARM_MAX_VQ); - if (kvm_enabled() && kvm_arm_sve_supported(CPU(cpu))) { + if (kvm_enabled() && kvm_arm_sve_supported()) { kvm_arm_sve_get_vls(CPU(cpu), kvm_supported); } else if (kvm_enabled()) { assert(!cpu_isar_feature(aa64_sve, cpu)); @@ -473,7 +473,7 @@ static void cpu_max_set_sve_max_vq(Object *obj, Visitor *v, const char *name, return; } - if (kvm_enabled() && !kvm_arm_sve_supported(CPU(cpu))) { + if (kvm_enabled() && !kvm_arm_sve_supported()) { error_setg(errp, "cannot set sve-max-vq"); error_append_hint(errp, "SVE not supported by KVM on this host\n"); return; @@ -519,7 +519,7 @@ static void cpu_arm_set_sve_vq(Object *obj, Visitor *v, const char *name, return; } - if (value && kvm_enabled() && !kvm_arm_sve_supported(CPU(cpu))) { + if (value && kvm_enabled() && !kvm_arm_sve_supported()) { error_setg(errp, "cannot enable %s", name); error_append_hint(errp, "SVE not supported by KVM on this host\n"); return; @@ -556,7 +556,7 @@ static void cpu_arm_set_sve(Object *obj, Visitor *v, const char *name, return; } - if (value && kvm_enabled() && !kvm_arm_sve_supported(CPU(cpu))) { + if (value && kvm_enabled() && !kvm_arm_sve_supported()) { error_setg(errp, "'sve' feature not supported by KVM on this host"); return; } @@ -654,6 +654,11 @@ static void aarch64_max_initfn(Object *obj) t = cpu->isar.id_aa64pfr1; t = FIELD_DP64(t, ID_AA64PFR1, BT, 1); + /* + * Begin with full support for MTE; will be downgraded to MTE=1 + * during realize if the board provides no tag memory. + */ + t = FIELD_DP64(t, ID_AA64PFR1, MTE, 2); cpu->isar.id_aa64pfr1 = t; t = cpu->isar.id_aa64mmfr1; @@ -751,7 +756,7 @@ static void aarch64_cpu_set_aarch64(Object *obj, bool value, Error **errp) * uniform execution state like do_interrupt. */ if (value == false) { - if (!kvm_enabled() || !kvm_arm_aarch32_supported(CPU(cpu))) { + if (!kvm_enabled() || !kvm_arm_aarch32_supported()) { error_setg(errp, "'aarch64' feature cannot be disabled " "unless KVM is enabled and 32-bit EL1 " "is supported"); diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c index bc0649a44a..8682630ff6 100644 --- a/target/arm/helper-a64.c +++ b/target/arm/helper-a64.c @@ -1119,85 +1119,41 @@ void HELPER(dc_zva)(CPUARMState *env, uint64_t vaddr_in) * (which matches the usual QEMU behaviour of not implementing either * alignment faults or any memory attribute handling). */ - - ARMCPU *cpu = env_archcpu(env); - uint64_t blocklen = 4 << cpu->dcz_blocksize; + int blocklen = 4 << env_archcpu(env)->dcz_blocksize; uint64_t vaddr = vaddr_in & ~(blocklen - 1); + int mmu_idx = cpu_mmu_index(env, false); + void *mem; + + /* + * Trapless lookup. In addition to actual invalid page, may + * return NULL for I/O, watchpoints, clean pages, etc. + */ + mem = tlb_vaddr_to_host(env, vaddr, MMU_DATA_STORE, mmu_idx); #ifndef CONFIG_USER_ONLY - { + if (unlikely(!mem)) { + uintptr_t ra = GETPC(); + /* - * Slightly awkwardly, QEMU's TARGET_PAGE_SIZE may be less than - * the block size so we might have to do more than one TLB lookup. - * We know that in fact for any v8 CPU the page size is at least 4K - * and the block size must be 2K or less, but TARGET_PAGE_SIZE is only - * 1K as an artefact of legacy v5 subpage support being present in the - * same QEMU executable. So in practice the hostaddr[] array has - * two entries, given the current setting of TARGET_PAGE_BITS_MIN. + * Trap if accessing an invalid page. DC_ZVA requires that we supply + * the original pointer for an invalid page. 
But watchpoints require + * that we probe the actual space. So do both. */ - int maxidx = DIV_ROUND_UP(blocklen, TARGET_PAGE_SIZE); - void *hostaddr[DIV_ROUND_UP(2 * KiB, 1 << TARGET_PAGE_BITS_MIN)]; - int try, i; - unsigned mmu_idx = cpu_mmu_index(env, false); - TCGMemOpIdx oi = make_memop_idx(MO_UB, mmu_idx); - - assert(maxidx <= ARRAY_SIZE(hostaddr)); - - for (try = 0; try < 2; try++) { - - for (i = 0; i < maxidx; i++) { - hostaddr[i] = tlb_vaddr_to_host(env, - vaddr + TARGET_PAGE_SIZE * i, - 1, mmu_idx); - if (!hostaddr[i]) { - break; - } - } - if (i == maxidx) { - /* - * If it's all in the TLB it's fair game for just writing to; - * we know we don't need to update dirty status, etc. - */ - for (i = 0; i < maxidx - 1; i++) { - memset(hostaddr[i], 0, TARGET_PAGE_SIZE); - } - memset(hostaddr[i], 0, blocklen - (i * TARGET_PAGE_SIZE)); - return; - } + (void) probe_write(env, vaddr_in, 1, mmu_idx, ra); + mem = probe_write(env, vaddr, blocklen, mmu_idx, ra); + + if (unlikely(!mem)) { /* - * OK, try a store and see if we can populate the tlb. This - * might cause an exception if the memory isn't writable, - * in which case we will longjmp out of here. We must for - * this purpose use the actual register value passed to us - * so that we get the fault address right. + * The only remaining reason for mem == NULL is I/O. + * Just do a series of byte writes as the architecture demands. */ - helper_ret_stb_mmu(env, vaddr_in, 0, oi, GETPC()); - /* Now we can populate the other TLB entries, if any */ - for (i = 0; i < maxidx; i++) { - uint64_t va = vaddr + TARGET_PAGE_SIZE * i; - if (va != (vaddr_in & TARGET_PAGE_MASK)) { - helper_ret_stb_mmu(env, va, 0, oi, GETPC()); - } + for (int i = 0; i < blocklen; i++) { + cpu_stb_mmuidx_ra(env, vaddr + i, 0, mmu_idx, ra); } - } - - /* - * Slow path (probably attempt to do this to an I/O device or - * similar, or clearing of a block of code we have translations - * cached for). Just do a series of byte writes as the architecture - * demands. 
It's not worth trying to use a cpu_physical_memory_map(), - * memset(), unmap() sequence here because: - * + we'd need to account for the blocksize being larger than a page - * + the direct-RAM access case is almost always going to be dealt - * with in the fastpath code above, so there's no speed benefit - * + we would have to deal with the map returning NULL because the - * bounce buffer was in use - */ - for (i = 0; i < blocklen; i++) { - helper_ret_stb_mmu(env, vaddr + i, 0, oi, GETPC()); + return; } } -#else - memset(g2h(vaddr), 0, blocklen); #endif + + memset(mem, 0, blocklen); } diff --git a/target/arm/helper-a64.h b/target/arm/helper-a64.h index 3df7c185aa..5b0b699a50 100644 --- a/target/arm/helper-a64.h +++ b/target/arm/helper-a64.h @@ -103,3 +103,19 @@ DEF_HELPER_FLAGS_3(autda, TCG_CALL_NO_WG, i64, env, i64, i64) DEF_HELPER_FLAGS_3(autdb, TCG_CALL_NO_WG, i64, env, i64, i64) DEF_HELPER_FLAGS_2(xpaci, TCG_CALL_NO_RWG_SE, i64, env, i64) DEF_HELPER_FLAGS_2(xpacd, TCG_CALL_NO_RWG_SE, i64, env, i64) + +DEF_HELPER_FLAGS_3(mte_check1, TCG_CALL_NO_WG, i64, env, i32, i64) +DEF_HELPER_FLAGS_3(mte_checkN, TCG_CALL_NO_WG, i64, env, i32, i64) +DEF_HELPER_FLAGS_3(mte_check_zva, TCG_CALL_NO_WG, i64, env, i32, i64) +DEF_HELPER_FLAGS_3(irg, TCG_CALL_NO_RWG, i64, env, i64, i64) +DEF_HELPER_FLAGS_4(addsubg, TCG_CALL_NO_RWG_SE, i64, env, i64, s32, i32) +DEF_HELPER_FLAGS_3(ldg, TCG_CALL_NO_WG, i64, env, i64, i64) +DEF_HELPER_FLAGS_3(stg, TCG_CALL_NO_WG, void, env, i64, i64) +DEF_HELPER_FLAGS_3(stg_parallel, TCG_CALL_NO_WG, void, env, i64, i64) +DEF_HELPER_FLAGS_2(stg_stub, TCG_CALL_NO_WG, void, env, i64) +DEF_HELPER_FLAGS_3(st2g, TCG_CALL_NO_WG, void, env, i64, i64) +DEF_HELPER_FLAGS_3(st2g_parallel, TCG_CALL_NO_WG, void, env, i64, i64) +DEF_HELPER_FLAGS_2(st2g_stub, TCG_CALL_NO_WG, void, env, i64) +DEF_HELPER_FLAGS_2(ldgm, TCG_CALL_NO_WG, i64, env, i64) +DEF_HELPER_FLAGS_3(stgm, TCG_CALL_NO_WG, void, env, i64, i64) +DEF_HELPER_FLAGS_3(stzgm_tags, TCG_CALL_NO_WG, void, env, i64, i64) diff --git a/target/arm/helper-sve.h b/target/arm/helper-sve.h index 7a200755ac..63c4a087ca 100644 --- a/target/arm/helper-sve.h +++ b/target/arm/helper-sve.h @@ -1196,6 +1196,64 @@ DEF_HELPER_FLAGS_4(sve_ld1sds_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ld1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) 
+DEF_HELPER_FLAGS_4(sve_ld3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1hsu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1hds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1sds_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ld1sdu_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ld1sds_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + DEF_HELPER_FLAGS_4(sve_ldff1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldff1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldff1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) @@ -1227,6 +1285,55 @@ DEF_HELPER_FLAGS_4(sve_ldff1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldff1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldff1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bss_r_mte, TCG_CALL_NO_WG, 
void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hss_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hds_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1hh_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hsu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hdu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hss_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1hds_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sds_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1ss_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sdu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1sds_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldff1dd_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldff1dd_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + DEF_HELPER_FLAGS_4(sve_ldnf1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) @@ -1258,6 +1365,55 @@ DEF_HELPER_FLAGS_4(sve_ldnf1sds_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldnf1dd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_ldnf1dd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bsu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bdu_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bhs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bss_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1bds_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1hh_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hsu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hdu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hss_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1hds_be_r_mte, TCG_CALL_NO_WG, + void, 
env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1ss_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sdu_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1sds_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_ldnf1dd_le_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_ldnf1dd_be_r_mte, TCG_CALL_NO_WG, + void, env, ptr, tl, i32) + DEF_HELPER_FLAGS_4(sve_st1bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_st2bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_st3bb_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) @@ -1305,6 +1461,53 @@ DEF_HELPER_FLAGS_4(sve_st1hd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_st1sd_le_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) DEF_HELPER_FLAGS_4(sve_st1sd_be_r, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4bb_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4hh_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4hh_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4ss_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4ss_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4dd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st2dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st3dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st4dd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1bh_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) 
+DEF_HELPER_FLAGS_4(sve_st1bs_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1bd_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1hs_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1hd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1hs_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1hd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + +DEF_HELPER_FLAGS_4(sve_st1sd_le_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) +DEF_HELPER_FLAGS_4(sve_st1sd_be_r_mte, TCG_CALL_NO_WG, void, env, ptr, tl, i32) + DEF_HELPER_FLAGS_6(sve_ldbsu_zsu, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) DEF_HELPER_FLAGS_6(sve_ldhsu_le_zsu, TCG_CALL_NO_WG, @@ -1414,6 +1617,115 @@ DEF_HELPER_FLAGS_6(sve_ldsds_le_zd, TCG_CALL_NO_WG, DEF_HELPER_FLAGS_6(sve_ldsds_be_zd, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbsu_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldss_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldss_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbss_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbsu_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhsu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldss_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldss_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbss_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhss_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbdu_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_be_zsu_mte, TCG_CALL_NO_WG, + 
void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbdu_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldbdu_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhdu_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsdu_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_lddd_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldbds_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldhds_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldsds_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zsu, TCG_CALL_NO_WG, @@ -1523,6 +1835,115 @@ DEF_HELPER_FLAGS_6(sve_ldffsds_le_zd, TCG_CALL_NO_WG, DEF_HELPER_FLAGS_6(sve_ldffsds_be_zd, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbsu_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffss_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffss_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbss_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbsu_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, 
tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhsu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffss_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffss_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbss_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhss_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_ldffbdu_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhdu_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsdu_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffdd_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, 
ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffbds_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffhds_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_ldffsds_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + DEF_HELPER_FLAGS_6(sve_stbs_zsu, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) DEF_HELPER_FLAGS_6(sve_sths_le_zsu, TCG_CALL_NO_WG, @@ -1590,4 +2011,71 @@ DEF_HELPER_FLAGS_6(sve_stdd_le_zd, TCG_CALL_NO_WG, DEF_HELPER_FLAGS_6(sve_stdd_be_zd, TCG_CALL_NO_WG, void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stbs_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbs_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sths_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stss_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_le_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_be_zsu_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_le_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_be_zss_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + +DEF_HELPER_FLAGS_6(sve_stbd_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_sthd_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_le_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stsd_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_le_zd_mte, 
TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) +DEF_HELPER_FLAGS_6(sve_stdd_be_zd_mte, TCG_CALL_NO_WG, + void, env, ptr, ptr, ptr, tl, i32) + DEF_HELPER_FLAGS_4(sve2_pmull_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) diff --git a/target/arm/helper.c b/target/arm/helper.c index 972a766730..dc9c29f998 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -44,7 +44,8 @@ static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address, bool s1_is_el0, hwaddr *phys_ptr, MemTxAttrs *txattrs, int *prot, target_ulong *page_size_ptr, - ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs); + ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs) + __attribute__((nonnull)); #endif static void switch_mode(CPUARMState *env, int mode); @@ -2011,9 +2012,19 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) uint32_t valid_mask = 0x3fff; ARMCPU *cpu = env_archcpu(env); - if (arm_el_is_aa64(env, 3)) { + if (ri->state == ARM_CP_STATE_AA64) { value |= SCR_FW | SCR_AW; /* these two bits are RES1. */ valid_mask &= ~SCR_NET; + + if (cpu_isar_feature(aa64_lor, cpu)) { + valid_mask |= SCR_TLOR; + } + if (cpu_isar_feature(aa64_pauth, cpu)) { + valid_mask |= SCR_API | SCR_APK; + } + if (cpu_isar_feature(aa64_mte, cpu)) { + valid_mask |= SCR_ATA; + } } else { valid_mask &= ~(SCR_RW | SCR_ST); } @@ -2032,12 +2043,6 @@ static void scr_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t value) valid_mask &= ~SCR_SMD; } } - if (cpu_isar_feature(aa64_lor, cpu)) { - valid_mask |= SCR_TLOR; - } - if (cpu_isar_feature(aa64_pauth, cpu)) { - valid_mask |= SCR_API | SCR_APK; - } /* Clear all-context RES0 bits. */ value &= valid_mask; @@ -4697,6 +4702,22 @@ static void sctlr_write(CPUARMState *env, const ARMCPRegInfo *ri, { ARMCPU *cpu = env_archcpu(env); + if (arm_feature(env, ARM_FEATURE_PMSA) && !cpu->has_mpu) { + /* M bit is RAZ/WI for PMSA with no MPU implemented */ + value &= ~SCTLR_M; + } + + /* ??? Lots of these bits are not implemented. */ + + if (ri->state == ARM_CP_STATE_AA64 && !cpu_isar_feature(aa64_mte, cpu)) { + if (ri->opc1 == 6) { /* SCTLR_EL3 */ + value &= ~(SCTLR_ITFSB | SCTLR_TCF | SCTLR_ATA); + } else { + value &= ~(SCTLR_ITFSB | SCTLR_TCF0 | SCTLR_TCF | + SCTLR_ATA0 | SCTLR_ATA); + } + } + if (raw_read(env, ri) == value) { /* Skip the TLB flush if nothing actually changed; Linux likes * to do a lot of pointless SCTLR writes. @@ -4704,13 +4725,8 @@ static void sctlr_write(CPUARMState *env, const ARMCPRegInfo *ri, return; } - if (arm_feature(env, ARM_FEATURE_PMSA) && !cpu->has_mpu) { - /* M bit is RAZ/WI for PMSA with no MPU implemented */ - value &= ~SCTLR_M; - } - raw_write(env, ri, value); - /* ??? Lots of these bits are not implemented. */ + /* This may enable/disable the MMU, so do a TLB flush. */ tlb_flush(CPU(cpu)); @@ -5236,17 +5252,22 @@ static void do_hcr_write(CPUARMState *env, uint64_t value, uint64_t valid_mask) if (cpu_isar_feature(aa64_pauth, cpu)) { valid_mask |= HCR_API | HCR_APK; } + if (cpu_isar_feature(aa64_mte, cpu)) { + valid_mask |= HCR_ATA | HCR_DCT | HCR_TID5; + } } /* Clear RES0 bits. 
*/ value &= valid_mask; - /* These bits change the MMU setup: + /* + * These bits change the MMU setup: * HCR_VM enables stage 2 translation * HCR_PTW forbids certain page-table setups - * HCR_DC Disables stage1 and enables stage2 translation + * HCR_DC disables stage1 and enables stage2 translation + * HCR_DCT enables tagging on (disabled) stage1 translation */ - if ((env->cp15.hcr_el2 ^ value) & (HCR_VM | HCR_PTW | HCR_DC)) { + if ((env->cp15.hcr_el2 ^ value) & (HCR_VM | HCR_PTW | HCR_DC | HCR_DCT)) { tlb_flush(CPU(cpu)); } env->cp15.hcr_el2 = value; @@ -5861,6 +5882,9 @@ static void define_arm_vh_e2h_redirects_aliases(ARMCPU *cpu) { K(3, 0, 1, 2, 0), K(3, 4, 1, 2, 0), K(3, 5, 1, 2, 0), "ZCR_EL1", "ZCR_EL2", "ZCR_EL12", isar_feature_aa64_sve }, + { K(3, 0, 5, 6, 0), K(3, 4, 5, 6, 0), K(3, 5, 5, 6, 0), + "TFSR_EL1", "TFSR_EL2", "TFSR_EL12", isar_feature_aa64_mte }, + /* TODO: ARMv8.2-SPE -- PMSCR_EL2 */ /* TODO: ARMv8.4-Trace -- TRFCR_EL2 */ }; @@ -6835,6 +6859,165 @@ static const ARMCPRegInfo dcpodp_reg[] = { }; #endif /*CONFIG_USER_ONLY*/ +static CPAccessResult access_aa64_tid5(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + if ((arm_current_el(env) < 2) && (arm_hcr_el2_eff(env) & HCR_TID5)) { + return CP_ACCESS_TRAP_EL2; + } + + return CP_ACCESS_OK; +} + +static CPAccessResult access_mte(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + int el = arm_current_el(env); + + if (el < 2 && + arm_feature(env, ARM_FEATURE_EL2) && + !(arm_hcr_el2_eff(env) & HCR_ATA)) { + return CP_ACCESS_TRAP_EL2; + } + if (el < 3 && + arm_feature(env, ARM_FEATURE_EL3) && + !(env->cp15.scr_el3 & SCR_ATA)) { + return CP_ACCESS_TRAP_EL3; + } + return CP_ACCESS_OK; +} + +static uint64_t tco_read(CPUARMState *env, const ARMCPRegInfo *ri) +{ + return env->pstate & PSTATE_TCO; +} + +static void tco_write(CPUARMState *env, const ARMCPRegInfo *ri, uint64_t val) +{ + env->pstate = (env->pstate & ~PSTATE_TCO) | (val & PSTATE_TCO); +} + +static const ARMCPRegInfo mte_reginfo[] = { + { .name = "TFSRE0_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 5, .crm = 6, .opc2 = 1, + .access = PL1_RW, .accessfn = access_mte, + .fieldoffset = offsetof(CPUARMState, cp15.tfsr_el[0]) }, + { .name = "TFSR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 5, .crm = 6, .opc2 = 0, + .access = PL1_RW, .accessfn = access_mte, + .fieldoffset = offsetof(CPUARMState, cp15.tfsr_el[1]) }, + { .name = "TFSR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .crn = 5, .crm = 6, .opc2 = 0, + .access = PL2_RW, .accessfn = access_mte, + .fieldoffset = offsetof(CPUARMState, cp15.tfsr_el[2]) }, + { .name = "TFSR_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .crn = 5, .crm = 6, .opc2 = 0, + .access = PL3_RW, + .fieldoffset = offsetof(CPUARMState, cp15.tfsr_el[3]) }, + { .name = "RGSR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 5, + .access = PL1_RW, .accessfn = access_mte, + .fieldoffset = offsetof(CPUARMState, cp15.rgsr_el1) }, + { .name = "GCR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 0, .opc2 = 6, + .access = PL1_RW, .accessfn = access_mte, + .fieldoffset = offsetof(CPUARMState, cp15.gcr_el1) }, + { .name = "GMID_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 1, .crn = 0, .crm = 0, .opc2 = 4, + .access = PL1_R, .accessfn = access_aa64_tid5, + .type = ARM_CP_CONST, .resetvalue = GMID_EL1_BS }, + { .name = "TCO", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 4, .crm = 2, .opc2 = 7, + 
.type = ARM_CP_NO_RAW, + .access = PL0_RW, .readfn = tco_read, .writefn = tco_write }, + { .name = "DC_IGVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 3, + .type = ARM_CP_NOP, .access = PL1_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_IGSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 4, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + { .name = "DC_IGDVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 5, + .type = ARM_CP_NOP, .access = PL1_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_IGDSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 6, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + { .name = "DC_CGSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 10, .opc2 = 4, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + { .name = "DC_CGDSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 10, .opc2 = 6, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + { .name = "DC_CIGSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 14, .opc2 = 4, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + { .name = "DC_CIGDSW", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 14, .opc2 = 6, + .type = ARM_CP_NOP, .access = PL1_W, .accessfn = access_tsw }, + REGINFO_SENTINEL +}; + +static const ARMCPRegInfo mte_tco_ro_reginfo[] = { + { .name = "TCO", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 3, .crn = 4, .crm = 2, .opc2 = 7, + .type = ARM_CP_CONST, .access = PL0_RW, }, + REGINFO_SENTINEL +}; + +static const ARMCPRegInfo mte_el0_cacheop_reginfo[] = { + { .name = "DC_CGVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 3, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CGDVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 5, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CGVAP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 3, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CGDVAP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 5, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CGVADP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 3, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CGDVADP", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 5, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CIGVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 3, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_CIGDVAC", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 5, + .type = ARM_CP_NOP, .access = PL0_W, + .accessfn = aa64_cacheop_poc_access }, + { .name = "DC_GVA", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 3, + .access = PL0_W, .type = ARM_CP_DC_GVA, +#ifndef CONFIG_USER_ONLY + /* Avoid overhead of an access check that always passes in 
user-mode */ + .accessfn = aa64_zva_access, +#endif + }, + { .name = "DC_GZVA", .state = ARM_CP_STATE_AA64, + .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 4, + .access = PL0_W, .type = ARM_CP_DC_GZVA, +#ifndef CONFIG_USER_ONLY + /* Avoid overhead of an access check that always passes in user-mode */ + .accessfn = aa64_zva_access, +#endif + }, + REGINFO_SENTINEL +}; + #endif static CPAccessResult access_predinv(CPUARMState *env, const ARMCPRegInfo *ri, @@ -7960,6 +8143,19 @@ void register_cp_regs_for_features(ARMCPU *cpu) } } #endif /*CONFIG_USER_ONLY*/ + + /* + * If full MTE is enabled, add all of the system registers. + * If only "instructions available at EL0" are enabled, + * then define only a RAZ/WI version of PSTATE.TCO. + */ + if (cpu_isar_feature(aa64_mte, cpu)) { + define_arm_cp_regs(cpu, mte_reginfo); + define_arm_cp_regs(cpu, mte_el0_cacheop_reginfo); + } else if (cpu_isar_feature(aa64_mte_insn_reg, cpu)) { + define_arm_cp_regs(cpu, mte_tco_ro_reginfo); + define_arm_cp_regs(cpu, mte_el0_cacheop_reginfo); + } #endif if (cpu_isar_feature(any_predinv, cpu)) { @@ -9509,6 +9705,9 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs) break; } } + if (cpu_isar_feature(aa64_mte, cpu)) { + new_mode |= PSTATE_TCO; + } pstate_write(env, PSTATE_DAIF | new_mode); env->aarch64 = 1; @@ -9614,42 +9813,6 @@ void arm_cpu_do_interrupt(CPUState *cs) } #endif /* !CONFIG_USER_ONLY */ -/* Return the exception level which controls this address translation regime */ -static uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - switch (mmu_idx) { - case ARMMMUIdx_E20_0: - case ARMMMUIdx_E20_2: - case ARMMMUIdx_E20_2_PAN: - case ARMMMUIdx_Stage2: - case ARMMMUIdx_E2: - return 2; - case ARMMMUIdx_SE3: - return 3; - case ARMMMUIdx_SE10_0: - return arm_el_is_aa64(env, 3) ? 1 : 3; - case ARMMMUIdx_SE10_1: - case ARMMMUIdx_SE10_1_PAN: - case ARMMMUIdx_Stage1_E0: - case ARMMMUIdx_Stage1_E1: - case ARMMMUIdx_Stage1_E1_PAN: - case ARMMMUIdx_E10_0: - case ARMMMUIdx_E10_1: - case ARMMMUIdx_E10_1_PAN: - case ARMMMUIdx_MPrivNegPri: - case ARMMMUIdx_MUserNegPri: - case ARMMMUIdx_MPriv: - case ARMMMUIdx_MUser: - case ARMMMUIdx_MSPrivNegPri: - case ARMMMUIdx_MSUserNegPri: - case ARMMMUIdx_MSPriv: - case ARMMMUIdx_MSUser: - return 1; - default: - g_assert_not_reached(); - } -} - uint64_t arm_sctlr(CPUARMState *env, int el) { /* Only EL0 needs to be adjusted for EL1&0 or EL2&0. */ @@ -9732,15 +9895,6 @@ static inline uint64_t regime_ttbr(CPUARMState *env, ARMMMUIdx mmu_idx, #endif /* !CONFIG_USER_ONLY */ -/* Return the TCR controlling this translation regime */ -static inline TCR *regime_tcr(CPUARMState *env, ARMMMUIdx mmu_idx) -{ - if (mmu_idx == ARMMMUIdx_Stage2) { - return &env->cp15.vtcr_el2; - } - return &env->cp15.tcr_el[regime_el(env, mmu_idx)]; -} - /* Convert a possible stage1+2 MMU index into the appropriate * stage 1 MMU index */ @@ -10541,6 +10695,16 @@ static int aa64_va_parameter_tbid(uint64_t tcr, ARMMMUIdx mmu_idx) } } +static int aa64_va_parameter_tcma(uint64_t tcr, ARMMMUIdx mmu_idx) +{ + if (regime_has_2_ranges(mmu_idx)) { + return extract64(tcr, 57, 2); + } else { + /* Replicate the single TCMA bit so we always have 2 bits. */ + return extract32(tcr, 30, 1) * 3; + } +} + ARMVAParameters aa64_va_parameters(CPUARMState *env, uint64_t va, ARMMMUIdx mmu_idx, bool data) { @@ -10935,22 +11099,19 @@ static bool get_phys_addr_lpae(CPUARMState *env, target_ulong address, } /* When in aarch64 mode, and BTI is enabled, remember GP in the IOTLB. 
*/ if (aarch64 && guarded && cpu_isar_feature(aa64_bti, cpu)) { - txattrs->target_tlb_bit0 = true; + arm_tlb_bti_gp(txattrs) = true; } - if (cacheattrs != NULL) { - if (mmu_idx == ARMMMUIdx_Stage2) { - cacheattrs->attrs = convert_stage2_attrs(env, - extract32(attrs, 0, 4)); - } else { - /* Index into MAIR registers for cache attributes */ - uint8_t attrindx = extract32(attrs, 0, 3); - uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)]; - assert(attrindx <= 7); - cacheattrs->attrs = extract64(mair, attrindx * 8, 8); - } - cacheattrs->shareability = extract32(attrs, 6, 2); + if (mmu_idx == ARMMMUIdx_Stage2) { + cacheattrs->attrs = convert_stage2_attrs(env, extract32(attrs, 0, 4)); + } else { + /* Index into MAIR registers for cache attributes */ + uint8_t attrindx = extract32(attrs, 0, 3); + uint64_t mair = env->cp15.mair_el[regime_el(env, mmu_idx)]; + assert(attrindx <= 7); + cacheattrs->attrs = extract64(mair, attrindx * 8, 8); } + cacheattrs->shareability = extract32(attrs, 6, 2); *phys_ptr = descaddr; *page_size_ptr = page_size; @@ -11673,9 +11834,19 @@ static uint8_t combine_cacheattr_nibble(uint8_t s1, uint8_t s2) */ static ARMCacheAttrs combine_cacheattrs(ARMCacheAttrs s1, ARMCacheAttrs s2) { - uint8_t s1lo = extract32(s1.attrs, 0, 4), s2lo = extract32(s2.attrs, 0, 4); - uint8_t s1hi = extract32(s1.attrs, 4, 4), s2hi = extract32(s2.attrs, 4, 4); + uint8_t s1lo, s2lo, s1hi, s2hi; ARMCacheAttrs ret; + bool tagged = false; + + if (s1.attrs == 0xf0) { + tagged = true; + s1.attrs = 0xff; + } + + s1lo = extract32(s1.attrs, 0, 4); + s2lo = extract32(s2.attrs, 0, 4); + s1hi = extract32(s1.attrs, 4, 4); + s2hi = extract32(s2.attrs, 4, 4); /* Combine shareability attributes (table D4-43) */ if (s1.shareability == 2 || s2.shareability == 2) { @@ -11723,6 +11894,11 @@ static ARMCacheAttrs combine_cacheattrs(ARMCacheAttrs s1, ARMCacheAttrs s2) } } + /* TODO: CombineS1S2Desc does not consider transient, only WB, RWA. */ + if (tagged && ret.attrs == 0xff) { + ret.attrs = 0xf0; + } + return ret; } @@ -11785,28 +11961,32 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, ret = get_phys_addr_lpae(env, ipa, access_type, ARMMMUIdx_Stage2, mmu_idx == ARMMMUIdx_E10_0, phys_ptr, attrs, &s2_prot, - page_size, fi, - cacheattrs != NULL ? &cacheattrs2 : NULL); + page_size, fi, &cacheattrs2); fi->s2addr = ipa; /* Combine the S1 and S2 perms. */ *prot &= s2_prot; - /* Combine the S1 and S2 cache attributes, if needed */ - if (!ret && cacheattrs != NULL) { - if (env->cp15.hcr_el2 & HCR_DC) { - /* - * HCR.DC forces the first stage attributes to - * Normal Non-Shareable, - * Inner Write-Back Read-Allocate Write-Allocate, - * Outer Write-Back Read-Allocate Write-Allocate. - */ + /* If S2 fails, return early. */ + if (ret) { + return ret; + } + + /* Combine the S1 and S2 cache attributes. */ + if (env->cp15.hcr_el2 & HCR_DC) { + /* + * HCR.DC forces the first stage attributes to + * Normal Non-Shareable, + * Inner Write-Back Read-Allocate Write-Allocate, + * Outer Write-Back Read-Allocate Write-Allocate. + * Do not overwrite Tagged within attrs. + */ + if (cacheattrs->attrs != 0xf0) { cacheattrs->attrs = 0xff; - cacheattrs->shareability = 0; } - *cacheattrs = combine_cacheattrs(*cacheattrs, cacheattrs2); + cacheattrs->shareability = 0; } - - return ret; + *cacheattrs = combine_cacheattrs(*cacheattrs, cacheattrs2); + return 0; } else { /* * For non-EL2 CPUs a stage1+stage2 translation is just stage 1. 
@@ -11867,6 +12047,9 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, /* Definitely a real MMU, not an MPU */ if (regime_translation_disabled(env, mmu_idx)) { + uint64_t hcr; + uint8_t memattr; + /* * MMU disabled. S1 addresses within aa64 translation regimes are * still checked for bounds -- see AArch64.TranslateAddressS1Off. @@ -11904,6 +12087,27 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, *phys_ptr = address; *prot = PAGE_READ | PAGE_WRITE | PAGE_EXEC; *page_size = TARGET_PAGE_SIZE; + + /* Fill in cacheattr a-la AArch64.TranslateAddressS1Off. */ + hcr = arm_hcr_el2_eff(env); + cacheattrs->shareability = 0; + if (hcr & HCR_DC) { + if (hcr & HCR_DCT) { + memattr = 0xf0; /* Tagged, Normal, WB, RWA */ + } else { + memattr = 0xff; /* Normal, WB, RWA */ + } + } else if (access_type == MMU_INST_FETCH) { + if (regime_sctlr(env, mmu_idx) & SCTLR_I) { + memattr = 0xee; /* Normal, WT, RA, NT */ + } else { + memattr = 0x44; /* Normal, NC, No */ + } + cacheattrs->shareability = 2; /* outer sharable */ + } else { + memattr = 0x00; /* Device, nGnRnE */ + } + cacheattrs->attrs = memattr; return 0; } @@ -11931,11 +12135,12 @@ hwaddr arm_cpu_get_phys_page_attrs_debug(CPUState *cs, vaddr addr, bool ret; ARMMMUFaultInfo fi = {}; ARMMMUIdx mmu_idx = arm_mmu_idx(env); + ARMCacheAttrs cacheattrs = {}; *attrs = (MemTxAttrs) {}; ret = get_phys_addr(env, addr, 0, mmu_idx, &phys_addr, - attrs, &prot, &page_size, &fi, NULL); + attrs, &prot, &page_size, &fi, &cacheattrs); if (ret) { return -1; @@ -12565,6 +12770,36 @@ static uint32_t rebuild_hflags_a64(CPUARMState *env, int el, int fp_el, } } + if (cpu_isar_feature(aa64_mte, env_archcpu(env))) { + /* + * Set MTE_ACTIVE if any access may be Checked, and leave clear + * if all accesses must be Unchecked: + * 1) If no TBI, then there are no tags in the address to check, + * 2) If Tag Check Override, then all accesses are Unchecked, + * 3) If Tag Check Fail == 0, then Checked access have no effect, + * 4) If no Allocation Tag Access, then all accesses are Unchecked. + */ + if (allocation_tag_access_enabled(env, el, sctlr)) { + flags = FIELD_DP32(flags, TBFLAG_A64, ATA, 1); + if (tbid + && !(env->pstate & PSTATE_TCO) + && (sctlr & (el == 0 ? SCTLR_TCF0 : SCTLR_TCF))) { + flags = FIELD_DP32(flags, TBFLAG_A64, MTE_ACTIVE, 1); + } + } + /* And again for unprivileged accesses, if required. */ + if (FIELD_EX32(flags, TBFLAG_A64, UNPRIV) + && tbid + && !(env->pstate & PSTATE_TCO) + && (sctlr & SCTLR_TCF0) + && allocation_tag_access_enabled(env, 0, sctlr)) { + flags = FIELD_DP32(flags, TBFLAG_A64, MTE0_ACTIVE, 1); + } + /* Cache TCMA as well as TBI. 
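+ * One bit is kept per half of the address space: when the relevant
+ * TCMA bit is set, accesses whose logical tag is 0b0000 (bit 55 clear)
+ * or 0b1111 (bit 55 set) are made Unchecked.  At run time tcma_check()
+ * folds both cases into the single test ((ptr_tag + bit55) & 0xf) == 0.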
*/ + flags = FIELD_DP32(flags, TBFLAG_A64, TCMA, + aa64_va_parameter_tcma(tcr, mmu_idx)); + } + return rebuild_hflags_common(env, fp_el, mmu_idx, flags); } diff --git a/target/arm/helper.h b/target/arm/helper.h index 2a20c8174c..759639a63a 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -96,6 +96,8 @@ DEF_HELPER_FLAGS_1(rebuild_hflags_a32_newel, TCG_CALL_NO_RWG, void, env) DEF_HELPER_FLAGS_2(rebuild_hflags_a32, TCG_CALL_NO_RWG, void, env, int) DEF_HELPER_FLAGS_2(rebuild_hflags_a64, TCG_CALL_NO_RWG, void, env, int) +DEF_HELPER_FLAGS_5(probe_access, TCG_CALL_NO_WG, void, env, tl, i32, i32, i32) + DEF_HELPER_1(vfp_get_fpscr, i32, env) DEF_HELPER_2(vfp_set_fpscr, void, env, i32) diff --git a/target/arm/internals.h b/target/arm/internals.h index 4bdbc3a8ac..ae99725d2b 100644 --- a/target/arm/internals.h +++ b/target/arm/internals.h @@ -913,6 +913,51 @@ static inline bool regime_is_pan(CPUARMState *env, ARMMMUIdx mmu_idx) } } +/* Return the exception level which controls this address translation regime */ +static inline uint32_t regime_el(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + switch (mmu_idx) { + case ARMMMUIdx_E20_0: + case ARMMMUIdx_E20_2: + case ARMMMUIdx_E20_2_PAN: + case ARMMMUIdx_Stage2: + case ARMMMUIdx_E2: + return 2; + case ARMMMUIdx_SE3: + return 3; + case ARMMMUIdx_SE10_0: + return arm_el_is_aa64(env, 3) ? 1 : 3; + case ARMMMUIdx_SE10_1: + case ARMMMUIdx_SE10_1_PAN: + case ARMMMUIdx_Stage1_E0: + case ARMMMUIdx_Stage1_E1: + case ARMMMUIdx_Stage1_E1_PAN: + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E10_1: + case ARMMMUIdx_E10_1_PAN: + case ARMMMUIdx_MPrivNegPri: + case ARMMMUIdx_MUserNegPri: + case ARMMMUIdx_MPriv: + case ARMMMUIdx_MUser: + case ARMMMUIdx_MSPrivNegPri: + case ARMMMUIdx_MSUserNegPri: + case ARMMMUIdx_MSPriv: + case ARMMMUIdx_MSUser: + return 1; + default: + g_assert_not_reached(); + } +} + +/* Return the TCR controlling this translation regime */ +static inline TCR *regime_tcr(CPUARMState *env, ARMMMUIdx mmu_idx) +{ + if (mmu_idx == ARMMMUIdx_Stage2) { + return &env->cp15.vtcr_el2; + } + return &env->cp15.tcr_el[regime_el(env, mmu_idx)]; +} + /* Return the FSR value for a debug exception (watchpoint, hardware * breakpoint or BKPT insn) targeting the specified exception level. */ @@ -1159,6 +1204,9 @@ static inline uint32_t aarch64_pstate_valid_mask(const ARMISARegisters *id) if (isar_feature_aa64_uao(id)) { valid |= PSTATE_UAO; } + if (isar_feature_aa64_mte(id)) { + valid |= PSTATE_TCO; + } return valid; } @@ -1195,6 +1243,24 @@ static inline int exception_target_el(CPUARMState *env) return target_el; } +/* Determine if allocation tags are available. */ +static inline bool allocation_tag_access_enabled(CPUARMState *env, int el, + uint64_t sctlr) +{ + if (el < 3 + && arm_feature(env, ARM_FEATURE_EL3) + && !(env->cp15.scr_el3 & SCR_ATA)) { + return false; + } + if (el < 2 + && arm_feature(env, ARM_FEATURE_EL2) + && !(arm_hcr_el2_eff(env) & HCR_ATA)) { + return false; + } + sctlr &= (el == 0 ? SCTLR_ATA0 : SCTLR_ATA); + return sctlr != 0; +} + #ifndef CONFIG_USER_ONLY /* Security attributes for an address, as returned by v8m_security_lookup. 
*/ @@ -1228,10 +1294,95 @@ bool get_phys_addr(CPUARMState *env, target_ulong address, MMUAccessType access_type, ARMMMUIdx mmu_idx, hwaddr *phys_ptr, MemTxAttrs *attrs, int *prot, target_ulong *page_size, - ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs); + ARMMMUFaultInfo *fi, ARMCacheAttrs *cacheattrs) + __attribute__((nonnull)); void arm_log_exception(int idx); #endif /* !CONFIG_USER_ONLY */ +/* + * The log2 of the words in the tag block, for GMID_EL1.BS. + * The is the maximum, 256 bytes, which manipulates 64-bits of tags. + */ +#define GMID_EL1_BS 6 + +/* We associate one allocation tag per 16 bytes, the minimum. */ +#define LOG2_TAG_GRANULE 4 +#define TAG_GRANULE (1 << LOG2_TAG_GRANULE) + +/* + * The SVE simd_data field, for memory ops, contains either + * rd (5 bits) or a shift count (2 bits). + */ +#define SVE_MTEDESC_SHIFT 5 + +/* Bits within a descriptor passed to the helper_mte_check* functions. */ +FIELD(MTEDESC, MIDX, 0, 4) +FIELD(MTEDESC, TBI, 4, 2) +FIELD(MTEDESC, TCMA, 6, 2) +FIELD(MTEDESC, WRITE, 8, 1) +FIELD(MTEDESC, ESIZE, 9, 5) +FIELD(MTEDESC, TSIZE, 14, 10) /* mte_checkN only */ + +bool mte_probe1(CPUARMState *env, uint32_t desc, uint64_t ptr); +uint64_t mte_check1(CPUARMState *env, uint32_t desc, + uint64_t ptr, uintptr_t ra); +uint64_t mte_checkN(CPUARMState *env, uint32_t desc, + uint64_t ptr, uintptr_t ra); + +static inline int allocation_tag_from_addr(uint64_t ptr) +{ + return extract64(ptr, 56, 4); +} + +static inline uint64_t address_with_allocation_tag(uint64_t ptr, int rtag) +{ + return deposit64(ptr, 56, 4, rtag); +} + +/* Return true if tbi bits mean that the access is checked. */ +static inline bool tbi_check(uint32_t desc, int bit55) +{ + return (desc >> (R_MTEDESC_TBI_SHIFT + bit55)) & 1; +} + +/* Return true if tcma bits mean that the access is unchecked. */ +static inline bool tcma_check(uint32_t desc, int bit55, int ptr_tag) +{ + /* + * We had extracted bit55 and ptr_tag for other reasons, so fold + * (ptr<59:55> == 00000 || ptr<59:55> == 11111) into a single test. + */ + bool match = ((ptr_tag + bit55) & 0xf) == 0; + bool tcma = (desc >> (R_MTEDESC_TCMA_SHIFT + bit55)) & 1; + return tcma && match; +} + +/* + * For TBI, ideally, we would do nothing. Proper behaviour on fault is + * for the tag to be present in the FAR_ELx register. But for user-only + * mode, we do not have a TLB with which to implement this, so we must + * remove the top byte. + */ +static inline uint64_t useronly_clean_ptr(uint64_t ptr) +{ + /* TBI is known to be enabled. 
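+ * Note that sextract64(ptr, 0, 56) replicates bit 55 into the top
+ * byte: ordinary positive user-space addresses have their tag cleared
+ * to zero, while addresses with bit 55 set keep an all-ones top byte.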
*/ +#ifdef CONFIG_USER_ONLY + ptr = sextract64(ptr, 0, 56); +#endif + return ptr; +} + +static inline uint64_t useronly_maybe_clean_ptr(uint32_t desc, uint64_t ptr) +{ +#ifdef CONFIG_USER_ONLY + int64_t clean_ptr = sextract64(ptr, 0, 56); + if (tbi_check(desc, clean_ptr < 0)) { + ptr = clean_ptr; + } +#endif + return ptr; +} + #endif diff --git a/target/arm/kvm.c b/target/arm/kvm.c index eef3bbd1cc..7c672c78b8 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -208,9 +208,9 @@ void kvm_arm_add_vcpu_properties(Object *obj) } } -bool kvm_arm_pmu_supported(CPUState *cpu) +bool kvm_arm_pmu_supported(void) { - return kvm_check_extension(cpu->kvm_state, KVM_CAP_ARM_PMU_V3); + return kvm_check_extension(kvm_state, KVM_CAP_ARM_PMU_V3); } int kvm_arm_get_max_vm_ipa_size(MachineState *ms) diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index f09ed9f4df..3dc494aaa7 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -652,18 +652,14 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) return true; } -bool kvm_arm_aarch32_supported(CPUState *cpu) +bool kvm_arm_aarch32_supported(void) { - KVMState *s = KVM_STATE(current_accel()); - - return kvm_check_extension(s, KVM_CAP_ARM_EL1_32BIT); + return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT); } -bool kvm_arm_sve_supported(CPUState *cpu) +bool kvm_arm_sve_supported(void) { - KVMState *s = KVM_STATE(current_accel()); - - return kvm_check_extension(s, KVM_CAP_ARM_SVE); + return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE); } QEMU_BUILD_BUG_ON(KVM_ARM64_SVE_VQ_MIN != 1); @@ -798,7 +794,7 @@ int kvm_arch_init_vcpu(CPUState *cs) env->features &= ~(1ULL << ARM_FEATURE_PMU); } if (cpu_isar_feature(aa64_sve, cpu)) { - assert(kvm_arm_sve_supported(cs)); + assert(kvm_arm_sve_supported()); cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_SVE; } diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index 48bf5e16d5..a4ce4fd93d 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -269,29 +269,26 @@ void kvm_arm_add_vcpu_properties(Object *obj); /** * kvm_arm_aarch32_supported: - * @cs: CPUState * - * Returns: true if the KVM VCPU can enable AArch32 mode + * Returns: true if KVM can enable AArch32 mode * and false otherwise. */ -bool kvm_arm_aarch32_supported(CPUState *cs); +bool kvm_arm_aarch32_supported(void); /** * kvm_arm_pmu_supported: - * @cs: CPUState * - * Returns: true if the KVM VCPU can enable its PMU + * Returns: true if KVM can enable the PMU * and false otherwise. */ -bool kvm_arm_pmu_supported(CPUState *cs); +bool kvm_arm_pmu_supported(void); /** * kvm_arm_sve_supported: - * @cs: CPUState * - * Returns true if the KVM VCPU can enable SVE and false otherwise. + * Returns true if KVM can enable SVE and false otherwise. 
*/ -bool kvm_arm_sve_supported(CPUState *cs); +bool kvm_arm_sve_supported(void); /** * kvm_arm_get_max_vm_ipa_size: @@ -359,17 +356,17 @@ static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu) static inline void kvm_arm_add_vcpu_properties(Object *obj) {} -static inline bool kvm_arm_aarch32_supported(CPUState *cs) +static inline bool kvm_arm_aarch32_supported(void) { return false; } -static inline bool kvm_arm_pmu_supported(CPUState *cs) +static inline bool kvm_arm_pmu_supported(void) { return false; } -static inline bool kvm_arm_sve_supported(CPUState *cs) +static inline bool kvm_arm_sve_supported(void) { return false; } diff --git a/target/arm/m_helper.c b/target/arm/m_helper.c index 5e8a795d20..036454234c 100644 --- a/target/arm/m_helper.c +++ b/target/arm/m_helper.c @@ -187,12 +187,13 @@ static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, hwaddr physaddr; int prot; ARMMMUFaultInfo fi = {}; + ARMCacheAttrs cacheattrs = {}; bool secure = mmu_idx & ARM_MMU_IDX_M_S; int exc; bool exc_secure; if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &physaddr, - &attrs, &prot, &page_size, &fi, NULL)) { + &attrs, &prot, &page_size, &fi, &cacheattrs)) { /* MPU/SAU lookup failed */ if (fi.type == ARMFault_QEMU_SFault) { if (mode == STACK_LAZYFP) { @@ -279,13 +280,14 @@ static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, hwaddr physaddr; int prot; ARMMMUFaultInfo fi = {}; + ARMCacheAttrs cacheattrs = {}; bool secure = mmu_idx & ARM_MMU_IDX_M_S; int exc; bool exc_secure; uint32_t value; if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &physaddr, - &attrs, &prot, &page_size, &fi, NULL)) { + &attrs, &prot, &page_size, &fi, &cacheattrs)) { /* MPU/SAU lookup failed */ if (fi.type == ARMFault_QEMU_SFault) { qemu_log_mask(CPU_LOG_INT, @@ -1928,6 +1930,7 @@ static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, V8M_SAttributes sattrs = {}; MemTxAttrs attrs = {}; ARMMMUFaultInfo fi = {}; + ARMCacheAttrs cacheattrs = {}; MemTxResult txres; target_ulong page_size; hwaddr physaddr; @@ -1945,8 +1948,8 @@ static bool v7m_read_half_insn(ARMCPU *cpu, ARMMMUIdx mmu_idx, "...really SecureFault with SFSR.INVEP\n"); return false; } - if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, - &physaddr, &attrs, &prot, &page_size, &fi, NULL)) { + if (get_phys_addr(env, addr, MMU_INST_FETCH, mmu_idx, &physaddr, + &attrs, &prot, &page_size, &fi, &cacheattrs)) { /* the MPU lookup failed */ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_IACCVIOL_MASK; armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_MEM, env->v7m.secure); diff --git a/target/arm/mte_helper.c b/target/arm/mte_helper.c new file mode 100644 index 0000000000..5ea57d487a --- /dev/null +++ b/target/arm/mte_helper.c @@ -0,0 +1,906 @@ +/* + * ARM v8.5-MemTag Operations + * + * Copyright (c) 2020 Linaro, Ltd. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "internals.h" +#include "exec/exec-all.h" +#include "exec/ram_addr.h" +#include "exec/cpu_ldst.h" +#include "exec/helper-proto.h" + + +static int choose_nonexcluded_tag(int tag, int offset, uint16_t exclude) +{ + if (exclude == 0xffff) { + return 0; + } + if (offset == 0) { + while (exclude & (1 << tag)) { + tag = (tag + 1) & 15; + } + } else { + do { + do { + tag = (tag + 1) & 15; + } while (exclude & (1 << tag)); + } while (--offset > 0); + } + return tag; +} + +/** + * allocation_tag_mem: + * @env: the cpu environment + * @ptr_mmu_idx: the addressing regime to use for the virtual address + * @ptr: the virtual address for which to look up tag memory + * @ptr_access: the access to use for the virtual address + * @ptr_size: the number of bytes in the normal memory access + * @tag_access: the access to use for the tag memory + * @tag_size: the number of bytes in the tag memory access + * @ra: the return address for exception handling + * + * Our tag memory is formatted as a sequence of little-endian nibbles. + * That is, the byte at (addr >> (LOG2_TAG_GRANULE + 1)) contains two + * tags, with the tag at [3:0] for the lower addr and the tag at [7:4] + * for the higher addr. + * + * Here, resolve the physical address from the virtual address, and return + * a pointer to the corresponding tag byte. Exit with exception if the + * virtual address is not accessible for @ptr_access. + * + * The @ptr_size and @tag_size values may not have an obvious relation + * due to the alignment of @ptr, and the number of tag checks required. + * + * If there is no tag storage corresponding to @ptr, return NULL. + */ +static uint8_t *allocation_tag_mem(CPUARMState *env, int ptr_mmu_idx, + uint64_t ptr, MMUAccessType ptr_access, + int ptr_size, MMUAccessType tag_access, + int tag_size, uintptr_t ra) +{ +#ifdef CONFIG_USER_ONLY + /* Tag storage not implemented. */ + return NULL; +#else + uintptr_t index; + CPUIOTLBEntry *iotlbentry; + int in_page, flags; + ram_addr_t ptr_ra; + hwaddr ptr_paddr, tag_paddr, xlat; + MemoryRegion *mr; + ARMASIdx tag_asi; + AddressSpace *tag_as; + void *host; + + /* + * Probe the first byte of the virtual address. This raises an + * exception for inaccessible pages, and resolves the virtual address + * into the softmmu tlb. + * + * When RA == 0, this is for mte_probe1. The page is expected to be + * valid. Indicate to probe_access_flags no-fault, then assert that + * we received a valid page. + */ + flags = probe_access_flags(env, ptr, ptr_access, ptr_mmu_idx, + ra == 0, &host, ra); + assert(!(flags & TLB_INVALID_MASK)); + + /* + * Find the iotlbentry for ptr. This *must* be present in the TLB + * because we just found the mapping. + * TODO: Perhaps there should be a cputlb helper that returns a + * matching tlb entry + iotlb entry. + */ + index = tlb_index(env, ptr_mmu_idx, ptr); +# ifdef CONFIG_DEBUG_TCG + { + CPUTLBEntry *entry = tlb_entry(env, ptr_mmu_idx, ptr); + target_ulong comparator = (ptr_access == MMU_DATA_LOAD + ? entry->addr_read + : tlb_addr_write(entry)); + g_assert(tlb_hit(comparator, ptr)); + } +# endif + iotlbentry = &env_tlb(env)->d[ptr_mmu_idx].iotlb[index]; + + /* If the virtual page MemAttr != Tagged, access unchecked. */ + if (!arm_tlb_mte_tagged(&iotlbentry->attrs)) { + return NULL; + } + + /* + * If not backed by host ram, there is no tag storage: access unchecked. + * This is probably a guest os bug though, so log it. 
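+ * (E.g. a guest that maps a device region with the Tagged attribute:
+ * there is no RAM behind it to hold tags, so rather than fault we
+ * treat every access to it as Unchecked.)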
+ */ + if (unlikely(flags & TLB_MMIO)) { + qemu_log_mask(LOG_GUEST_ERROR, + "Page @ 0x%" PRIx64 " indicates Tagged Normal memory " + "but is not backed by host ram\n", ptr); + return NULL; + } + + /* + * The Normal memory access can extend to the next page. E.g. a single + * 8-byte access to the last byte of a page will check only the last + * tag on the first page. + * Any page access exception has priority over tag check exception. + */ + in_page = -(ptr | TARGET_PAGE_MASK); + if (unlikely(ptr_size > in_page)) { + void *ignore; + flags |= probe_access_flags(env, ptr + in_page, ptr_access, + ptr_mmu_idx, ra == 0, &ignore, ra); + assert(!(flags & TLB_INVALID_MASK)); + } + + /* Any debug exception has priority over a tag check exception. */ + if (unlikely(flags & TLB_WATCHPOINT)) { + int wp = ptr_access == MMU_DATA_LOAD ? BP_MEM_READ : BP_MEM_WRITE; + assert(ra != 0); + cpu_check_watchpoint(env_cpu(env), ptr, ptr_size, + iotlbentry->attrs, wp, ra); + } + + /* + * Find the physical address within the normal mem space. + * The memory region lookup must succeed because TLB_MMIO was + * not set in the cputlb lookup above. + */ + mr = memory_region_from_host(host, &ptr_ra); + tcg_debug_assert(mr != NULL); + tcg_debug_assert(memory_region_is_ram(mr)); + ptr_paddr = ptr_ra; + do { + ptr_paddr += mr->addr; + mr = mr->container; + } while (mr); + + /* Convert to the physical address in tag space. */ + tag_paddr = ptr_paddr >> (LOG2_TAG_GRANULE + 1); + + /* Look up the address in tag space. */ + tag_asi = iotlbentry->attrs.secure ? ARMASIdx_TagS : ARMASIdx_TagNS; + tag_as = cpu_get_address_space(env_cpu(env), tag_asi); + mr = address_space_translate(tag_as, tag_paddr, &xlat, NULL, + tag_access == MMU_DATA_STORE, + iotlbentry->attrs); + + /* + * Note that @mr will never be NULL. If there is nothing in the address + * space at @tag_paddr, the translation will return the unallocated memory + * region. For our purposes, the result must be ram. + */ + if (unlikely(!memory_region_is_ram(mr))) { + /* ??? Failure is a board configuration error. */ + qemu_log_mask(LOG_UNIMP, + "Tag Memory @ 0x%" HWADDR_PRIx " not found for " + "Normal Memory @ 0x%" HWADDR_PRIx "\n", + tag_paddr, ptr_paddr); + return NULL; + } + + /* + * Ensure the tag memory is dirty on write, for migration. + * Tag memory can never contain code or display memory (vga). + */ + if (tag_access == MMU_DATA_STORE) { + ram_addr_t tag_ra = memory_region_get_ram_addr(mr) + xlat; + cpu_physical_memory_set_dirty_flag(tag_ra, DIRTY_MEMORY_MIGRATION); + } + + return memory_region_get_ram_ptr(mr) + xlat; +#endif +} + +uint64_t HELPER(irg)(CPUARMState *env, uint64_t rn, uint64_t rm) +{ + int rtag; + + /* + * Our IMPDEF choice for GCR_EL1.RRND==1 is to behave as if + * GCR_EL1.RRND==0, always producing deterministic results. 
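+ * The loop below draws a 4-bit offset from the LFSR seed held in
+ * RGSR_EL1.SEED, and choose_nonexcluded_tag() then steps that many
+ * non-excluded tags past RGSR_EL1.TAG.  E.g. with TAG == 2, offset == 3
+ * and exclude == 0x00f0 (tags 4..7 excluded), the walk visits 3, 8, 9
+ * and the new tag is 9.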
+ */ + uint16_t exclude = extract32(rm | env->cp15.gcr_el1, 0, 16); + int start = extract32(env->cp15.rgsr_el1, 0, 4); + int seed = extract32(env->cp15.rgsr_el1, 8, 16); + int offset, i; + + /* RandomTag */ + for (i = offset = 0; i < 4; ++i) { + /* NextRandomTagBit */ + int top = (extract32(seed, 5, 1) ^ extract32(seed, 3, 1) ^ + extract32(seed, 2, 1) ^ extract32(seed, 0, 1)); + seed = (top << 15) | (seed >> 1); + offset |= top << i; + } + rtag = choose_nonexcluded_tag(start, offset, exclude); + env->cp15.rgsr_el1 = rtag | (seed << 8); + + return address_with_allocation_tag(rn, rtag); +} + +uint64_t HELPER(addsubg)(CPUARMState *env, uint64_t ptr, + int32_t offset, uint32_t tag_offset) +{ + int start_tag = allocation_tag_from_addr(ptr); + uint16_t exclude = extract32(env->cp15.gcr_el1, 0, 16); + int rtag = choose_nonexcluded_tag(start_tag, tag_offset, exclude); + + return address_with_allocation_tag(ptr + offset, rtag); +} + +static int load_tag1(uint64_t ptr, uint8_t *mem) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + return extract32(*mem, ofs, 4); +} + +uint64_t HELPER(ldg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + int rtag = 0; + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, 1, + MMU_DATA_LOAD, 1, GETPC()); + + /* Load if page supports tags. */ + if (mem) { + rtag = load_tag1(ptr, mem); + } + + return address_with_allocation_tag(xt, rtag); +} + +static void check_tag_aligned(CPUARMState *env, uint64_t ptr, uintptr_t ra) +{ + if (unlikely(!QEMU_IS_ALIGNED(ptr, TAG_GRANULE))) { + arm_cpu_do_unaligned_access(env_cpu(env), ptr, MMU_DATA_STORE, + cpu_mmu_index(env, false), ra); + g_assert_not_reached(); + } +} + +/* For use in a non-parallel context, store to the given nibble. */ +static void store_tag1(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + *mem = deposit32(*mem, ofs, 4, tag); +} + +/* For use in a parallel context, atomically store to the given nibble. */ +static void store_tag1_parallel(uint64_t ptr, uint8_t *mem, int tag) +{ + int ofs = extract32(ptr, LOG2_TAG_GRANULE, 1) * 4; + uint8_t old = atomic_read(mem); + + while (1) { + uint8_t new = deposit32(old, ofs, 4, tag); + uint8_t cmp = atomic_cmpxchg(mem, old, new); + if (likely(cmp == old)) { + return; + } + old = cmp; + } +} + +typedef void stg_store1(uint64_t, uint8_t *, int); + +static inline void do_stg(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + uint8_t *mem; + + check_tag_aligned(env, ptr, ra); + + /* Trap if accessing an invalid page. */ + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page supports tags. 
*/ + if (mem) { + store1(ptr, mem, allocation_tag_from_addr(xt)); + } +} + +void HELPER(stg)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(stg_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_stg(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(stg_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + + check_tag_aligned(env, ptr, ra); + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); +} + +static inline void do_st2g(CPUARMState *env, uint64_t ptr, uint64_t xt, + uintptr_t ra, stg_store1 store1) +{ + int mmu_idx = cpu_mmu_index(env, false); + int tag = allocation_tag_from_addr(xt); + uint8_t *mem1, *mem2; + + check_tag_aligned(env, ptr, ra); + + /* + * Trap if accessing an invalid page(s). + * This takes priority over !allocation_tag_access_enabled. + */ + if (ptr & TAG_GRANULE) { + /* Two stores unaligned mod TAG_GRANULE*2 -- modify two bytes. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + TAG_GRANULE, MMU_DATA_STORE, 1, ra); + mem2 = allocation_tag_mem(env, mmu_idx, ptr + TAG_GRANULE, + MMU_DATA_STORE, TAG_GRANULE, + MMU_DATA_STORE, 1, ra); + + /* Store if page(s) support tags. */ + if (mem1) { + store1(TAG_GRANULE, mem1, tag); + } + if (mem2) { + store1(0, mem2, tag); + } + } else { + /* Two stores aligned mod TAG_GRANULE*2 -- modify one byte. */ + mem1 = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + 2 * TAG_GRANULE, MMU_DATA_STORE, 1, ra); + if (mem1) { + tag |= tag << 4; + atomic_set(mem1, tag); + } + } +} + +void HELPER(st2g)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1); +} + +void HELPER(st2g_parallel)(CPUARMState *env, uint64_t ptr, uint64_t xt) +{ + do_st2g(env, ptr, xt, GETPC(), store_tag1_parallel); +} + +void HELPER(st2g_stub)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + int in_page = -(ptr | TARGET_PAGE_MASK); + + check_tag_aligned(env, ptr, ra); + + if (likely(in_page >= 2 * TAG_GRANULE)) { + probe_write(env, ptr, 2 * TAG_GRANULE, mmu_idx, ra); + } else { + probe_write(env, ptr, TAG_GRANULE, mmu_idx, ra); + probe_write(env, ptr + TAG_GRANULE, TAG_GRANULE, mmu_idx, ra); + } +} + +#define LDGM_STGM_SIZE (4 << GMID_EL1_BS) + +uint64_t HELPER(ldgm)(CPUARMState *env, uint64_t ptr) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. */ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_LOAD, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* The tag is squashed to zero if the page does not support tags. */ + if (!tag_mem) { + return 0; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are loading 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + return ldq_le_p(tag_mem); +} + +void HELPER(stgm)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + int mmu_idx = cpu_mmu_index(env, false); + uintptr_t ra = GETPC(); + void *tag_mem; + + ptr = QEMU_ALIGN_DOWN(ptr, LDGM_STGM_SIZE); + + /* Trap if accessing an invalid page. 
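+ * Note that with GMID_EL1_BS == 6, LDGM_STGM_SIZE is 4 << 6 == 256
+ * bytes, i.e. 16 tag granules whose tags occupy exactly 8 bytes of
+ * tag memory, matching the single 64-bit tag store below.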
*/ + tag_mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, + LDGM_STGM_SIZE, MMU_DATA_LOAD, + LDGM_STGM_SIZE / (2 * TAG_GRANULE), ra); + + /* + * Tag store only happens if the page support tags, + * and if the OS has enabled access to the tags. + */ + if (!tag_mem) { + return; + } + + QEMU_BUILD_BUG_ON(GMID_EL1_BS != 6); + /* + * We are storing 64-bits worth of tags. The ordering of elements + * within the word corresponds to a 64-bit little-endian operation. + */ + stq_le_p(tag_mem, val); +} + +void HELPER(stzgm_tags)(CPUARMState *env, uint64_t ptr, uint64_t val) +{ + uintptr_t ra = GETPC(); + int mmu_idx = cpu_mmu_index(env, false); + int log2_dcz_bytes, log2_tag_bytes; + intptr_t dcz_bytes, tag_bytes; + uint8_t *mem; + + /* + * In arm_cpu_realizefn, we assert that dcz > LOG2_TAG_GRANULE+1, + * i.e. 32 bytes, which is an unreasonably small dcz anyway, + * to make sure that we can access one complete tag byte here. + */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + ptr &= -dcz_bytes; + + mem = allocation_tag_mem(env, mmu_idx, ptr, MMU_DATA_STORE, dcz_bytes, + MMU_DATA_STORE, tag_bytes, ra); + if (mem) { + int tag_pair = (val & 0xf) * 0x11; + memset(mem, tag_pair, tag_bytes); + } +} + +/* Record a tag check failure. */ +static void mte_check_fail(CPUARMState *env, int mmu_idx, + uint64_t dirty_ptr, uintptr_t ra) +{ + ARMMMUIdx arm_mmu_idx = core_to_aa64_mmu_idx(mmu_idx); + int el, reg_el, tcf, select; + uint64_t sctlr; + + reg_el = regime_el(env, arm_mmu_idx); + sctlr = env->cp15.sctlr_el[reg_el]; + + switch (arm_mmu_idx) { + case ARMMMUIdx_E10_0: + case ARMMMUIdx_E20_0: + el = 0; + tcf = extract64(sctlr, 38, 2); + break; + default: + el = reg_el; + tcf = extract64(sctlr, 40, 2); + } + + switch (tcf) { + case 1: + /* + * Tag check fail causes a synchronous exception. + * + * In restore_state_to_opc, we set the exception syndrome + * for the load or store operation. Unwind first so we + * may overwrite that with the syndrome for the tag check. + */ + cpu_restore_state(env_cpu(env), ra, true); + env->exception.vaddress = dirty_ptr; + raise_exception(env, EXCP_DATA_ABORT, + syn_data_abort_no_iss(el != 0, 0, 0, 0, 0, 0, 0x11), + exception_target_el(env)); + /* noreturn, but fall through to the assert anyway */ + + case 0: + /* + * Tag check fail does not affect the PE. + * We eliminate this case by not setting MTE_ACTIVE + * in tb_flags, so that we never make this runtime call. + */ + g_assert_not_reached(); + + case 2: + /* Tag check fail causes asynchronous flag set. */ + mmu_idx = arm_mmu_idx_el(env, el); + if (regime_has_2_ranges(mmu_idx)) { + select = extract64(dirty_ptr, 55, 1); + } else { + select = 0; + } + env->cp15.tfsr_el[el] |= 1 << select; + break; + + default: + /* Case 3: Reserved. */ + qemu_log_mask(LOG_GUEST_ERROR, + "Tag check failure with SCTLR_EL%d.TCF%s " + "set to reserved value %d\n", + reg_el, el ? "" : "0", tcf); + break; + } +} + +/* + * Perform an MTE checked access for a single logical or atomic access. + */ +static bool mte_probe1_int(CPUARMState *env, uint32_t desc, uint64_t ptr, + uintptr_t ra, int bit55) +{ + int mem_tag, mmu_idx, ptr_tag, size; + MMUAccessType type; + uint8_t *mem; + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + return true; + } + + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + type = FIELD_EX32(desc, MTEDESC, WRITE) ? 
MMU_DATA_STORE : MMU_DATA_LOAD; + size = FIELD_EX32(desc, MTEDESC, ESIZE); + + mem = allocation_tag_mem(env, mmu_idx, ptr, type, size, + MMU_DATA_LOAD, 1, ra); + if (!mem) { + return true; + } + + mem_tag = load_tag1(ptr, mem); + return ptr_tag == mem_tag; +} + +/* + * No-fault version of mte_check1, to be used by SVE for MemSingleNF. + * Returns false if the access is Checked and the check failed. This + * is only intended to probe the tag -- the validity of the page must + * be checked beforehand. + */ +bool mte_probe1(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + int bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked. */ + if (unlikely(!tbi_check(desc, bit55))) { + return true; + } + + return mte_probe1_int(env, desc, ptr, 0, bit55); +} + +uint64_t mte_check1(CPUARMState *env, uint32_t desc, + uint64_t ptr, uintptr_t ra) +{ + int bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return ptr; + } + + if (unlikely(!mte_probe1_int(env, desc, ptr, ra, bit55))) { + int mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + mte_check_fail(env, mmu_idx, ptr, ra); + } + + return useronly_clean_ptr(ptr); +} + +uint64_t HELPER(mte_check1)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + return mte_check1(env, desc, ptr, GETPC()); +} + +/* + * Perform an MTE checked access for multiple logical accesses. + */ + +/** + * checkN: + * @tag: tag memory to test + * @odd: true to begin testing at tags at odd nibble + * @cmp: the tag to compare against + * @count: number of tags to test + * + * Return the number of successful tests. + * Thus a return value < @count indicates a failure. + * + * A note about sizes: count is expected to be small. + * + * The most common use will be LDP/STP of two integer registers, + * which means 16 bytes of memory touching at most 2 tags, but + * often the access is aligned and thus just 1 tag. + * + * Using AdvSIMD LD/ST (multiple), one can access 64 bytes of memory, + * touching at most 5 tags. SVE LDR/STR (vector) with the default + * vector length is also 64 bytes; the maximum architectural length + * is 256 bytes touching at most 9 tags. + * + * The loop below uses 7 logical operations and 1 memory operation + * per tag pair. An implementation that loads an aligned word and + * uses masking to ignore adjacent tags requires 18 logical operations + * and thus does not begin to pay off until 6 tags. + * Which, according to the survey above, is unlikely to be common. + */ +static int checkN(uint8_t *mem, int odd, int cmp, int count) +{ + int n = 0, diff; + + /* Replicate the test tag and compare. */ + cmp *= 0x11; + diff = *mem++ ^ cmp; + + if (odd) { + goto start_odd; + } + + while (1) { + /* Test even tag. */ + if (unlikely((diff) & 0x0f)) { + break; + } + if (++n == count) { + break; + } + + start_odd: + /* Test odd tag. */ + if (unlikely((diff) & 0xf0)) { + break; + } + if (++n == count) { + break; + } + + diff = *mem++ ^ cmp; + } + return n; +} + +uint64_t mte_checkN(CPUARMState *env, uint32_t desc, + uint64_t ptr, uintptr_t ra) +{ + int mmu_idx, ptr_tag, bit55; + uint64_t ptr_last, ptr_end, prev_page, next_page; + uint64_t tag_first, tag_end; + uint64_t tag_byte_first, tag_byte_end; + uint32_t esize, total, tag_count, tag_size, n, c; + uint8_t *mem1, *mem2; + MMUAccessType type; + + bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. 
*/ + if (unlikely(!tbi_check(desc, bit55))) { + return ptr; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + goto done; + } + + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + type = FIELD_EX32(desc, MTEDESC, WRITE) ? MMU_DATA_STORE : MMU_DATA_LOAD; + esize = FIELD_EX32(desc, MTEDESC, ESIZE); + total = FIELD_EX32(desc, MTEDESC, TSIZE); + + /* Find the addr of the end of the access, and of the last element. */ + ptr_end = ptr + total; + ptr_last = ptr_end - esize; + + /* Round the bounds to the tag granule, and compute the number of tags. */ + tag_first = QEMU_ALIGN_DOWN(ptr, TAG_GRANULE); + tag_end = QEMU_ALIGN_UP(ptr_last, TAG_GRANULE); + tag_count = (tag_end - tag_first) / TAG_GRANULE; + + /* Round the bounds to twice the tag granule, and compute the bytes. */ + tag_byte_first = QEMU_ALIGN_DOWN(ptr, 2 * TAG_GRANULE); + tag_byte_end = QEMU_ALIGN_UP(ptr_last, 2 * TAG_GRANULE); + + /* Locate the page boundaries. */ + prev_page = ptr & TARGET_PAGE_MASK; + next_page = prev_page + TARGET_PAGE_SIZE; + + if (likely(tag_end - prev_page <= TARGET_PAGE_SIZE)) { + /* Memory access stays on one page. */ + tag_size = (tag_byte_end - tag_byte_first) / (2 * TAG_GRANULE); + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, total, + MMU_DATA_LOAD, tag_size, ra); + if (!mem1) { + goto done; + } + /* Perform all of the comparisons. */ + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, tag_count); + } else { + /* Memory access crosses to next page. */ + tag_size = (next_page - tag_byte_first) / (2 * TAG_GRANULE); + mem1 = allocation_tag_mem(env, mmu_idx, ptr, type, next_page - ptr, + MMU_DATA_LOAD, tag_size, ra); + + tag_size = (tag_byte_end - next_page) / (2 * TAG_GRANULE); + mem2 = allocation_tag_mem(env, mmu_idx, next_page, type, + ptr_end - next_page, + MMU_DATA_LOAD, tag_size, ra); + + /* + * Perform all of the comparisons. + * Note the possible but unlikely case of the operation spanning + * two pages that do not both have tagging enabled. + */ + n = c = (next_page - tag_first) / TAG_GRANULE; + if (mem1) { + n = checkN(mem1, ptr & TAG_GRANULE, ptr_tag, c); + } + if (n == c) { + if (!mem2) { + goto done; + } + n += checkN(mem2, 0, ptr_tag, tag_count - c); + } + } + + /* + * If we failed, we know which granule. Compute the element that + * is first in that granule, and signal failure on that element. + */ + if (unlikely(n < tag_count)) { + uint64_t fail_ofs; + + fail_ofs = tag_first + n * TAG_GRANULE - ptr; + fail_ofs = ROUND_UP(fail_ofs, esize); + mte_check_fail(env, mmu_idx, ptr + fail_ofs, ra); + } + + done: + return useronly_clean_ptr(ptr); +} + +uint64_t HELPER(mte_checkN)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + return mte_checkN(env, desc, ptr, GETPC()); +} + +/* + * Perform an MTE checked access for DC_ZVA. + */ +uint64_t HELPER(mte_check_zva)(CPUARMState *env, uint32_t desc, uint64_t ptr) +{ + uintptr_t ra = GETPC(); + int log2_dcz_bytes, log2_tag_bytes; + int mmu_idx, bit55; + intptr_t dcz_bytes, tag_bytes, i; + void *mem; + uint64_t ptr_tag, mem_tag, align_ptr; + + bit55 = extract64(ptr, 55, 1); + + /* If TBI is disabled, the access is unchecked, and ptr is not dirty. */ + if (unlikely(!tbi_check(desc, bit55))) { + return ptr; + } + + ptr_tag = allocation_tag_from_addr(ptr); + + if (tcma_check(desc, bit55, ptr_tag)) { + goto done; + } + + /* + * In arm_cpu_realizefn, we asserted that dcz > LOG2_TAG_GRANULE+1, + * i.e. 32 bytes, which is an unreasonably small dcz anyway, to make + * sure that we can access one complete tag byte here. 
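+ * For example, the common dcz_blocksize of 4 gives
+ *     log2_dcz_bytes = 4 + 2 = 6   (a 64-byte DC ZVA block)
+ *     log2_tag_bytes = 6 - 5 = 1   (2 bytes, i.e. 4 tag nibbles)
+ * so the whole block is covered by the 16-bit load in the switch below.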
+ */ + log2_dcz_bytes = env_archcpu(env)->dcz_blocksize + 2; + log2_tag_bytes = log2_dcz_bytes - (LOG2_TAG_GRANULE + 1); + dcz_bytes = (intptr_t)1 << log2_dcz_bytes; + tag_bytes = (intptr_t)1 << log2_tag_bytes; + align_ptr = ptr & -dcz_bytes; + + /* + * Trap if accessing an invalid page. DC_ZVA requires that we supply + * the original pointer for an invalid page. But watchpoints require + * that we probe the actual space. So do both. + */ + mmu_idx = FIELD_EX32(desc, MTEDESC, MIDX); + (void) probe_write(env, ptr, 1, mmu_idx, ra); + mem = allocation_tag_mem(env, mmu_idx, align_ptr, MMU_DATA_STORE, + dcz_bytes, MMU_DATA_LOAD, tag_bytes, ra); + if (!mem) { + goto done; + } + + /* + * Unlike the reasoning for checkN, DC_ZVA is always aligned, and thus + * it is quite easy to perform all of the comparisons at once without + * any extra masking. + * + * The most common zva block size is 64; some of the thunderx cpus use + * a block size of 128. For user-only, aarch64_max_initfn will set the + * block size to 512. Fill out the other cases for future-proofing. + * + * In order to be able to find the first miscompare later, we want the + * tag bytes to be in little-endian order. + */ + switch (log2_tag_bytes) { + case 0: /* zva_blocksize 32 */ + mem_tag = *(uint8_t *)mem; + ptr_tag *= 0x11u; + break; + case 1: /* zva_blocksize 64 */ + mem_tag = cpu_to_le16(*(uint16_t *)mem); + ptr_tag *= 0x1111u; + break; + case 2: /* zva_blocksize 128 */ + mem_tag = cpu_to_le32(*(uint32_t *)mem); + ptr_tag *= 0x11111111u; + break; + case 3: /* zva_blocksize 256 */ + mem_tag = cpu_to_le64(*(uint64_t *)mem); + ptr_tag *= 0x1111111111111111ull; + break; + + default: /* zva_blocksize 512, 1024, 2048 */ + ptr_tag *= 0x1111111111111111ull; + i = 0; + do { + mem_tag = cpu_to_le64(*(uint64_t *)(mem + i)); + if (unlikely(mem_tag != ptr_tag)) { + goto fail; + } + i += 8; + align_ptr += 16 * TAG_GRANULE; + } while (i < tag_bytes); + goto done; + } + + if (likely(mem_tag == ptr_tag)) { + goto done; + } + + fail: + /* Locate the first nibble that differs. */ + i = ctz64(mem_tag ^ ptr_tag) >> 4; + mte_check_fail(env, mmu_idx, align_ptr + i * TAG_GRANULE, ra); + + done: + return useronly_clean_ptr(ptr); +} diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode index 6d890b2161..686f9fbf46 100644 --- a/target/arm/neon-dp.decode +++ b/target/arm/neon-dp.decode @@ -429,6 +429,112 @@ Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm vm=%vm_dp vd=%vd_dp size=1 VDUP_scalar 1111 001 1 1 . 11 index:1 100 .... 11 000 q:1 . 0 .... \ vm=%vm_dp vd=%vd_dp size=2 + + ################################################################## + # 2-reg-misc grouping: + # 1111 001 11 D 11 size:2 opc1:2 Vd:4 0 opc2:4 q:1 M 0 Vm:4 + ################################################################## + + &2misc vd vm q size + + @2misc .... ... .. . .. size:2 .. .... . .... q:1 . . .... \ + &2misc vm=%vm_dp vd=%vd_dp + @2misc_q0 .... ... .. . .. size:2 .. .... . .... . . . .... \ + &2misc vm=%vm_dp vd=%vd_dp q=0 + @2misc_q1 .... ... .. . .. size:2 .. .... . .... . . . .... \ + &2misc vm=%vm_dp vd=%vd_dp q=1 + + VREV64 1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc + VREV32 1111 001 11 . 11 .. 00 .... 0 0001 . . 0 .... @2misc + VREV16 1111 001 11 . 11 .. 00 .... 0 0010 . . 0 .... @2misc + + VPADDL_S 1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc + VPADDL_U 1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc + + AESE 1111 001 11 . 11 .. 00 .... 0 0110 0 . 0 .... @2misc_q1 + AESD 1111 001 11 . 11 .. 00 .... 
0 0110 1 . 0 .... @2misc_q1 + AESMC 1111 001 11 . 11 .. 00 .... 0 0111 0 . 0 .... @2misc_q1 + AESIMC 1111 001 11 . 11 .. 00 .... 0 0111 1 . 0 .... @2misc_q1 + + VCLS 1111 001 11 . 11 .. 00 .... 0 1000 . . 0 .... @2misc + VCLZ 1111 001 11 . 11 .. 00 .... 0 1001 . . 0 .... @2misc + VCNT 1111 001 11 . 11 .. 00 .... 0 1010 . . 0 .... @2misc + + VMVN 1111 001 11 . 11 .. 00 .... 0 1011 . . 0 .... @2misc + + VPADAL_S 1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc + VPADAL_U 1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc + + VQABS 1111 001 11 . 11 .. 00 .... 0 1110 . . 0 .... @2misc + VQNEG 1111 001 11 . 11 .. 00 .... 0 1111 . . 0 .... @2misc + + VCGT0 1111 001 11 . 11 .. 01 .... 0 0000 . . 0 .... @2misc + VCGE0 1111 001 11 . 11 .. 01 .... 0 0001 . . 0 .... @2misc + VCEQ0 1111 001 11 . 11 .. 01 .... 0 0010 . . 0 .... @2misc + VCLE0 1111 001 11 . 11 .. 01 .... 0 0011 . . 0 .... @2misc + VCLT0 1111 001 11 . 11 .. 01 .... 0 0100 . . 0 .... @2misc + + SHA1H 1111 001 11 . 11 .. 01 .... 0 0101 1 . 0 .... @2misc_q1 + + VABS 1111 001 11 . 11 .. 01 .... 0 0110 . . 0 .... @2misc + VNEG 1111 001 11 . 11 .. 01 .... 0 0111 . . 0 .... @2misc + + VCGT0_F 1111 001 11 . 11 .. 01 .... 0 1000 . . 0 .... @2misc + VCGE0_F 1111 001 11 . 11 .. 01 .... 0 1001 . . 0 .... @2misc + VCEQ0_F 1111 001 11 . 11 .. 01 .... 0 1010 . . 0 .... @2misc + VCLE0_F 1111 001 11 . 11 .. 01 .... 0 1011 . . 0 .... @2misc + VCLT0_F 1111 001 11 . 11 .. 01 .... 0 1100 . . 0 .... @2misc + + VABS_F 1111 001 11 . 11 .. 01 .... 0 1110 . . 0 .... @2misc + VNEG_F 1111 001 11 . 11 .. 01 .... 0 1111 . . 0 .... @2misc + + VSWP 1111 001 11 . 11 .. 10 .... 0 0000 . . 0 .... @2misc + VTRN 1111 001 11 . 11 .. 10 .... 0 0001 . . 0 .... @2misc + VUZP 1111 001 11 . 11 .. 10 .... 0 0010 . . 0 .... @2misc + VZIP 1111 001 11 . 11 .. 10 .... 0 0011 . . 0 .... @2misc + + VMOVN 1111 001 11 . 11 .. 10 .... 0 0100 0 . 0 .... @2misc_q0 + # VQMOVUN: unsigned result (source is always signed) + VQMOVUN 1111 001 11 . 11 .. 10 .... 0 0100 1 . 0 .... @2misc_q0 + # VQMOVN: signed result, source may be signed (_S) or unsigned (_U) + VQMOVN_S 1111 001 11 . 11 .. 10 .... 0 0101 0 . 0 .... @2misc_q0 + VQMOVN_U 1111 001 11 . 11 .. 10 .... 0 0101 1 . 0 .... @2misc_q0 + + VSHLL 1111 001 11 . 11 .. 10 .... 0 0110 0 . 0 .... @2misc_q0 + + SHA1SU1 1111 001 11 . 11 .. 10 .... 0 0111 0 . 0 .... @2misc_q1 + SHA256SU0 1111 001 11 . 11 .. 10 .... 0 0111 1 . 0 .... @2misc_q1 + + VRINTN 1111 001 11 . 11 .. 10 .... 0 1000 . . 0 .... @2misc + VRINTX 1111 001 11 . 11 .. 10 .... 0 1001 . . 0 .... @2misc + VRINTA 1111 001 11 . 11 .. 10 .... 0 1010 . . 0 .... @2misc + VRINTZ 1111 001 11 . 11 .. 10 .... 0 1011 . . 0 .... @2misc + + VCVT_F16_F32 1111 001 11 . 11 .. 10 .... 0 1100 0 . 0 .... @2misc_q0 + + VRINTM 1111 001 11 . 11 .. 10 .... 0 1101 . . 0 .... @2misc + + VCVT_F32_F16 1111 001 11 . 11 .. 10 .... 0 1110 0 . 0 .... @2misc_q0 + + VRINTP 1111 001 11 . 11 .. 10 .... 0 1111 . . 0 .... @2misc + + VCVTAS 1111 001 11 . 11 .. 11 .... 0 0000 . . 0 .... @2misc + VCVTAU 1111 001 11 . 11 .. 11 .... 0 0001 . . 0 .... @2misc + VCVTNS 1111 001 11 . 11 .. 11 .... 0 0010 . . 0 .... @2misc + VCVTNU 1111 001 11 . 11 .. 11 .... 0 0011 . . 0 .... @2misc + VCVTPS 1111 001 11 . 11 .. 11 .... 0 0100 . . 0 .... @2misc + VCVTPU 1111 001 11 . 11 .. 11 .... 0 0101 . . 0 .... @2misc + VCVTMS 1111 001 11 . 11 .. 11 .... 0 0110 . . 0 .... @2misc + VCVTMU 1111 001 11 . 11 .. 11 .... 0 0111 . . 0 .... @2misc + + VRECPE 1111 001 11 . 11 .. 11 .... 0 1000 . . 0 .... @2misc + VRSQRTE 1111 001 11 . 11 .. 11 .... 
0 1001 . . 0 .... @2misc + VRECPE_F 1111 001 11 . 11 .. 11 .... 0 1010 . . 0 .... @2misc + VRSQRTE_F 1111 001 11 . 11 .. 11 .... 0 1011 . . 0 .... @2misc + VCVT_FS 1111 001 11 . 11 .. 11 .... 0 1100 . . 0 .... @2misc + VCVT_FU 1111 001 11 . 11 .. 11 .... 0 1101 . . 0 .... @2misc + VCVT_SF 1111 001 11 . 11 .. 11 .... 0 1110 . . 0 .... @2misc + VCVT_UF 1111 001 11 . 11 .. 11 .... 0 1111 . . 0 .... @2misc ] # Subgroup for size != 0b11 diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c index eb0de080f1..b1065216b2 100644 --- a/target/arm/op_helper.c +++ b/target/arm/op_helper.c @@ -935,3 +935,19 @@ uint32_t HELPER(ror_cc)(CPUARMState *env, uint32_t x, uint32_t i) return ((uint32_t)x >> shift) | (x << (32 - shift)); } } + +void HELPER(probe_access)(CPUARMState *env, target_ulong ptr, + uint32_t access_type, uint32_t mmu_idx, + uint32_t size) +{ + uint32_t in_page = -((uint32_t)ptr | TARGET_PAGE_SIZE); + uintptr_t ra = GETPC(); + + if (likely(size <= in_page)) { + probe_access(env, ptr, size, access_type, mmu_idx, ra); + } else { + probe_access(env, ptr, in_page, access_type, mmu_idx, ra); + probe_access(env, ptr + in_page, size - in_page, + access_type, mmu_idx, ra); + } +} diff --git a/target/arm/sve_helper.c b/target/arm/sve_helper.c index e590db6637..382fa82bc8 100644 --- a/target/arm/sve_helper.c +++ b/target/arm/sve_helper.c @@ -3966,14 +3966,16 @@ static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \ static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \ target_ulong addr, uintptr_t ra) \ { \ - *(TYPEE *)(vd + H(reg_off)) = (TYPEM)TLB(env, addr, ra); \ + *(TYPEE *)(vd + H(reg_off)) = \ + (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \ } #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \ static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \ target_ulong addr, uintptr_t ra) \ { \ - TLB(env, addr, (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \ + TLB(env, useronly_clean_ptr(addr), \ + (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \ } #define DO_LD_PRIM_1(NAME, H, TE, TM) \ @@ -4091,6 +4093,19 @@ static bool sve_probe_page(SVEHostPage *info, bool nofault, int flags; addr += mem_off; + + /* + * User-only currently always issues with TBI. See the comment + * above useronly_clean_ptr. Usually we clean this top byte away + * during translation, but we can't do that for e.g. vector + imm + * addressing modes. + * + * We currently always enable TBI for user-only, and do not provide + * a way to turn it off. So clean the pointer unconditionally here, + * rather than look it up here, or pass it down from above. + */ + addr = useronly_clean_ptr(addr); + flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault, &info->host, retaddr); info->flags = flags; @@ -4393,15 +4408,89 @@ static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env, #endif } +typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t); + +static inline QEMU_ALWAYS_INLINE +void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, int esize, + int msize, uint32_t mtedesc, uintptr_t ra, + mte_check_fn *check) +{ + intptr_t mem_off, reg_off, reg_last; + + /* Process the page only if MemAttr == Tagged. 
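+ * "Tagged" means the page was mapped with the Tagged Normal memory
+ * attribute (MAIR attr 0xf0) and that fact was recorded in the page's
+ * MemTxAttrs; elements on any other page are accessed Unchecked.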
*/ + if (arm_tlb_mte_tagged(&info->page[0].attrs)) { + mem_off = info->mem_off_first[0]; + reg_off = info->reg_off_first[0]; + reg_last = info->reg_off_split; + if (reg_last < 0) { + reg_last = info->reg_off_last[0]; + } + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off <= reg_last && (reg_off & 63)); + } while (reg_off <= reg_last); + } + + mem_off = info->mem_off_first[1]; + if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) { + reg_off = info->reg_off_first[1]; + reg_last = info->reg_off_last[1]; + + do { + uint64_t pg = vg[reg_off >> 6]; + do { + if ((pg >> (reg_off & 63)) & 1) { + check(env, mtedesc, addr, ra); + } + reg_off += esize; + mem_off += msize; + } while (reg_off & 63); + } while (reg_off <= reg_last); + } +} + +typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, + int esize, int msize, uint32_t mtedesc, + uintptr_t ra); + +static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, + int esize, int msize, uint32_t mtedesc, + uintptr_t ra) +{ + sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize, + mtedesc, ra, mte_check1); +} + +static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env, + uint64_t *vg, target_ulong addr, + int esize, int msize, uint32_t mtedesc, + uintptr_t ra) +{ + sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize, + mtedesc, ra, mte_checkN); +} + + /* * Common helper for all contiguous 1,2,3,4-register predicated stores. */ static inline QEMU_ALWAYS_INLINE void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, uint32_t desc, const uintptr_t retaddr, - const int esz, const int msz, const int N, + const int esz, const int msz, const int N, uint32_t mtedesc, sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) + sve_ldst1_tlb_fn *tlb_fn, + sve_cont_ldst_mte_check_fn *mte_check_fn) { const unsigned rd = simd_data(desc); const intptr_t reg_max = simd_oprsz(desc); @@ -4426,7 +4515,14 @@ void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, BP_MEM_READ, retaddr); - /* TODO: MTE check. */ + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mte_check_fn && mtedesc) { + mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } flags = info.page[0].flags | info.page[1].flags; if (unlikely(flags != 0)) { @@ -4532,26 +4628,67 @@ void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr, } } -#define DO_LD1_1(NAME, ESZ) \ -void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ - sve_##NAME##_host, sve_##NAME##_tlb); \ +static inline QEMU_ALWAYS_INLINE +void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
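+ * If TBI is disabled for this address, or TCMA applies to its tag,
+ * then no element of the contiguous access can be Checked.  Clearing
+ * mtedesc here lets the common code skip every per-element tag lookup.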
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn, + N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN); } -#define DO_LD1_2(NAME, ESZ, MSZ) \ -void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ -} \ -void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ - sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ +#define DO_LD1_1(NAME, ESZ) \ +void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \ + sve_##NAME##_host, sve_##NAME##_tlb, NULL); \ +} \ +void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \ + sve_##NAME##_host, sve_##NAME##_tlb); \ +} + +#define DO_LD1_2(NAME, ESZ, MSZ) \ +void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \ +} \ +void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \ +} \ +void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_le_host, sve_##NAME##_le_tlb); \ +} \ +void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \ + sve_##NAME##_be_host, sve_##NAME##_be_tlb); \ } DO_LD1_1(ld1bb, MO_8) @@ -4577,26 +4714,44 @@ DO_LD1_2(ld1dd, MO_64, MO_64) #undef DO_LD1_1 #undef DO_LD1_2 -#define DO_LDN_1(N) \ -void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ - sve_ld1bb_host, sve_ld1bb_tlb); \ +#define DO_LDN_1(N) \ +void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \ + sve_ld1bb_host, sve_ld1bb_tlb, NULL); \ +} \ +void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \ + sve_ld1bb_host, sve_ld1bb_tlb); \ } -#define DO_LDN_2(N, SUFF, ESZ) \ -void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ -} \ -void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ - sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ +#define DO_LDN_2(N, SUFF, ESZ) \ +void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, 
ESZ, N, 0, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \ +} \ +void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \ +} \ +void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \ + sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \ } DO_LDN_1(2) @@ -4654,7 +4809,7 @@ static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz) */ static inline QEMU_ALWAYS_INLINE void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, - uint32_t desc, const uintptr_t retaddr, + uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc, const int esz, const int msz, const SVEContFault fault, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) @@ -4686,13 +4841,25 @@ void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, mem_off = info.mem_off_first[0]; flags = info.page[0].flags; + /* + * Disable MTE checking if the Tagged bit is not set. Since TBI must + * be set within MTEDESC for MTE, !mtedesc => !mte_active. + */ + if (arm_tlb_mte_tagged(&info.page[0].attrs)) { + mtedesc = 0; + } + if (fault == FAULT_FIRST) { + /* Trapping mte check for the first-fault element. */ + if (mtedesc) { + mte_check1(env, mtedesc, addr + mem_off, retaddr); + } + /* * Special handling of the first active element, * if it crosses a page boundary or is MMIO. */ bool is_split = mem_off == info.mem_off_split; - /* TODO: MTE check. */ if (unlikely(flags != 0) || unlikely(is_split)) { /* * Use the slow path for cross-page handling. @@ -4728,7 +4895,9 @@ void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, /* Watchpoint hit, see below. */ goto do_fault; } - /* TODO: MTE check. */ + if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) { + goto do_fault; + } /* * Use the slow path for cross-page handling. * This is RAM, without a watchpoint, and will not trap. @@ -4776,7 +4945,9 @@ void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, & BP_MEM_READ)) { goto do_fault; } - /* TODO: MTE check. */ + if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) { + goto do_fault; + } host_fn(vd, reg_off, host + mem_off); } reg_off += 1 << esz; @@ -4814,44 +4985,103 @@ void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr, record_fault(env, reg_off, reg_max); } -#define DO_LDFF1_LDNF1_1(PART, ESZ) \ +static inline QEMU_ALWAYS_INLINE +void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const SVEContFault fault, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. 
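+      * As a rough sketch (see tbi_check/tcma_check in internals.h for the
+      * authoritative versions, which are not reproduced here):
+      *   tbi_check(desc, bit55)       ~= TBI is enabled for this half of
+      *                                   the address space (bit 55 selects
+      *                                   TBI0 vs TBI1);
+      *   tcma_check(desc, bit55, tag) ~= TCMA applies and the logical tag
+      *                                   in bits [59:56] is the match-all
+      *                                   value (0x0 for bit55 == 0, 0xf for
+      *                                   bit55 == 1).
+      * If either test says the access is unchecked we simply zero mtedesc,
+      * which the rest of the code treats as "MTE not active".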
*/ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc, + esz, msz, fault, host_fn, tlb_fn); +} + +#define DO_LDFF1_LDNF1_1(PART, ESZ) \ void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ } \ void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \ + sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \ sve_ld1##PART##_host, sve_ld1##PART##_tlb); \ } -#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ +#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \ void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ } \ void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ } \ void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \ target_ulong addr, uint32_t desc) \ { \ - sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ + sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \ +} \ +void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \ + sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ +} \ +void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \ 
+ sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \ } DO_LDFF1_LDNF1_1(bb, MO_8) @@ -4882,11 +5112,12 @@ DO_LDFF1_LDNF1_2(dd, MO_64, MO_64) */ static inline QEMU_ALWAYS_INLINE -void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, uint32_t desc, - const uintptr_t retaddr, const int esz, - const int msz, const int N, +void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t retaddr, + const int esz, const int msz, const int N, uint32_t mtedesc, sve_ldst1_host_fn *host_fn, - sve_ldst1_tlb_fn *tlb_fn) + sve_ldst1_tlb_fn *tlb_fn, + sve_cont_ldst_mte_check_fn *mte_check_fn) { const unsigned rd = simd_data(desc); const intptr_t reg_max = simd_oprsz(desc); @@ -4908,7 +5139,14 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, uint32_t desc, sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz, BP_MEM_WRITE, retaddr); - /* TODO: MTE check. */ + /* + * Handle mte checks for all active elements. + * Since TBI must be set for MTE, !mtedesc => !mte_active. + */ + if (mte_check_fn && mtedesc) { + mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz, + mtedesc, retaddr); + } flags = info.page[0].flags | info.page[1].flags; if (unlikely(flags != 0)) { @@ -5002,26 +5240,67 @@ void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr, uint32_t desc, } } -#define DO_STN_1(N, NAME, ESZ) \ -void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ - sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +static inline QEMU_ALWAYS_INLINE +void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr, + uint32_t desc, const uintptr_t ra, + const int esz, const int msz, const int N, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + int bit55 = extract64(addr, 55, 1); + + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* Perform gross MTE suppression early. */ + if (!tbi_check(desc, bit55) || + tcma_check(desc, bit55, allocation_tag_from_addr(addr))) { + mtedesc = 0; + } + + sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn, + N == 1 ? 
sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN); } -#define DO_STN_2(N, NAME, ESZ, MSZ) \ -void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ -} \ -void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ - target_ulong addr, uint32_t desc) \ -{ \ - sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ - sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ +#define DO_STN_1(N, NAME, ESZ) \ +void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \ +} \ +void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \ + sve_st1##NAME##_host, sve_st1##NAME##_tlb); \ +} + +#define DO_STN_2(N, NAME, ESZ, MSZ) \ +void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \ +} \ +void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \ +} \ +void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \ +} \ +void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \ + target_ulong addr, uint32_t desc) \ +{ \ + sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \ + sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \ } DO_STN_1(1, bb, MO_8) @@ -5090,7 +5369,8 @@ static target_ulong off_zd_d(void *reg, intptr_t reg_ofs) static inline QEMU_ALWAYS_INLINE void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { @@ -5118,7 +5398,9 @@ void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, cpu_check_watchpoint(env_cpu(env), addr, msize, info.attrs, BP_MEM_READ, retaddr); } - /* TODO: MTE check */ + if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) { + mte_check1(env, mtedesc, addr, retaddr); + } host_fn(&scratch, reg_off, info.host); } else { /* Element crosses the page boundary. 
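     * The host pointer cannot be used for a cross-page element, so we fall
     * back to the per-access tlb_fn below; the MTE check is still performed
     * on the element's original (tagged) address, and mte_check1 raises any
     * tag check fault itself via retaddr.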
*/ @@ -5129,7 +5411,9 @@ void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, msize, info.attrs, BP_MEM_READ, retaddr); } - /* TODO: MTE check */ + if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) { + mte_check1(env, mtedesc, addr, retaddr); + } tlb_fn(env, &scratch, reg_off, addr, retaddr); } } @@ -5142,20 +5426,53 @@ void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, memcpy(vd, &scratch, reg_max); } +static inline QEMU_ALWAYS_INLINE +void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). + */ + sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \ void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ void *vm, target_ulong base, uint32_t desc) \ { \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \ void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ void *vm, target_ulong base, uint32_t desc) \ { \ - sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } DO_LD1_ZPZ_S(bsu, zsu, MO_8) @@ -5234,7 +5551,8 @@ DO_LD1_ZPZ_D(dd_be, zd, MO_64) static inline QEMU_ALWAYS_INLINE void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, target_ulong base, uint32_t desc, uintptr_t retaddr, - const int esz, const int msz, zreg_off_fn *off_fn, + uint32_t mtedesc, const int esz, const int msz, + zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { @@ -5259,6 +5577,9 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, * Probe the first element, allowing faults. */ addr = base + (off_fn(vm, reg_off) << scale); + if (mtedesc) { + mte_check1(env, mtedesc, addr, retaddr); + } tlb_fn(env, vd, reg_off, addr, retaddr); /* After any fault, zero the other elements. */ @@ -5291,7 +5612,11 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, (env_cpu(env), addr, msize) & BP_MEM_READ)) { goto fault; } - /* TODO: MTE check. 
*/ + if (mtedesc && + arm_tlb_mte_tagged(&info.attrs) && + !mte_probe1(env, mtedesc, addr)) { + goto fault; + } host_fn(vd, reg_off, info.host); } @@ -5304,20 +5629,58 @@ void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, record_fault(env, reg_off, reg_max); } -#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ - off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +static inline QEMU_ALWAYS_INLINE +void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + const int esz, const int msz, + zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). + */ + sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esz, msz, off_fn, host_fn, tlb_fn); } -#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ - void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ - off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \ + off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} + +#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_ldff##MEM##_##OFS) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ +} \ +void HELPER(sve_ldff##MEM##_##OFS##_mte) \ + (CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \ + off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \ } DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) @@ -5389,7 +5752,8 @@ DO_LDFF1_ZPZ_D(dd_be, zd, MO_64) static inline QEMU_ALWAYS_INLINE void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, target_ulong base, uint32_t desc, uintptr_t retaddr, - int esize, int msize, zreg_off_fn *off_fn, + uint32_t mtedesc, int esize, int msize, + zreg_off_fn *off_fn, sve_ldst1_host_fn *host_fn, sve_ldst1_tlb_fn *tlb_fn) { @@ -5433,7 +5797,10 @@ void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, cpu_check_watchpoint(env_cpu(env), addr, msize, info.attrs, BP_MEM_WRITE, retaddr); } - /* TODO: MTE check. 
*/ + + if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) { + mte_check1(env, mtedesc, addr, retaddr); + } } i += 1; reg_off += esize; @@ -5463,20 +5830,53 @@ void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm, } while (reg_off < reg_max); } -#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ +static inline QEMU_ALWAYS_INLINE +void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm, + target_ulong base, uint32_t desc, uintptr_t retaddr, + int esize, int msize, zreg_off_fn *off_fn, + sve_ldst1_host_fn *host_fn, + sve_ldst1_tlb_fn *tlb_fn) +{ + uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + /* Remove mtedesc from the normal sve descriptor. */ + desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT); + + /* + * ??? TODO: For the 32-bit offset extractions, base + ofs cannot + * offset base entirely over the address space hole to change the + * pointer tag, or change the bit55 selector. So we could here + * examine TBI + TCMA like we do for sve_ldN_r_mte(). + */ + sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc, + esize, msize, off_fn, host_fn, tlb_fn); +} + +#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ - off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \ + off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ } -#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ -void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ +#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \ +void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \ void *vm, target_ulong base, uint32_t desc) \ -{ \ - sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ - off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +{ \ + sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ +} \ +void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \ + void *vm, target_ulong base, uint32_t desc) \ +{ \ + sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \ + off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \ } DO_ST1_ZPZ_S(bs, zsu, MO_8) diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c index 7388494a55..b35dc8a011 100644 --- a/target/arm/tlb_helper.c +++ b/target/arm/tlb_helper.c @@ -10,8 +10,6 @@ #include "internals.h" #include "exec/exec-all.h" -#if !defined(CONFIG_USER_ONLY) - static inline uint32_t merge_syn_data_abort(uint32_t template_syn, unsigned int target_el, bool same_el, bool ea, @@ -122,6 +120,8 @@ void arm_cpu_do_unaligned_access(CPUState *cs, vaddr vaddr, arm_deliver_fault(cpu, vaddr, access_type, mmu_idx, &fi); } +#if !defined(CONFIG_USER_ONLY) + /* * arm_cpu_do_transaction_failed: handle a memory system error response * (eg "no device/memory present at address") by raising an external abort @@ -166,6 +166,7 @@ bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, int prot, ret; MemTxAttrs attrs = {}; ARMMMUFaultInfo 
fi = {}; + ARMCacheAttrs cacheattrs = {}; /* * Walk the page table and (if the mapping exists) add the page @@ -175,7 +176,8 @@ bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, */ ret = get_phys_addr(&cpu->env, address, access_type, core_to_arm_mmu_idx(&cpu->env, mmu_idx), - &phys_addr, &attrs, &prot, &page_size, &fi, NULL); + &phys_addr, &attrs, &prot, &page_size, + &fi, &cacheattrs); if (likely(!ret)) { /* * Map a single [sub]page. Regions smaller than our declared @@ -186,6 +188,11 @@ bool arm_cpu_tlb_fill(CPUState *cs, vaddr address, int size, phys_addr &= TARGET_PAGE_MASK; address &= TARGET_PAGE_MASK; } + /* Notice and record tagged memory. */ + if (cpu_isar_feature(aa64_mte, cpu) && cacheattrs.attrs == 0xf0) { + arm_tlb_mte_tagged(&attrs) = true; + } + tlb_set_page_with_attrs(cs, address, phys_addr, attrs, prot, mmu_idx, page_size); return true; diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index a0e72ad694..73d753f11f 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -204,20 +204,20 @@ static void gen_a64_set_pc(DisasContext *s, TCGv_i64 src) } /* - * Return a "clean" address for ADDR according to TBID. - * This is always a fresh temporary, as we need to be able to - * increment this independently of a dirty write-back address. + * Handle MTE and/or TBI. + * + * For TBI, ideally, we would do nothing. Proper behaviour on fault is + * for the tag to be present in the FAR_ELx register. But for user-only + * mode we do not have a TLB with which to implement this, so we must + * remove the top byte now. + * + * Always return a fresh temporary that we can increment independently + * of the write-back address. */ -static TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) + +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) { TCGv_i64 clean = new_tmp_a64(s); - /* - * In order to get the correct value in the FAR_ELx register, - * we must present the memory subsystem with the "dirty" address - * including the TBI. In system mode we can make this work via - * the TLB, dropping the TBI during translation. But for user-only - * mode we don't have that option, and must remove the top byte now. - */ #ifdef CONFIG_USER_ONLY gen_top_byte_ignore(s, clean, addr, s->tbid); #else @@ -226,6 +226,92 @@ static TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr) return clean; } +/* Insert a zero tag into src, with the result at dst. */ +static void gen_address_with_allocation_tag0(TCGv_i64 dst, TCGv_i64 src) +{ + tcg_gen_andi_i64(dst, src, ~MAKE_64BIT_MASK(56, 4)); +} + +static void gen_probe_access(DisasContext *s, TCGv_i64 ptr, + MMUAccessType acc, int log2_size) +{ + TCGv_i32 t_acc = tcg_const_i32(acc); + TCGv_i32 t_idx = tcg_const_i32(get_mem_index(s)); + TCGv_i32 t_size = tcg_const_i32(1 << log2_size); + + gen_helper_probe_access(cpu_env, ptr, t_acc, t_idx, t_size); + tcg_temp_free_i32(t_acc); + tcg_temp_free_i32(t_idx); + tcg_temp_free_i32(t_size); +} + +/* + * For MTE, check a single logical or atomic access. This probes a single + * address, the exact one specified. The size and alignment of the access + * is not relevant to MTE, per se, but watchpoints do require the size, + * and we want to recognize those before making any other changes to state. 
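+ *
+ * As a sketch of how this is used (the MTEDESC field layout lives in
+ * internals.h and is only summarised here), the translator packs
+ *     MIDX  - mmu index for the access
+ *     TBI   - which halves of the address space use top-byte-ignore
+ *     TCMA  - which halves allow unchecked match-all pointer tags
+ *     WRITE - load vs store
+ *     ESIZE - size of this single access in bytes
+ * into a 32-bit descriptor; the mte_check1 helper then fetches and compares
+ * the allocation tag at run time and returns the address to use for the
+ * memory operation.  Callers look like, e.g.
+ *     clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size);
+ * and fall back to plain clean_data_tbi() behaviour when no check is needed.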
+ */ +static TCGv_i64 gen_mte_check1_mmuidx(DisasContext *s, TCGv_i64 addr, + bool is_write, bool tag_checked, + int log2_size, bool is_unpriv, + int core_idx) +{ + if (tag_checked && s->mte_active[is_unpriv]) { + TCGv_i32 tcg_desc; + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, core_idx); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, ESIZE, 1 << log2_size); + tcg_desc = tcg_const_i32(desc); + + ret = new_tmp_a64(s); + gen_helper_mte_check1(ret, cpu_env, tcg_desc, addr); + tcg_temp_free_i32(tcg_desc); + + return ret; + } + return clean_data_tbi(s, addr); +} + +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size) +{ + return gen_mte_check1_mmuidx(s, addr, is_write, tag_checked, log2_size, + false, get_mem_index(s)); +} + +/* + * For MTE, check multiple logical sequential accesses. + */ +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_esize, int total_size) +{ + if (tag_checked && s->mte_active[0] && total_size != (1 << log2_esize)) { + TCGv_i32 tcg_desc; + TCGv_i64 ret; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, ESIZE, 1 << log2_esize); + desc = FIELD_DP32(desc, MTEDESC, TSIZE, total_size); + tcg_desc = tcg_const_i32(desc); + + ret = new_tmp_a64(s); + gen_helper_mte_checkN(ret, cpu_env, tcg_desc, addr); + tcg_temp_free_i32(tcg_desc); + + return ret; + } + return gen_mte_check1(s, addr, is_write, tag_checked, log2_esize); +} + typedef struct DisasCompare64 { TCGCond cond; TCGv_i64 value; @@ -1616,7 +1702,28 @@ static void handle_msr_i(DisasContext *s, uint32_t insn, gen_helper_msr_i_daifclear(cpu_env, t1); tcg_temp_free_i32(t1); /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */ - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; + break; + + case 0x1c: /* TCO */ + if (dc_isar_feature(aa64_mte, s)) { + /* Full MTE is enabled -- set the TCO bit as directed. */ + if (crm & 1) { + set_pstate_bits(PSTATE_TCO); + } else { + clear_pstate_bits(PSTATE_TCO); + } + t1 = tcg_const_i32(s->current_el); + gen_helper_rebuild_hflags_a64(cpu_env, t1); + tcg_temp_free_i32(t1); + /* Many factors, including TCO, go into MTE_ACTIVE. */ + s->base.is_jmp = DISAS_UPDATE_NOCHAIN; + } else if (dc_isar_feature(aa64_mte_insn_reg, s)) { + /* Only "instructions accessible at EL0" -- PSTATE.TCO is WI. */ + s->base.is_jmp = DISAS_NEXT; + } else { + goto do_unallocated; + } break; default: @@ -1750,9 +1857,62 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread, return; case ARM_CP_DC_ZVA: /* Writes clear the aligned block of memory which rt points into. 
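     * With MTE active, the mte_check_zva helper is used instead of a plain
     * clean_data_tbi: it is expected to check the tags for the whole block
     * up front and hand back the address for dc_zva to use; without MTE,
     * only top-byte-ignore stripping is required.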
*/ - tcg_rt = clean_data_tbi(s, cpu_reg(s, rt)); + if (s->mte_active[0]) { + TCGv_i32 t_desc; + int desc = 0; + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + t_desc = tcg_const_i32(desc); + + tcg_rt = new_tmp_a64(s); + gen_helper_mte_check_zva(tcg_rt, cpu_env, t_desc, cpu_reg(s, rt)); + tcg_temp_free_i32(t_desc); + } else { + tcg_rt = clean_data_tbi(s, cpu_reg(s, rt)); + } gen_helper_dc_zva(cpu_env, tcg_rt); return; + case ARM_CP_DC_GVA: + { + TCGv_i64 clean_addr, tag; + + /* + * DC_GVA, like DC_ZVA, requires that we supply the original + * pointer for an invalid page. Probe that address first. + */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_probe_access(s, clean_addr, MMU_DATA_STORE, MO_8); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. */ + tag = tcg_temp_new_i64(); + tcg_gen_shri_i64(tag, tcg_rt, 56); + gen_helper_stzgm_tags(cpu_env, clean_addr, tag); + tcg_temp_free_i64(tag); + } + } + return; + case ARM_CP_DC_GZVA: + { + TCGv_i64 clean_addr, tag; + + /* For DC_GZVA, we can rely on DC_ZVA for the proper fault. */ + tcg_rt = cpu_reg(s, rt); + clean_addr = clean_data_tbi(s, tcg_rt); + gen_helper_dc_zva(cpu_env, clean_addr); + + if (s->ata) { + /* Extract the tag from the register to match STZGM. */ + tag = tcg_temp_new_i64(); + tcg_gen_shri_i64(tag, tcg_rt, 56); + gen_helper_stzgm_tags(cpu_env, clean_addr, tag); + tcg_temp_free_i64(tag); + } + } + return; default: break; } @@ -1795,7 +1955,7 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread, if ((tb_cflags(s->base.tb) & CF_USE_ICOUNT) && (ri->type & ARM_CP_IO)) { /* I/O operations must end the TB here (whether read or write) */ - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } if (!isread && !(ri->type & ARM_CP_SUPPRESS_TB_END)) { /* @@ -1810,7 +1970,7 @@ static void handle_sys(DisasContext *s, uint32_t insn, bool isread, * but allow this to be suppressed by the register definition * (usually only necessary to work around guest bugs). */ - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } } @@ -2327,7 +2487,7 @@ static void gen_compare_and_swap(DisasContext *s, int rs, int rt, if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size); tcg_gen_atomic_cmpxchg_i64(tcg_rs, clean_addr, tcg_rs, tcg_rt, memidx, size | MO_ALIGN | s->be_data); } @@ -2345,7 +2505,9 @@ static void gen_compare_and_swap_pair(DisasContext *s, int rs, int rt, if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + + /* This is a single atomic access, despite the "pair". 
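+      * Hence the MTE/TBI check below uses size + 1: both registers are
+      * covered by one tag-checked access of 2 << size bytes (e.g. CASP of
+      * X registers, size == 3, checks a single 16-byte access).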
*/ + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), true, rn != 31, size + 1); if (size == 2) { TCGv_i64 cmp = tcg_temp_new_i64(); @@ -2470,7 +2632,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, false); return; @@ -2479,7 +2642,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); s->is_ldex = true; gen_load_exclusive(s, rt, rt2, clean_addr, size, false); if (is_lasr) { @@ -2499,7 +2663,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) gen_check_sp_alignment(s); } tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); do_gpr_st(s, cpu_reg(s, rt), clean_addr, size, true, rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); return; @@ -2515,7 +2680,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); do_gpr_ld(s, cpu_reg(s, rt), clean_addr, size, false, false, true, rt, disas_ldst_compute_iss_sf(size, false, 0), is_lasr); tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ); @@ -2529,7 +2695,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (is_lasr) { tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + true, rn != 31, size); gen_store_exclusive(s, rs, rt, rt2, clean_addr, size, true); return; } @@ -2547,7 +2714,8 @@ static void disas_ldst_excl(DisasContext *s, uint32_t insn) if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), + false, rn != 31, size); s->is_ldex = true; gen_load_exclusive(s, rt, rt2, clean_addr, size, true); if (is_lasr) { @@ -2650,7 +2818,7 @@ static void disas_ld_lit(DisasContext *s, uint32_t insn) * +-----+-------+---+---+-------+---+-------+-------+------+------+ * * opc: LDP/STP/LDNP/STNP 00 -> 32 bit, 10 -> 64 bit - * LDPSW 01 + * LDPSW/STGP 01 * LDP/STP/LDNP/STNP (SIMD) 00 -> 32 bit, 01 -> 64 bit, 10 -> 128 bit * V: 0 -> GPR, 1 -> Vector * idx: 00 -> signed offset with non-temporal hint, 01 -> post-index, @@ -2675,6 +2843,7 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) bool is_signed = false; bool postindex = false; bool wback = false; + bool set_tag = false; TCGv_i64 clean_addr, dirty_addr; @@ -2687,6 +2856,14 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) if (is_vector) { size = 2 + opc; + } else if (opc == 1 && !is_load) { + /* STGP */ + if (!dc_isar_feature(aa64_mte_insn_reg, s) || index == 0) { + unallocated_encoding(s); + return; + } + size = 3; + set_tag = true; } else { size = 2 + extract32(opc, 1, 1); is_signed = extract32(opc, 0, 1); @@ -2727,7 +2904,7 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) return; } - offset <<= size; + offset <<= (set_tag ? 
LOG2_TAG_GRANULE : size); if (rn == 31) { gen_check_sp_alignment(s); @@ -2737,7 +2914,24 @@ static void disas_ldst_pair(DisasContext *s, uint32_t insn) if (!postindex) { tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); } - clean_addr = clean_data_tbi(s, dirty_addr); + + if (set_tag) { + if (!s->ata) { + /* + * TODO: We could rely on the stores below, at least for + * system mode, if we arrange to add MO_ALIGN_16. + */ + gen_helper_stg_stub(cpu_env, dirty_addr); + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + gen_helper_stg_parallel(cpu_env, dirty_addr, dirty_addr); + } else { + gen_helper_stg(cpu_env, dirty_addr, dirty_addr); + } + } + + clean_addr = gen_mte_checkN(s, dirty_addr, !is_load, + (wback || rn != 31) && !set_tag, + size, 2 << size); if (is_vector) { if (is_load) { @@ -2818,6 +3012,7 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, bool iss_valid = !is_vector; bool post_index; bool writeback; + int memidx; TCGv_i64 clean_addr, dirty_addr; @@ -2875,7 +3070,11 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, if (!post_index) { tcg_gen_addi_i64(dirty_addr, dirty_addr, imm9); } - clean_addr = clean_data_tbi(s, dirty_addr); + + memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s); + clean_addr = gen_mte_check1_mmuidx(s, dirty_addr, is_store, + writeback || rn != 31, + size, is_unpriv, memidx); if (is_vector) { if (is_store) { @@ -2885,7 +3084,6 @@ static void disas_ldst_reg_imm9(DisasContext *s, uint32_t insn, } } else { TCGv_i64 tcg_rt = cpu_reg(s, rt); - int memidx = is_unpriv ? get_a64_user_mem_index(s) : get_mem_index(s); bool iss_sf = disas_ldst_compute_iss_sf(size, is_signed, opc); if (is_store) { @@ -2982,7 +3180,7 @@ static void disas_ldst_reg_roffset(DisasContext *s, uint32_t insn, ext_and_shift_reg(tcg_rm, tcg_rm, opt, shift ? size : 0); tcg_gen_add_i64(dirty_addr, dirty_addr, tcg_rm); - clean_addr = clean_data_tbi(s, dirty_addr); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, true, size); if (is_vector) { if (is_store) { @@ -3067,7 +3265,7 @@ static void disas_ldst_reg_unsigned_imm(DisasContext *s, uint32_t insn, dirty_addr = read_cpu_reg_sp(s, rn, 1); offset = imm12 << size; tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); - clean_addr = clean_data_tbi(s, dirty_addr); + clean_addr = gen_mte_check1(s, dirty_addr, is_store, rn != 31, size); if (is_vector) { if (is_store) { @@ -3160,7 +3358,7 @@ static void disas_ldst_atomic(DisasContext *s, uint32_t insn, if (rn == 31) { gen_check_sp_alignment(s); } - clean_addr = clean_data_tbi(s, cpu_reg_sp(s, rn)); + clean_addr = gen_mte_check1(s, cpu_reg_sp(s, rn), false, rn != 31, size); if (o3_opc == 014) { /* @@ -3237,7 +3435,8 @@ static void disas_ldst_pac(DisasContext *s, uint32_t insn, tcg_gen_addi_i64(dirty_addr, dirty_addr, offset); /* Note that "clean" and "dirty" here refer to TBI not PAC. 
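     * The gen_mte_check1 call therefore still returns a fresh "clean"
     * address for the load itself, while the authenticated dirty_addr is
     * kept unchanged for the optional write-back to Rn.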
*/ - clean_addr = clean_data_tbi(s, dirty_addr); + clean_addr = gen_mte_check1(s, dirty_addr, false, + is_wback || rn != 31, size); tcg_rt = cpu_reg(s, rt); do_gpr_ld(s, tcg_rt, clean_addr, size, /* is_signed */ false, @@ -3399,7 +3598,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; MemOp endian = s->be_data; - int ebytes; /* bytes per element */ + int total; /* total bytes */ int elements; /* elements per vector */ int rpt; /* num iterations */ int selem; /* structure elements */ @@ -3469,19 +3668,26 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) endian = MO_LE; } - /* Consecutive little-endian elements from a single register + total = rpt * selem * (is_q ? 16 : 8); + tcg_rn = cpu_reg_sp(s, rn); + + /* + * Issue the MTE check vs the logical repeat count, before we + * promote consecutive little-endian elements below. + */ + clean_addr = gen_mte_checkN(s, tcg_rn, is_store, is_postidx || rn != 31, + size, total); + + /* + * Consecutive little-endian elements from a single register * can be promoted to a larger little-endian operation. */ if (selem == 1 && endian == MO_LE) { size = 3; } - ebytes = 1 << size; - elements = (is_q ? 16 : 8) / ebytes; - - tcg_rn = cpu_reg_sp(s, rn); - clean_addr = clean_data_tbi(s, tcg_rn); - tcg_ebytes = tcg_const_i64(ebytes); + elements = (is_q ? 16 : 8) >> size; + tcg_ebytes = tcg_const_i64(1 << size); for (r = 0; r < rpt; r++) { int e; for (e = 0; e < elements; e++) { @@ -3515,7 +3721,7 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn) if (is_postidx) { if (rm == 31) { - tcg_gen_addi_i64(tcg_rn, tcg_rn, rpt * elements * selem * ebytes); + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); } else { tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); } @@ -3561,7 +3767,7 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) int selem = (extract32(opc, 0, 1) << 1 | R) + 1; bool replicate = false; int index = is_q << 3 | S << 2 | size; - int ebytes, xs; + int xs, total; TCGv_i64 clean_addr, tcg_rn, tcg_ebytes; if (extract32(insn, 31, 1)) { @@ -3615,16 +3821,17 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) return; } - ebytes = 1 << scale; - if (rn == 31) { gen_check_sp_alignment(s); } + total = selem << scale; tcg_rn = cpu_reg_sp(s, rn); - clean_addr = clean_data_tbi(s, tcg_rn); - tcg_ebytes = tcg_const_i64(ebytes); + clean_addr = gen_mte_checkN(s, tcg_rn, !is_load, is_postidx || rn != 31, + scale, total); + + tcg_ebytes = tcg_const_i64(1 << scale); for (xs = 0; xs < selem; xs++) { if (replicate) { /* Load and replicate to all elements */ @@ -3651,13 +3858,216 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn) if (is_postidx) { if (rm == 31) { - tcg_gen_addi_i64(tcg_rn, tcg_rn, selem * ebytes); + tcg_gen_addi_i64(tcg_rn, tcg_rn, total); } else { tcg_gen_add_i64(tcg_rn, tcg_rn, cpu_reg(s, rm)); } } } +/* + * Load/Store memory tags + * + * 31 30 29 24 22 21 12 10 5 0 + * +-----+-------------+-----+---+------+-----+------+------+ + * | 1 1 | 0 1 1 0 0 1 | op1 | 1 | imm9 | op2 | Rn | Rt | + * +-----+-------------+-----+---+------+-----+------+------+ + */ +static void disas_ldst_tag(DisasContext *s, uint32_t insn) +{ + int rt = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + uint64_t offset = sextract64(insn, 12, 9) << LOG2_TAG_GRANULE; + int op2 = extract32(insn, 10, 2); + int op1 = extract32(insn, 22, 2); + bool is_load = false, is_pair = false, is_zero = false, is_mult = false; + int 
index = 0; + TCGv_i64 addr, clean_addr, tcg_rt; + + /* We checked insn bits [29:24,21] in the caller. */ + if (extract32(insn, 30, 2) != 3) { + goto do_unallocated; + } + + /* + * @index is a tri-state variable which has 3 states: + * < 0 : post-index, writeback + * = 0 : signed offset + * > 0 : pre-index, writeback + */ + switch (op1) { + case 0: + if (op2 != 0) { + /* STG */ + index = op2 - 2; + } else { + /* STZGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_zero = true; + } + break; + case 1: + if (op2 != 0) { + /* STZG */ + is_zero = true; + index = op2 - 2; + } else { + /* LDG */ + is_load = true; + } + break; + case 2: + if (op2 != 0) { + /* ST2G */ + is_pair = true; + index = op2 - 2; + } else { + /* STGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = true; + } + break; + case 3: + if (op2 != 0) { + /* STZ2G */ + is_pair = is_zero = true; + index = op2 - 2; + } else { + /* LDGM */ + if (s->current_el == 0 || offset != 0) { + goto do_unallocated; + } + is_mult = is_load = true; + } + break; + + default: + do_unallocated: + unallocated_encoding(s); + return; + } + + if (is_mult + ? !dc_isar_feature(aa64_mte, s) + : !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + + if (rn == 31) { + gen_check_sp_alignment(s); + } + + addr = read_cpu_reg_sp(s, rn, true); + if (index >= 0) { + /* pre-index or signed offset */ + tcg_gen_addi_i64(addr, addr, offset); + } + + if (is_mult) { + tcg_rt = cpu_reg(s, rt); + + if (is_zero) { + int size = 4 << s->dcz_blocksize; + + if (s->ata) { + gen_helper_stzgm_tags(cpu_env, addr, tcg_rt); + } + /* + * The non-tags portion of STZGM is mostly like DC_ZVA, + * except the alignment happens before the access. + */ + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_helper_dc_zva(cpu_env, clean_addr); + } else if (s->ata) { + if (is_load) { + gen_helper_ldgm(tcg_rt, cpu_env, addr); + } else { + gen_helper_stgm(cpu_env, addr, tcg_rt); + } + } else { + MMUAccessType acc = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE; + int size = 4 << GMID_EL1_BS; + + clean_addr = clean_data_tbi(s, addr); + tcg_gen_andi_i64(clean_addr, clean_addr, -size); + gen_probe_access(s, clean_addr, acc, size); + + if (is_load) { + /* The result tags are zeros. */ + tcg_gen_movi_i64(tcg_rt, 0); + } + } + return; + } + + if (is_load) { + tcg_gen_andi_i64(addr, addr, -TAG_GRANULE); + tcg_rt = cpu_reg(s, rt); + if (s->ata) { + gen_helper_ldg(tcg_rt, cpu_env, addr, tcg_rt); + } else { + clean_addr = clean_data_tbi(s, addr); + gen_probe_access(s, clean_addr, MMU_DATA_LOAD, MO_8); + gen_address_with_allocation_tag0(tcg_rt, addr); + } + } else { + tcg_rt = cpu_reg_sp(s, rt); + if (!s->ata) { + /* + * For STG and ST2G, we need to check alignment and probe memory. + * TODO: For STZG and STZ2G, we could rely on the stores below, + * at least for system mode; user-only won't enforce alignment. 
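+         * When ATA is clear the tag write itself is a no-op, so the
+         * stg_stub/st2g_stub helpers are only expected to perform that
+         * alignment check and a probe of the (writable) tag granule(s);
+         * no allocation tags are read or written on this path.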
+ */ + if (is_pair) { + gen_helper_st2g_stub(cpu_env, addr); + } else { + gen_helper_stg_stub(cpu_env, addr); + } + } else if (tb_cflags(s->base.tb) & CF_PARALLEL) { + if (is_pair) { + gen_helper_st2g_parallel(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg_parallel(cpu_env, addr, tcg_rt); + } + } else { + if (is_pair) { + gen_helper_st2g(cpu_env, addr, tcg_rt); + } else { + gen_helper_stg(cpu_env, addr, tcg_rt); + } + } + } + + if (is_zero) { + TCGv_i64 clean_addr = clean_data_tbi(s, addr); + TCGv_i64 tcg_zero = tcg_const_i64(0); + int mem_index = get_mem_index(s); + int i, n = (1 + is_pair) << LOG2_TAG_GRANULE; + + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, + MO_Q | MO_ALIGN_16); + for (i = 8; i < n; i += 8) { + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_gen_qemu_st_i64(tcg_zero, clean_addr, mem_index, MO_Q); + } + tcg_temp_free_i64(tcg_zero); + } + + if (index != 0) { + /* pre-index or post-index */ + if (index < 0) { + /* post-index */ + tcg_gen_addi_i64(addr, addr, offset); + } + tcg_gen_mov_i64(cpu_reg_sp(s, rn), addr); + } +} + /* Loads and stores */ static void disas_ldst(DisasContext *s, uint32_t insn) { @@ -3682,13 +4092,14 @@ static void disas_ldst(DisasContext *s, uint32_t insn) case 0x0d: /* AdvSIMD load/store single structure */ disas_ldst_single_struct(s, insn); break; - case 0x19: /* LDAPR/STLR (unscaled immediate) */ - if (extract32(insn, 10, 2) != 0 || - extract32(insn, 21, 1) != 0) { + case 0x19: + if (extract32(insn, 21, 1) != 0) { + disas_ldst_tag(s, insn); + } else if (extract32(insn, 10, 2) == 0) { + disas_ldst_ldapr_stlr(s, insn); + } else { unallocated_encoding(s); - break; } - disas_ldst_ldapr_stlr(s, insn); break; default: unallocated_encoding(s); @@ -3727,22 +4138,22 @@ static void disas_pc_rel_adr(DisasContext *s, uint32_t insn) /* * Add/subtract (immediate) * - * 31 30 29 28 24 23 22 21 10 9 5 4 0 - * +--+--+--+-----------+-----+-------------+-----+-----+ - * |sf|op| S| 1 0 0 0 1 |shift| imm12 | Rn | Rd | - * +--+--+--+-----------+-----+-------------+-----+-----+ + * 31 30 29 28 23 22 21 10 9 5 4 0 + * +--+--+--+-------------+--+-------------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 0 |sh| imm12 | Rn | Rd | + * +--+--+--+-------------+--+-------------+-----+-----+ * * sf: 0 -> 32bit, 1 -> 64bit * op: 0 -> add , 1 -> sub * S: 1 -> set flags - * shift: 00 -> LSL imm by 0, 01 -> LSL imm by 12 + * sh: 1 -> LSL imm by 12 */ static void disas_add_sub_imm(DisasContext *s, uint32_t insn) { int rd = extract32(insn, 0, 5); int rn = extract32(insn, 5, 5); uint64_t imm = extract32(insn, 10, 12); - int shift = extract32(insn, 22, 2); + bool shift = extract32(insn, 22, 1); bool setflags = extract32(insn, 29, 1); bool sub_op = extract32(insn, 30, 1); bool is_64bit = extract32(insn, 31, 1); @@ -3751,15 +4162,8 @@ static void disas_add_sub_imm(DisasContext *s, uint32_t insn) TCGv_i64 tcg_rd = setflags ? 
cpu_reg(s, rd) : cpu_reg_sp(s, rd); TCGv_i64 tcg_result; - switch (shift) { - case 0x0: - break; - case 0x1: + if (shift) { imm <<= 12; - break; - default: - unallocated_encoding(s); - return; } tcg_result = tcg_temp_new_i64(); @@ -3788,6 +4192,54 @@ static void disas_add_sub_imm(DisasContext *s, uint32_t insn) tcg_temp_free_i64(tcg_result); } +/* + * Add/subtract (immediate, with tags) + * + * 31 30 29 28 23 22 21 16 14 10 9 5 4 0 + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * |sf|op| S| 1 0 0 0 1 1 |o2| uimm6 |o3| uimm4 | Rn | Rd | + * +--+--+--+-------------+--+---------+--+-------+-----+-----+ + * + * op: 0 -> add, 1 -> sub + */ +static void disas_add_sub_imm_with_tags(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int uimm4 = extract32(insn, 10, 4); + int uimm6 = extract32(insn, 16, 6); + bool sub_op = extract32(insn, 30, 1); + TCGv_i64 tcg_rn, tcg_rd; + int imm; + + /* Test all of sf=1, S=0, o2=0, o3=0. */ + if ((insn & 0xa040c000u) != 0x80000000u || + !dc_isar_feature(aa64_mte_insn_reg, s)) { + unallocated_encoding(s); + return; + } + + imm = uimm6 << LOG2_TAG_GRANULE; + if (sub_op) { + imm = -imm; + } + + tcg_rn = cpu_reg_sp(s, rn); + tcg_rd = cpu_reg_sp(s, rd); + + if (s->ata) { + TCGv_i32 offset = tcg_const_i32(imm); + TCGv_i32 tag_offset = tcg_const_i32(uimm4); + + gen_helper_addsubg(tcg_rd, cpu_env, tcg_rn, offset, tag_offset); + tcg_temp_free_i32(tag_offset); + tcg_temp_free_i32(offset); + } else { + tcg_gen_addi_i64(tcg_rd, tcg_rn, imm); + gen_address_with_allocation_tag0(tcg_rd, tcg_rd); + } +} + /* The input should be a value in the bottom e bits (with higher * bits zero); returns that value replicated into every element * of size e in a 64 bit integer. @@ -4147,9 +4599,12 @@ static void disas_data_proc_imm(DisasContext *s, uint32_t insn) case 0x20: case 0x21: /* PC-rel. 
addressing */ disas_pc_rel_adr(s, insn); break; - case 0x22: case 0x23: /* Add/subtract (immediate) */ + case 0x22: /* Add/subtract (immediate) */ disas_add_sub_imm(s, insn); break; + case 0x23: /* Add/subtract (immediate, with tags) */ + disas_add_sub_imm_with_tags(s, insn); + break; case 0x24: /* Logical (immediate) */ disas_logic_imm(s, insn); break; @@ -5244,25 +5699,72 @@ static void handle_crc32(DisasContext *s, */ static void disas_data_proc_2src(DisasContext *s, uint32_t insn) { - unsigned int sf, rm, opcode, rn, rd; + unsigned int sf, rm, opcode, rn, rd, setflag; sf = extract32(insn, 31, 1); + setflag = extract32(insn, 29, 1); rm = extract32(insn, 16, 5); opcode = extract32(insn, 10, 6); rn = extract32(insn, 5, 5); rd = extract32(insn, 0, 5); - if (extract32(insn, 29, 1)) { + if (setflag && opcode != 0) { unallocated_encoding(s); return; } switch (opcode) { + case 0: /* SUBP(S) */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 tcg_n, tcg_m, tcg_d; + + tcg_n = read_cpu_reg_sp(s, rn, true); + tcg_m = read_cpu_reg_sp(s, rm, true); + tcg_gen_sextract_i64(tcg_n, tcg_n, 0, 56); + tcg_gen_sextract_i64(tcg_m, tcg_m, 0, 56); + tcg_d = cpu_reg(s, rd); + + if (setflag) { + gen_sub_CC(true, tcg_d, tcg_n, tcg_m); + } else { + tcg_gen_sub_i64(tcg_d, tcg_n, tcg_m); + } + } + break; case 2: /* UDIV */ handle_div(s, false, sf, rm, rn, rd); break; case 3: /* SDIV */ handle_div(s, true, sf, rm, rn, rd); break; + case 4: /* IRG */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } + if (s->ata) { + gen_helper_irg(cpu_reg_sp(s, rd), cpu_env, + cpu_reg_sp(s, rn), cpu_reg(s, rm)); + } else { + gen_address_with_allocation_tag0(cpu_reg_sp(s, rd), + cpu_reg_sp(s, rn)); + } + break; + case 5: /* GMI */ + if (sf == 0 || !dc_isar_feature(aa64_mte_insn_reg, s)) { + goto do_unallocated; + } else { + TCGv_i64 t1 = tcg_const_i64(1); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_extract_i64(t2, cpu_reg_sp(s, rn), 56, 4); + tcg_gen_shl_i64(t1, t1, t2); + tcg_gen_or_i64(cpu_reg(s, rd), cpu_reg(s, rm), t1); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); + } + break; case 8: /* LSLV */ handle_shift_reg(s, A64_SHIFT_TYPE_LSL, sf, rm, rn, rd); break; @@ -9534,7 +10036,7 @@ static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, TCGv_i64 tcg_op = tcg_temp_new_i64(); TCGv_i64 tcg_zero = tcg_const_i64(0); TCGv_i64 tcg_res = tcg_temp_new_i64(); - NeonGenTwoDoubleOPFn *genfn; + NeonGenTwoDoubleOpFn *genfn; bool swap = false; int pass; @@ -9576,7 +10078,7 @@ static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, TCGv_i32 tcg_op = tcg_temp_new_i32(); TCGv_i32 tcg_zero = tcg_const_i32(0); TCGv_i32 tcg_res = tcg_temp_new_i32(); - NeonGenTwoSingleOPFn *genfn; + NeonGenTwoSingleOpFn *genfn; bool swap = false; int pass, maxpasses; @@ -11370,18 +11872,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genfn(tcg_res, tcg_op1, tcg_op2); } - if (opcode == 0xf) { - /* SABA, UABA: accumulating ops */ - static NeonGenTwoOpFn * const fns[3] = { - gen_helper_neon_add_u8, - gen_helper_neon_add_u16, - tcg_gen_add_i32, - }; - - read_vec_element_i32(s, tcg_op1, rd, pass, MO_32); - fns[size](tcg_res, tcg_op1, tcg_res); - } - write_vec_element_i32(s, tcg_res, rd, pass, MO_32); tcg_temp_free_i32(tcg_res); @@ -11917,8 +12407,8 @@ static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u, } else { for (pass = 0; pass < maxpass; pass++) { TCGv_i64 tcg_op = tcg_temp_new_i64(); - NeonGenOneOpFn *genfn; - 
static NeonGenOneOpFn * const fns[2][2] = { + NeonGenOne64OpFn *genfn; + static NeonGenOne64OpFn * const fns[2][2] = { { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 }, { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 }, }; @@ -13983,7 +14473,7 @@ static bool is_guarded_page(CPUARMState *env, DisasContext *s) * table entry even for that case. */ return (tlb_hit(entry->addr_code, addr) && - env_tlb(env)->d[mmu_idx].iotlb[index].attrs.target_tlb_bit0); + arm_tlb_bti_gp(&env_tlb(env)->d[mmu_idx].iotlb[index].attrs)); #endif } @@ -14162,6 +14652,7 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->mmu_idx = core_to_aa64_mmu_idx(core_mmu_idx); dc->tbii = FIELD_EX32(tb_flags, TBFLAG_A64, TBII); dc->tbid = FIELD_EX32(tb_flags, TBFLAG_A64, TBID); + dc->tcma = FIELD_EX32(tb_flags, TBFLAG_A64, TCMA); dc->current_el = arm_mmu_idx_to_el(dc->mmu_idx); #if !defined(CONFIG_USER_ONLY) dc->user = (dc->current_el == 0); @@ -14173,10 +14664,19 @@ static void aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->bt = FIELD_EX32(tb_flags, TBFLAG_A64, BT); dc->btype = FIELD_EX32(tb_flags, TBFLAG_A64, BTYPE); dc->unpriv = FIELD_EX32(tb_flags, TBFLAG_A64, UNPRIV); + dc->ata = FIELD_EX32(tb_flags, TBFLAG_A64, ATA); + dc->mte_active[0] = FIELD_EX32(tb_flags, TBFLAG_A64, MTE_ACTIVE); + dc->mte_active[1] = FIELD_EX32(tb_flags, TBFLAG_A64, MTE0_ACTIVE); dc->vec_len = 0; dc->vec_stride = 0; dc->cp_regs = arm_cpu->cp_regs; dc->features = env->features; + dc->dcz_blocksize = arm_cpu->dcz_blocksize; + +#ifdef CONFIG_USER_ONLY + /* In sve_probe_page, we assume TBI is enabled. */ + tcg_debug_assert(dc->tbid & 1); +#endif /* Single step state. The code-generation logic here is: * SS_ACTIVE == 0: @@ -14304,12 +14804,15 @@ static void aarch64_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) gen_goto_tb(dc, 1, dc->base.pc_next); break; default: - case DISAS_UPDATE: + case DISAS_UPDATE_EXIT: gen_a64_set_pc_im(dc->base.pc_next); /* fall through */ case DISAS_EXIT: tcg_gen_exit_tb(NULL, 0); break; + case DISAS_UPDATE_NOCHAIN: + gen_a64_set_pc_im(dc->base.pc_next); + /* fall through */ case DISAS_JUMP: tcg_gen_lookup_and_goto_ptr(); break; diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h index da0f59a2ce..49e4865918 100644 --- a/target/arm/translate-a64.h +++ b/target/arm/translate-a64.h @@ -40,6 +40,11 @@ TCGv_ptr get_fpstatus_ptr(bool); bool logic_imm_decode_wmask(uint64_t *result, unsigned int immn, unsigned int imms, unsigned int immr); bool sve_access_check(DisasContext *s); +TCGv_i64 clean_data_tbi(DisasContext *s, TCGv_i64 addr); +TCGv_i64 gen_mte_check1(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int log2_size); +TCGv_i64 gen_mte_checkN(DisasContext *s, TCGv_i64 addr, bool is_write, + bool tag_checked, int count, int log2_esize); /* We should have at some point before trying to access an FP register * done the necessary access check, so assert that diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c index a5aa56bbde..f6cb921573 100644 --- a/target/arm/translate-neon.inc.c +++ b/target/arm/translate-neon.inc.c @@ -54,6 +54,107 @@ static inline int rsub_8(DisasContext *s, int x) #include "decode-neon-ls.inc.c" #include "decode-neon-shared.inc.c" +/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE, + * where 0 is the least significant end of the register. 
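+ *
+ * On a big-endian host the least significant element of each 8-byte unit
+ * sits at the highest byte offset within it, hence the ofs ^= 8 - element_size
+ * fixup below; e.g. for MO_16 elements of a D register the byte offsets are
+ *     element:     0  1  2  3
+ *     LE offset:   0  2  4  6
+ *     BE offset:   6  4  2  0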
+ */ +static inline long +neon_element_offset(int reg, int element, MemOp size) +{ + int element_size = 1 << size; + int ofs = element * element_size; +#ifdef HOST_WORDS_BIGENDIAN + /* Calculate the offset assuming fully little-endian, + * then XOR to account for the order of the 8-byte units. + */ + if (element_size < 8) { + ofs ^= 8 - element_size; + } +#endif + return neon_reg_offset(reg, 0) + ofs; +} + +static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i32(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i32(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop) +{ + long offset = neon_element_offset(reg, ele, mop & MO_SIZE); + + switch (mop) { + case MO_UB: + tcg_gen_ld8u_i64(var, cpu_env, offset); + break; + case MO_UW: + tcg_gen_ld16u_i64(var, cpu_env, offset); + break; + case MO_UL: + tcg_gen_ld32u_i64(var, cpu_env, offset); + break; + case MO_Q: + tcg_gen_ld_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i32(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i32(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st_i32(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + +static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var) +{ + long offset = neon_element_offset(reg, ele, size); + + switch (size) { + case MO_8: + tcg_gen_st8_i64(var, cpu_env, offset); + break; + case MO_16: + tcg_gen_st16_i64(var, cpu_env, offset); + break; + case MO_32: + tcg_gen_st32_i64(var, cpu_env, offset); + break; + case MO_64: + tcg_gen_st_i64(var, cpu_env, offset); + break; + default: + g_assert_not_reached(); + } +} + static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a) { int opr_sz; @@ -1664,7 +1765,7 @@ static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a) } static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a, - NeonGenTwoSingleOPFn *fn) + NeonGenTwoSingleOpFn *fn) { /* FP operations in 2-reg-and-shift group */ TCGv_i32 tmp, shiftv; @@ -2970,3 +3071,1091 @@ static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a) a->q ? 16 : 8, a->q ? 16 : 8); return true; } + +static bool trans_VREV64(DisasContext *s, arg_VREV64 *a) +{ + int pass, half; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + for (pass = 0; pass < (a->q ? 
2 : 1); pass++) { + TCGv_i32 tmp[2]; + + for (half = 0; half < 2; half++) { + tmp[half] = neon_load_reg(a->vm, pass * 2 + half); + switch (a->size) { + case 0: + tcg_gen_bswap32_i32(tmp[half], tmp[half]); + break; + case 1: + gen_swap_half(tmp[half], tmp[half]); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + } + neon_store_reg(a->vd, pass * 2, tmp[1]); + neon_store_reg(a->vd, pass * 2 + 1, tmp[0]); + } + return true; +} + +static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a, + NeonGenWidenFn *widenfn, + NeonGenTwo64OpFn *opfn, + NeonGenTwo64OpFn *accfn) +{ + /* + * Pairwise long operations: widen both halves of the pair, + * combine the pairs with the opfn, and then possibly accumulate + * into the destination with the accfn. + */ + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + for (pass = 0; pass < a->q + 1; pass++) { + TCGv_i32 tmp; + TCGv_i64 rm0_64, rm1_64, rd_64; + + rm0_64 = tcg_temp_new_i64(); + rm1_64 = tcg_temp_new_i64(); + rd_64 = tcg_temp_new_i64(); + tmp = neon_load_reg(a->vm, pass * 2); + widenfn(rm0_64, tmp); + tcg_temp_free_i32(tmp); + tmp = neon_load_reg(a->vm, pass * 2 + 1); + widenfn(rm1_64, tmp); + tcg_temp_free_i32(tmp); + opfn(rd_64, rm0_64, rm1_64); + tcg_temp_free_i64(rm0_64); + tcg_temp_free_i64(rm1_64); + + if (accfn) { + TCGv_i64 tmp64 = tcg_temp_new_i64(); + neon_load_reg64(tmp64, a->vd + pass); + accfn(rd_64, tmp64, rd_64); + tcg_temp_free_i64(tmp64); + } + neon_store_reg64(rd_64, a->vd + pass); + tcg_temp_free_i64(rd_64); + } + return true; +} + +static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL); +} + +static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_s8, + gen_helper_neon_widen_s16, + tcg_gen_ext_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a) +{ + static NeonGenWidenFn * const widenfn[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + static NeonGenTwo64OpFn * const opfn[] = { + gen_helper_neon_paddl_u16, + 
gen_helper_neon_paddl_u32, + tcg_gen_add_i64, + NULL, + }; + static NeonGenTwo64OpFn * const accfn[] = { + gen_helper_neon_addl_u16, + gen_helper_neon_addl_u32, + tcg_gen_add_i64, + NULL, + }; + + return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], + accfn[a->size]); +} + +typedef void ZipFn(TCGv_ptr, TCGv_ptr); + +static bool do_zip_uzp(DisasContext *s, arg_2misc *a, + ZipFn *fn) +{ + TCGv_ptr pd, pm; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!fn) { + /* Bad size or size/q combination */ + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + pd = vfp_reg_ptr(true, a->vd); + pm = vfp_reg_ptr(true, a->vm); + fn(pd, pm); + tcg_temp_free_ptr(pd); + tcg_temp_free_ptr(pm); + return true; +} + +static bool trans_VUZP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_unzip8, + gen_helper_neon_unzip16, + NULL, + NULL, + }, { + gen_helper_neon_qunzip8, + gen_helper_neon_qunzip16, + gen_helper_neon_qunzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool trans_VZIP(DisasContext *s, arg_2misc *a) +{ + static ZipFn * const fn[2][4] = { + { + gen_helper_neon_zip8, + gen_helper_neon_zip16, + NULL, + NULL, + }, { + gen_helper_neon_qzip8, + gen_helper_neon_qzip16, + gen_helper_neon_qzip32, + NULL, + } + }; + return do_zip_uzp(s, a, fn[a->q][a->size]); +} + +static bool do_vmovn(DisasContext *s, arg_2misc *a, + NeonGenNarrowEnvFn *narrowfn) +{ + TCGv_i64 rm; + TCGv_i32 rd0, rd1; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vm & 1) { + return false; + } + + if (!narrowfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd0 = tcg_temp_new_i32(); + rd1 = tcg_temp_new_i32(); + + neon_load_reg64(rm, a->vm); + narrowfn(rd0, cpu_env, rm); + neon_load_reg64(rm, a->vm + 1); + narrowfn(rd1, cpu_env, rm); + neon_store_reg(a->vd, 0, rd0); + neon_store_reg(a->vd, 1, rd1); + tcg_temp_free_i64(rm); + return true; +} + +#define DO_VMOVN(INSN, FUNC) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + static NeonGenNarrowEnvFn * const narrowfn[] = { \ + FUNC##8, \ + FUNC##16, \ + FUNC##32, \ + NULL, \ + }; \ + return do_vmovn(s, a, narrowfn[a->size]); \ + } + +DO_VMOVN(VMOVN, gen_neon_narrow_u) +DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat) +DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s) +DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u) + +static bool trans_VSHLL(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 rm0, rm1; + TCGv_i64 rd; + static NeonGenWidenFn * const widenfns[] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + NULL, + }; + NeonGenWidenFn *widenfn = widenfns[a->size]; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->vd & 1) { + return false; + } + + if (!widenfn) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rd = tcg_temp_new_i64(); + + rm0 = neon_load_reg(a->vm, 0); + rm1 = neon_load_reg(a->vm, 1); + + widenfn(rd, rm0); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + neon_store_reg64(rd, a->vd); + widenfn(rd, rm1); + tcg_gen_shli_i64(rd, rd, 8 << a->size); + neon_store_reg64(rd, a->vd + 1); + + tcg_temp_free_i64(rd); + tcg_temp_free_i32(rm0); + tcg_temp_free_i32(rm1); + return true; +} + +static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vm & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = get_fpstatus_ptr(true); + ahp = get_ahp_flag(); + tmp = neon_load_reg(a->vm, 0); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp2 = neon_load_reg(a->vm, 1); + gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp); + tcg_gen_shli_i32(tmp2, tmp2, 16); + tcg_gen_or_i32(tmp2, tmp2, tmp); + tcg_temp_free_i32(tmp); + tmp = neon_load_reg(a->vm, 2); + gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); + tmp3 = neon_load_reg(a->vm, 3); + neon_store_reg(a->vd, 0, tmp2); + gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp); + tcg_gen_shli_i32(tmp3, tmp3, 16); + tcg_gen_or_i32(tmp3, tmp3, tmp); + neon_store_reg(a->vd, 1, tmp3); + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a) +{ + TCGv_ptr fpst; + TCGv_i32 ahp, tmp, tmp2, tmp3; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !dc_isar_feature(aa32_fp16_spconv, s)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd & 1) || (a->size != 1)) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = get_fpstatus_ptr(true); + ahp = get_ahp_flag(); + tmp3 = tcg_temp_new_i32(); + tmp = neon_load_reg(a->vm, 0); + tmp2 = neon_load_reg(a->vm, 1); + tcg_gen_ext16u_i32(tmp3, tmp); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + neon_store_reg(a->vd, 0, tmp3); + tcg_gen_shri_i32(tmp, tmp, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp); + neon_store_reg(a->vd, 1, tmp); + tmp3 = tcg_temp_new_i32(); + tcg_gen_ext16u_i32(tmp3, tmp2); + gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); + neon_store_reg(a->vd, 2, tmp3); + tcg_gen_shri_i32(tmp2, tmp2, 16); + gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp); + neon_store_reg(a->vd, 3, tmp2); + tcg_temp_free_i32(ahp); + tcg_temp_free_ptr(fpst); + + return true; +} + +static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn) +{ + int vec_size = a->q ? 16 : 8; + int rd_ofs = neon_reg_offset(a->vd, 0); + int rm_ofs = neon_reg_offset(a->vm, 0); + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size == 3) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size); + + return true; +} + +#define DO_2MISC_VEC(INSN, FN) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_2misc_vec(s, a, FN); \ + } + +DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg) +DO_2MISC_VEC(VABS, tcg_gen_gvec_abs) +DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0) +DO_2MISC_VEC(VCGT0, gen_gvec_cgt0) +DO_2MISC_VEC(VCLE0, gen_gvec_cle0) +DO_2MISC_VEC(VCGE0, gen_gvec_cge0) +DO_2MISC_VEC(VCLT0, gen_gvec_clt0) + +static bool trans_VMVN(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc_vec(s, a, tcg_gen_gvec_not); +} + +#define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \ + DATA, FUNC); \ + } + +#define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \ + static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \ + uint32_t rm_ofs, uint32_t oprsz, \ + uint32_t maxsz) \ + { \ + tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \ + } + +WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0) +WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1) +WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0) +WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1) +WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0) +WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0) +WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0) + +#define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \ + return false; \ + } \ + return do_2misc_vec(s, a, gen_##INSN); \ + } + +DO_2M_CRYPTO(AESE, aa32_aes, 0) +DO_2M_CRYPTO(AESD, aa32_aes, 0) +DO_2M_CRYPTO(AESMC, aa32_aes, 0) +DO_2M_CRYPTO(AESIMC, aa32_aes, 0) +DO_2M_CRYPTO(SHA1H, aa32_sha1, 2) +DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2) +DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2) + +static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn) +{ + int pass; + + /* Handle a 2-reg-misc operation by iterating 32 bits at a time */ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (!fn) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + TCGv_i32 tmp = neon_load_reg(a->vm, pass); + fn(tmp, tmp); + neon_store_reg(a->vd, pass, tmp); + } + + return true; +} + +static bool trans_VREV32(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + tcg_gen_bswap32_i32, + gen_swap_half, + NULL, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VREV16(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_rev16); +} + +static bool trans_VCLS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_cls_s8, + gen_helper_neon_cls_s16, + gen_helper_neon_cls_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm) +{ + tcg_gen_clzi_i32(rd, rm, 32); +} + +static bool trans_VCLZ(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_helper_neon_clz_u8, + gen_helper_neon_clz_u16, + do_VCLZ_32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VCNT(DisasContext *s, arg_2misc *a) +{ + if (a->size != 0) { + return false; + } + return do_2misc(s, a, gen_helper_neon_cnt_u8); +} + +static bool trans_VABS_F(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + /* TODO: FP16 : size == 1 */ + return do_2misc(s, a, gen_helper_vfp_abss); +} + +static bool trans_VNEG_F(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + /* TODO: FP16 : size == 1 */ + return do_2misc(s, a, gen_helper_vfp_negs); +} + +static bool trans_VRECPE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_recpe_u32); +} + +static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a) +{ + if (a->size != 2) { + return false; + } + return do_2misc(s, a, gen_helper_rsqrte_u32); +} + +#define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \ + { \ + FUNC(d, cpu_env, m); \ + } + +WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8) +WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16) +WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32) +WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8) +WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16) +WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32) + +static bool trans_VQABS(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQABS_s8, + gen_VQABS_s16, + gen_VQABS_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool trans_VQNEG(DisasContext *s, arg_2misc *a) +{ + static NeonGenOneOpFn * const fn[] = { + gen_VQNEG_s8, + gen_VQNEG_s16, + gen_VQNEG_s32, + NULL, + }; + return do_2misc(s, a, fn[a->size]); +} + +static bool do_2misc_fp(DisasContext *s, arg_2misc *a, + NeonGenOneSingleOpFn *fn) +{ + int pass; + TCGv_ptr fpst; + + /* Handle a 2-reg-misc operation by iterating 32 bits at a time */ + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 2) { + /* TODO: FP16 will be the size == 1 case */ + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = get_fpstatus_ptr(1); + for (pass = 0; pass < (a->q ? 
4 : 2); pass++) { + TCGv_i32 tmp = neon_load_reg(a->vm, pass); + fn(tmp, tmp, fpst); + neon_store_reg(a->vd, pass, tmp); + } + tcg_temp_free_ptr(fpst); + + return true; +} + +#define DO_2MISC_FP(INSN, FUNC) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_2misc_fp(s, a, FUNC); \ + } + +DO_2MISC_FP(VRECPE_F, gen_helper_recpe_f32) +DO_2MISC_FP(VRSQRTE_F, gen_helper_rsqrte_f32) +DO_2MISC_FP(VCVT_FS, gen_helper_vfp_sitos) +DO_2MISC_FP(VCVT_FU, gen_helper_vfp_uitos) +DO_2MISC_FP(VCVT_SF, gen_helper_vfp_tosizs) +DO_2MISC_FP(VCVT_UF, gen_helper_vfp_touizs) + +static bool trans_VRINTX(DisasContext *s, arg_2misc *a) +{ + if (!arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + return do_2misc_fp(s, a, gen_helper_rints_exact); +} + +#define WRAP_FP_CMP0_FWD(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \ + { \ + TCGv_i32 zero = tcg_const_i32(0); \ + FUNC(d, m, zero, fpst); \ + tcg_temp_free_i32(zero); \ + } +#define WRAP_FP_CMP0_REV(WRAPNAME, FUNC) \ + static void WRAPNAME(TCGv_i32 d, TCGv_i32 m, TCGv_ptr fpst) \ + { \ + TCGv_i32 zero = tcg_const_i32(0); \ + FUNC(d, zero, m, fpst); \ + tcg_temp_free_i32(zero); \ + } + +#define DO_FP_CMP0(INSN, FUNC, REV) \ + WRAP_FP_CMP0_##REV(gen_##INSN, FUNC) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_2misc_fp(s, a, gen_##INSN); \ + } + +DO_FP_CMP0(VCGT0_F, gen_helper_neon_cgt_f32, FWD) +DO_FP_CMP0(VCGE0_F, gen_helper_neon_cge_f32, FWD) +DO_FP_CMP0(VCEQ0_F, gen_helper_neon_ceq_f32, FWD) +DO_FP_CMP0(VCLE0_F, gen_helper_neon_cge_f32, REV) +DO_FP_CMP0(VCLT0_F, gen_helper_neon_cgt_f32, REV) + +static bool do_vrint(DisasContext *s, arg_2misc *a, int rmode) +{ + /* + * Handle a VRINT* operation by iterating 32 bits at a time, + * with a specified rounding mode in operation. + */ + int pass; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 2) { + /* TODO: FP16 will be the size == 1 case */ + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = get_fpstatus_ptr(1); + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env); + for (pass = 0; pass < (a->q ? 4 : 2); pass++) { + TCGv_i32 tmp = neon_load_reg(a->vm, pass); + gen_helper_rints(tmp, tmp, fpst); + neon_store_reg(a->vd, pass, tmp); + } + gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_ptr(fpst); + + return true; +} + +#define DO_VRINT(INSN, RMODE) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_vrint(s, a, RMODE); \ + } + +DO_VRINT(VRINTN, FPROUNDING_TIEEVEN) +DO_VRINT(VRINTA, FPROUNDING_TIEAWAY) +DO_VRINT(VRINTZ, FPROUNDING_ZERO) +DO_VRINT(VRINTM, FPROUNDING_NEGINF) +DO_VRINT(VRINTP, FPROUNDING_POSINF) + +static bool do_vcvt(DisasContext *s, arg_2misc *a, int rmode, bool is_signed) +{ + /* + * Handle a VCVT* operation by iterating 32 bits at a time, + * with a specified rounding mode in operation. + */ + int pass; + TCGv_ptr fpst; + TCGv_i32 tcg_rmode, tcg_shift; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON) || + !arm_dc_feature(s, ARM_FEATURE_V8)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 2) { + /* TODO: FP16 will be the size == 1 case */ + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + fpst = get_fpstatus_ptr(1); + tcg_shift = tcg_const_i32(0); + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env); + for (pass = 0; pass < (a->q ? 4 : 2); pass++) { + TCGv_i32 tmp = neon_load_reg(a->vm, pass); + if (is_signed) { + gen_helper_vfp_tosls(tmp, tmp, tcg_shift, fpst); + } else { + gen_helper_vfp_touls(tmp, tmp, tcg_shift, fpst); + } + neon_store_reg(a->vd, pass, tmp); + } + gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_i32(tcg_shift); + tcg_temp_free_ptr(fpst); + + return true; +} + +#define DO_VCVT(INSN, RMODE, SIGNED) \ + static bool trans_##INSN(DisasContext *s, arg_2misc *a) \ + { \ + return do_vcvt(s, a, RMODE, SIGNED); \ + } + +DO_VCVT(VCVTAU, FPROUNDING_TIEAWAY, false) +DO_VCVT(VCVTAS, FPROUNDING_TIEAWAY, true) +DO_VCVT(VCVTNU, FPROUNDING_TIEEVEN, false) +DO_VCVT(VCVTNS, FPROUNDING_TIEEVEN, true) +DO_VCVT(VCVTPU, FPROUNDING_POSINF, false) +DO_VCVT(VCVTPS, FPROUNDING_POSINF, true) +DO_VCVT(VCVTMU, FPROUNDING_NEGINF, false) +DO_VCVT(VCVTMS, FPROUNDING_NEGINF, true) + +static bool trans_VSWP(DisasContext *s, arg_2misc *a) +{ + TCGv_i64 rm, rd; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. */ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if (a->size != 0) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + rm = tcg_temp_new_i64(); + rd = tcg_temp_new_i64(); + for (pass = 0; pass < (a->q ? 2 : 1); pass++) { + neon_load_reg64(rm, a->vm + pass); + neon_load_reg64(rd, a->vd + pass); + neon_store_reg64(rm, a->vd + pass); + neon_store_reg64(rd, a->vm + pass); + } + tcg_temp_free_i64(rm); + tcg_temp_free_i64(rd); + + return true; +} +static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 8); + tcg_gen_andi_i32(rd, rd, 0xff00ff00); + tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); + tcg_gen_or_i32(rd, rd, tmp); + + tcg_gen_shri_i32(t1, t1, 8); + tcg_gen_andi_i32(t1, t1, 0x00ff00ff); + tcg_gen_andi_i32(tmp, t0, 0xff00ff00); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1) +{ + TCGv_i32 rd, tmp; + + rd = tcg_temp_new_i32(); + tmp = tcg_temp_new_i32(); + + tcg_gen_shli_i32(rd, t0, 16); + tcg_gen_andi_i32(tmp, t1, 0xffff); + tcg_gen_or_i32(rd, rd, tmp); + tcg_gen_shri_i32(t1, t1, 16); + tcg_gen_andi_i32(tmp, t0, 0xffff0000); + tcg_gen_or_i32(t1, t1, tmp); + tcg_gen_mov_i32(t0, rd); + + tcg_temp_free_i32(tmp); + tcg_temp_free_i32(rd); +} + +static bool trans_VTRN(DisasContext *s, arg_2misc *a) +{ + TCGv_i32 tmp, tmp2; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { + return false; + } + + /* UNDEF accesses to D16-D31 if they don't exist. 
*/ + if (!dc_isar_feature(aa32_simd_r32, s) && + ((a->vd | a->vm) & 0x10)) { + return false; + } + + if ((a->vd | a->vm) & a->q) { + return false; + } + + if (a->size == 3) { + return false; + } + + if (!vfp_access_check(s)) { + return true; + } + + if (a->size == 2) { + for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) { + tmp = neon_load_reg(a->vm, pass); + tmp2 = neon_load_reg(a->vd, pass + 1); + neon_store_reg(a->vm, pass, tmp2); + neon_store_reg(a->vd, pass + 1, tmp); + } + } else { + for (pass = 0; pass < (a->q ? 4 : 2); pass++) { + tmp = neon_load_reg(a->vm, pass); + tmp2 = neon_load_reg(a->vd, pass); + if (a->size == 0) { + gen_neon_trn_u8(tmp, tmp2); + } else { + gen_neon_trn_u16(tmp, tmp2); + } + neon_store_reg(a->vm, pass, tmp2); + neon_store_reg(a->vd, pass, tmp); + } + } + return true; +} diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c index ac7b3119e5..f318ca265f 100644 --- a/target/arm/translate-sve.c +++ b/target/arm/translate-sve.c @@ -4342,71 +4342,76 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) int len_remain = len % 8; int nparts = len / 8 + ctpop8(len_remain); int midx = get_mem_index(s); - TCGv_i64 addr, t0, t1; + TCGv_i64 dirty_addr, clean_addr, t0, t1; - addr = tcg_temp_new_i64(); - t0 = tcg_temp_new_i64(); + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8); + tcg_temp_free_i64(dirty_addr); - /* Note that unpredicated load/store of vector/predicate registers + /* + * Note that unpredicated load/store of vector/predicate registers * are defined as a stream of bytes, which equates to little-endian - * operations on larger quantities. There is no nice way to force - * a little-endian load for aarch64_be-linux-user out of line. - * + * operations on larger quantities. * Attempt to keep code expansion to a minimum by limiting the * amount of unrolling done. */ if (nparts <= 4) { int i; + t0 = tcg_temp_new_i64(); for (i = 0; i < len_align; i += 8) { - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ); tcg_gen_st_i64(t0, cpu_env, vofs + i); + tcg_gen_addi_i64(clean_addr, cpu_reg_sp(s, rn), 8); } + tcg_temp_free_i64(t0); } else { TCGLabel *loop = gen_new_label(); TCGv_ptr tp, i = tcg_const_local_ptr(0); - gen_set_label(loop); + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = tcg_temp_local_new_i64(); + tcg_gen_mov_i64(clean_addr, t0); + tcg_temp_free_i64(t0); - /* Minimize the number of local temps that must be re-read from - * the stack each iteration. Instead, re-compute values other - * than the loop counter. - */ - tp = tcg_temp_new_ptr(); - tcg_gen_addi_ptr(tp, i, imm); - tcg_gen_extu_ptr_i64(addr, tp); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); + gen_set_label(loop); - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEQ); + t0 = tcg_temp_new_i64(); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tp = tcg_temp_new_ptr(); tcg_gen_add_ptr(tp, cpu_env, i); tcg_gen_addi_ptr(i, i, 8); tcg_gen_st_i64(t0, tp, vofs); tcg_temp_free_ptr(tp); + tcg_temp_free_i64(t0); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); tcg_temp_free_ptr(i); } - /* Predicate register loads can be any multiple of 2. + /* + * Predicate register loads can be any multiple of 2. 
* Note that we still store the entire 64-bit unit into cpu_env. */ if (len_remain) { - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); - + t0 = tcg_temp_new_i64(); switch (len_remain) { case 2: case 4: case 8: - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); break; case 6: t1 = tcg_temp_new_i64(); - tcg_gen_qemu_ld_i64(t0, addr, midx, MO_LEUL); - tcg_gen_addi_i64(addr, addr, 4); - tcg_gen_qemu_ld_i64(t1, addr, midx, MO_LEUW); + tcg_gen_qemu_ld_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); + tcg_gen_qemu_ld_i64(t1, clean_addr, midx, MO_LEUW); tcg_gen_deposit_i64(t0, t0, t1, 32, 32); tcg_temp_free_i64(t1); break; @@ -4415,9 +4420,9 @@ static void do_ldr(DisasContext *s, uint32_t vofs, int len, int rn, int imm) g_assert_not_reached(); } tcg_gen_st_i64(t0, cpu_env, vofs + len_align); + tcg_temp_free_i64(t0); } - tcg_temp_free_i64(addr); - tcg_temp_free_i64(t0); + tcg_temp_free_i64(clean_addr); } /* Similarly for stores. */ @@ -4427,10 +4432,12 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) int len_remain = len % 8; int nparts = len / 8 + ctpop8(len_remain); int midx = get_mem_index(s); - TCGv_i64 addr, t0; + TCGv_i64 dirty_addr, clean_addr, t0; - addr = tcg_temp_new_i64(); - t0 = tcg_temp_new_i64(); + dirty_addr = tcg_temp_new_i64(); + tcg_gen_addi_i64(dirty_addr, cpu_reg_sp(s, rn), imm); + clean_addr = gen_mte_checkN(s, dirty_addr, false, rn != 31, len, MO_8); + tcg_temp_free_i64(dirty_addr); /* Note that unpredicated load/store of vector/predicate registers * are defined as a stream of bytes, which equates to little-endian @@ -4443,33 +4450,35 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) if (nparts <= 4) { int i; + t0 = tcg_temp_new_i64(); for (i = 0; i < len_align; i += 8) { tcg_gen_ld_i64(t0, cpu_env, vofs + i); - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + i); - tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ); + tcg_gen_addi_i64(clean_addr, cpu_reg_sp(s, rn), 8); } + tcg_temp_free_i64(t0); } else { TCGLabel *loop = gen_new_label(); - TCGv_ptr t2, i = tcg_const_local_ptr(0); - - gen_set_label(loop); - - t2 = tcg_temp_new_ptr(); - tcg_gen_add_ptr(t2, cpu_env, i); - tcg_gen_ld_i64(t0, t2, vofs); + TCGv_ptr tp, i = tcg_const_local_ptr(0); - /* Minimize the number of local temps that must be re-read from - * the stack each iteration. Instead, re-compute values other - * than the loop counter. - */ - tcg_gen_addi_ptr(t2, i, imm); - tcg_gen_extu_ptr_i64(addr, t2); - tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, rn)); - tcg_temp_free_ptr(t2); + /* Copy the clean address into a local temp, live across the loop. */ + t0 = clean_addr; + clean_addr = tcg_temp_local_new_i64(); + tcg_gen_mov_i64(clean_addr, t0); + tcg_temp_free_i64(t0); - tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEQ); + gen_set_label(loop); + t0 = tcg_temp_new_i64(); + tp = tcg_temp_new_ptr(); + tcg_gen_add_ptr(tp, cpu_env, i); + tcg_gen_ld_i64(t0, tp, vofs); tcg_gen_addi_ptr(i, i, 8); + tcg_temp_free_ptr(tp); + + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEQ); + tcg_gen_addi_i64(clean_addr, clean_addr, 8); + tcg_temp_free_i64(t0); tcg_gen_brcondi_ptr(TCG_COND_LTU, i, len_align, loop); tcg_temp_free_ptr(i); @@ -4477,29 +4486,30 @@ static void do_str(DisasContext *s, uint32_t vofs, int len, int rn, int imm) /* Predicate register stores can be any multiple of 2. 
*/ if (len_remain) { + t0 = tcg_temp_new_i64(); tcg_gen_ld_i64(t0, cpu_env, vofs + len_align); - tcg_gen_addi_i64(addr, cpu_reg_sp(s, rn), imm + len_align); switch (len_remain) { case 2: case 4: case 8: - tcg_gen_qemu_st_i64(t0, addr, midx, MO_LE | ctz32(len_remain)); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, + MO_LE | ctz32(len_remain)); break; case 6: - tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUL); - tcg_gen_addi_i64(addr, addr, 4); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUL); + tcg_gen_addi_i64(clean_addr, clean_addr, 4); tcg_gen_shri_i64(t0, t0, 32); - tcg_gen_qemu_st_i64(t0, addr, midx, MO_LEUW); + tcg_gen_qemu_st_i64(t0, clean_addr, midx, MO_LEUW); break; default: g_assert_not_reached(); } + tcg_temp_free_i64(t0); } - tcg_temp_free_i64(addr); - tcg_temp_free_i64(t0); + tcg_temp_free_i64(clean_addr); } static bool trans_LDR_zri(DisasContext *s, arg_rri *a) @@ -4565,18 +4575,34 @@ static const uint8_t dtype_esz[16] = { }; static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, - int dtype, gen_helper_gvec_mem *fn) + int dtype, uint32_t mte_n, bool is_write, + gen_helper_gvec_mem *fn) { unsigned vsz = vec_full_reg_size(s); TCGv_ptr t_pg; TCGv_i32 t_desc; - int desc; + int desc = 0; - /* For e.g. LD4, there are not enough arguments to pass all 4 + /* + * For e.g. LD4, there are not enough arguments to pass all 4 * registers as pointers, so encode the regno into the data field. * For consistency, do this even for LD1. */ - desc = simd_desc(vsz, vsz, zt); + if (s->mte_active[0]) { + int msz = dtype_msz(dtype); + + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, ESIZE, 1 << msz); + desc = FIELD_DP32(desc, MTEDESC, TSIZE, mte_n << msz); + desc <<= SVE_MTEDESC_SHIFT; + } else { + addr = clean_data_tbi(s, addr); + } + + desc = simd_desc(vsz, vsz, zt | desc); t_desc = tcg_const_i32(desc); t_pg = tcg_temp_new_ptr(); @@ -4590,64 +4616,132 @@ static void do_mem_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, static void do_ld_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, int dtype, int nreg) { - static gen_helper_gvec_mem * const fns[2][16][4] = { - /* Little-endian */ - { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + static gen_helper_gvec_mem * const fns[2][2][16][4] = { + { /* mte inactive, little-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, - { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r, - gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r }, - { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r, - gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r }, - { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, - gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, - - 
/* Big-endian */ - { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, - gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, - { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r, - gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r }, - { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r, - gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r }, - { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL }, - - { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, - { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, - gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_le_r, gen_helper_sve_ld2hh_le_r, + gen_helper_sve_ld3hh_le_r, gen_helper_sve_ld4hh_le_r }, + { gen_helper_sve_ld1hsu_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r, gen_helper_sve_ld2ss_le_r, + gen_helper_sve_ld3ss_le_r, gen_helper_sve_ld4ss_le_r }, + { gen_helper_sve_ld1sdu_le_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r, gen_helper_sve_ld2dd_le_r, + gen_helper_sve_ld3dd_le_r, gen_helper_sve_ld4dd_le_r } }, + + /* mte inactive, big-endian */ + { { gen_helper_sve_ld1bb_r, gen_helper_sve_ld2bb_r, + gen_helper_sve_ld3bb_r, gen_helper_sve_ld4bb_r }, + { gen_helper_sve_ld1bhu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r, gen_helper_sve_ld2hh_be_r, + gen_helper_sve_ld3hh_be_r, gen_helper_sve_ld4hh_be_r }, + { gen_helper_sve_ld1hsu_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r, gen_helper_sve_ld2ss_be_r, + gen_helper_sve_ld3ss_be_r, gen_helper_sve_ld4ss_be_r }, + { gen_helper_sve_ld1sdu_be_r, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r, gen_helper_sve_ld2dd_be_r, + gen_helper_sve_ld3dd_be_r, gen_helper_sve_ld4dd_be_r } } }, + + { /* mte active, little-endian */ + { { gen_helper_sve_ld1bb_r_mte, + gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_le_r_mte, NULL, NULL, NULL }, + { 
gen_helper_sve_ld1hh_le_r_mte, + gen_helper_sve_ld2hh_le_r_mte, + gen_helper_sve_ld3hh_le_r_mte, + gen_helper_sve_ld4hh_le_r_mte }, + { gen_helper_sve_ld1hsu_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_le_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_le_r_mte, + gen_helper_sve_ld2ss_le_r_mte, + gen_helper_sve_ld3ss_le_r_mte, + gen_helper_sve_ld4ss_le_r_mte }, + { gen_helper_sve_ld1sdu_le_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_le_r_mte, + gen_helper_sve_ld2dd_le_r_mte, + gen_helper_sve_ld3dd_le_r_mte, + gen_helper_sve_ld4dd_le_r_mte } }, + + /* mte active, big-endian */ + { { gen_helper_sve_ld1bb_r_mte, + gen_helper_sve_ld2bb_r_mte, + gen_helper_sve_ld3bb_r_mte, + gen_helper_sve_ld4bb_r_mte }, + { gen_helper_sve_ld1bhu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bsu_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bdu_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1sds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hh_be_r_mte, + gen_helper_sve_ld2hh_be_r_mte, + gen_helper_sve_ld3hh_be_r_mte, + gen_helper_sve_ld4hh_be_r_mte }, + { gen_helper_sve_ld1hsu_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1hds_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1hss_be_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1ss_be_r_mte, + gen_helper_sve_ld2ss_be_r_mte, + gen_helper_sve_ld3ss_be_r_mte, + gen_helper_sve_ld4ss_be_r_mte }, + { gen_helper_sve_ld1sdu_be_r_mte, NULL, NULL, NULL }, + + { gen_helper_sve_ld1bds_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bss_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1bhs_r_mte, NULL, NULL, NULL }, + { gen_helper_sve_ld1dd_be_r_mte, + gen_helper_sve_ld2dd_be_r_mte, + gen_helper_sve_ld3dd_be_r_mte, + gen_helper_sve_ld4dd_be_r_mte } } }, }; - gen_helper_gvec_mem *fn = fns[s->be_data == MO_BE][dtype][nreg]; + gen_helper_gvec_mem *fn + = fns[s->mte_active[0]][s->be_data == MO_BE][dtype][nreg]; - /* While there are holes in the table, they are not + /* + * While there are holes in the table, they are not * accessible via the instruction encoding. 
*/ assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, dtype, fn); + do_mem_zpa(s, zt, pg, addr, dtype, nreg, false, fn); } static bool trans_LD_zprr(DisasContext *s, arg_rprr_load *a) @@ -4681,104 +4775,188 @@ static bool trans_LD_zpri(DisasContext *s, arg_rpri_load *a) static bool trans_LDFF1_zprr(DisasContext *s, arg_rprr_load *a) { - static gen_helper_gvec_mem * const fns[2][16] = { - /* Little-endian */ - { gen_helper_sve_ldff1bb_r, - gen_helper_sve_ldff1bhu_r, - gen_helper_sve_ldff1bsu_r, - gen_helper_sve_ldff1bdu_r, - - gen_helper_sve_ldff1sds_le_r, - gen_helper_sve_ldff1hh_le_r, - gen_helper_sve_ldff1hsu_le_r, - gen_helper_sve_ldff1hdu_le_r, - - gen_helper_sve_ldff1hds_le_r, - gen_helper_sve_ldff1hss_le_r, - gen_helper_sve_ldff1ss_le_r, - gen_helper_sve_ldff1sdu_le_r, - - gen_helper_sve_ldff1bds_r, - gen_helper_sve_ldff1bss_r, - gen_helper_sve_ldff1bhs_r, - gen_helper_sve_ldff1dd_le_r }, - - /* Big-endian */ - { gen_helper_sve_ldff1bb_r, - gen_helper_sve_ldff1bhu_r, - gen_helper_sve_ldff1bsu_r, - gen_helper_sve_ldff1bdu_r, - - gen_helper_sve_ldff1sds_be_r, - gen_helper_sve_ldff1hh_be_r, - gen_helper_sve_ldff1hsu_be_r, - gen_helper_sve_ldff1hdu_be_r, - - gen_helper_sve_ldff1hds_be_r, - gen_helper_sve_ldff1hss_be_r, - gen_helper_sve_ldff1ss_be_r, - gen_helper_sve_ldff1sdu_be_r, - - gen_helper_sve_ldff1bds_r, - gen_helper_sve_ldff1bss_r, - gen_helper_sve_ldff1bhs_r, - gen_helper_sve_ldff1dd_be_r }, + static gen_helper_gvec_mem * const fns[2][2][16] = { + { /* mte inactive, little-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_le_r, + gen_helper_sve_ldff1hh_le_r, + gen_helper_sve_ldff1hsu_le_r, + gen_helper_sve_ldff1hdu_le_r, + + gen_helper_sve_ldff1hds_le_r, + gen_helper_sve_ldff1hss_le_r, + gen_helper_sve_ldff1ss_le_r, + gen_helper_sve_ldff1sdu_le_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_le_r }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldff1bb_r, + gen_helper_sve_ldff1bhu_r, + gen_helper_sve_ldff1bsu_r, + gen_helper_sve_ldff1bdu_r, + + gen_helper_sve_ldff1sds_be_r, + gen_helper_sve_ldff1hh_be_r, + gen_helper_sve_ldff1hsu_be_r, + gen_helper_sve_ldff1hdu_be_r, + + gen_helper_sve_ldff1hds_be_r, + gen_helper_sve_ldff1hss_be_r, + gen_helper_sve_ldff1ss_be_r, + gen_helper_sve_ldff1sdu_be_r, + + gen_helper_sve_ldff1bds_r, + gen_helper_sve_ldff1bss_r, + gen_helper_sve_ldff1bhs_r, + gen_helper_sve_ldff1dd_be_r } }, + + { /* mte active, little-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_le_r_mte, + gen_helper_sve_ldff1hh_le_r_mte, + gen_helper_sve_ldff1hsu_le_r_mte, + gen_helper_sve_ldff1hdu_le_r_mte, + + gen_helper_sve_ldff1hds_le_r_mte, + gen_helper_sve_ldff1hss_le_r_mte, + gen_helper_sve_ldff1ss_le_r_mte, + gen_helper_sve_ldff1sdu_le_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_le_r_mte }, + + /* mte active, big-endian */ + { gen_helper_sve_ldff1bb_r_mte, + gen_helper_sve_ldff1bhu_r_mte, + gen_helper_sve_ldff1bsu_r_mte, + gen_helper_sve_ldff1bdu_r_mte, + + gen_helper_sve_ldff1sds_be_r_mte, + gen_helper_sve_ldff1hh_be_r_mte, + gen_helper_sve_ldff1hsu_be_r_mte, + gen_helper_sve_ldff1hdu_be_r_mte, + + gen_helper_sve_ldff1hds_be_r_mte, + gen_helper_sve_ldff1hss_be_r_mte, + 
gen_helper_sve_ldff1ss_be_r_mte, + gen_helper_sve_ldff1sdu_be_r_mte, + + gen_helper_sve_ldff1bds_r_mte, + gen_helper_sve_ldff1bss_r_mte, + gen_helper_sve_ldff1bhs_r_mte, + gen_helper_sve_ldff1dd_be_r_mte } }, }; if (sve_access_check(s)) { TCGv_i64 addr = new_tmp_a64(s); tcg_gen_shli_i64(addr, cpu_reg(s, a->rm), dtype_msz(a->dtype)); tcg_gen_add_i64(addr, addr, cpu_reg_sp(s, a->rn)); - do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, - fns[s->be_data == MO_BE][a->dtype]); + do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, + fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); } return true; } static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a) { - static gen_helper_gvec_mem * const fns[2][16] = { - /* Little-endian */ - { gen_helper_sve_ldnf1bb_r, - gen_helper_sve_ldnf1bhu_r, - gen_helper_sve_ldnf1bsu_r, - gen_helper_sve_ldnf1bdu_r, - - gen_helper_sve_ldnf1sds_le_r, - gen_helper_sve_ldnf1hh_le_r, - gen_helper_sve_ldnf1hsu_le_r, - gen_helper_sve_ldnf1hdu_le_r, - - gen_helper_sve_ldnf1hds_le_r, - gen_helper_sve_ldnf1hss_le_r, - gen_helper_sve_ldnf1ss_le_r, - gen_helper_sve_ldnf1sdu_le_r, - - gen_helper_sve_ldnf1bds_r, - gen_helper_sve_ldnf1bss_r, - gen_helper_sve_ldnf1bhs_r, - gen_helper_sve_ldnf1dd_le_r }, - - /* Big-endian */ - { gen_helper_sve_ldnf1bb_r, - gen_helper_sve_ldnf1bhu_r, - gen_helper_sve_ldnf1bsu_r, - gen_helper_sve_ldnf1bdu_r, - - gen_helper_sve_ldnf1sds_be_r, - gen_helper_sve_ldnf1hh_be_r, - gen_helper_sve_ldnf1hsu_be_r, - gen_helper_sve_ldnf1hdu_be_r, - - gen_helper_sve_ldnf1hds_be_r, - gen_helper_sve_ldnf1hss_be_r, - gen_helper_sve_ldnf1ss_be_r, - gen_helper_sve_ldnf1sdu_be_r, - - gen_helper_sve_ldnf1bds_r, - gen_helper_sve_ldnf1bss_r, - gen_helper_sve_ldnf1bhs_r, - gen_helper_sve_ldnf1dd_be_r }, + static gen_helper_gvec_mem * const fns[2][2][16] = { + { /* mte inactive, little-endian */ + { gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_le_r, + gen_helper_sve_ldnf1hh_le_r, + gen_helper_sve_ldnf1hsu_le_r, + gen_helper_sve_ldnf1hdu_le_r, + + gen_helper_sve_ldnf1hds_le_r, + gen_helper_sve_ldnf1hss_le_r, + gen_helper_sve_ldnf1ss_le_r, + gen_helper_sve_ldnf1sdu_le_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_le_r }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldnf1bb_r, + gen_helper_sve_ldnf1bhu_r, + gen_helper_sve_ldnf1bsu_r, + gen_helper_sve_ldnf1bdu_r, + + gen_helper_sve_ldnf1sds_be_r, + gen_helper_sve_ldnf1hh_be_r, + gen_helper_sve_ldnf1hsu_be_r, + gen_helper_sve_ldnf1hdu_be_r, + + gen_helper_sve_ldnf1hds_be_r, + gen_helper_sve_ldnf1hss_be_r, + gen_helper_sve_ldnf1ss_be_r, + gen_helper_sve_ldnf1sdu_be_r, + + gen_helper_sve_ldnf1bds_r, + gen_helper_sve_ldnf1bss_r, + gen_helper_sve_ldnf1bhs_r, + gen_helper_sve_ldnf1dd_be_r } }, + + { /* mte inactive, little-endian */ + { gen_helper_sve_ldnf1bb_r_mte, + gen_helper_sve_ldnf1bhu_r_mte, + gen_helper_sve_ldnf1bsu_r_mte, + gen_helper_sve_ldnf1bdu_r_mte, + + gen_helper_sve_ldnf1sds_le_r_mte, + gen_helper_sve_ldnf1hh_le_r_mte, + gen_helper_sve_ldnf1hsu_le_r_mte, + gen_helper_sve_ldnf1hdu_le_r_mte, + + gen_helper_sve_ldnf1hds_le_r_mte, + gen_helper_sve_ldnf1hss_le_r_mte, + gen_helper_sve_ldnf1ss_le_r_mte, + gen_helper_sve_ldnf1sdu_le_r_mte, + + gen_helper_sve_ldnf1bds_r_mte, + gen_helper_sve_ldnf1bss_r_mte, + gen_helper_sve_ldnf1bhs_r_mte, + gen_helper_sve_ldnf1dd_le_r_mte }, + + /* mte inactive, big-endian */ + { gen_helper_sve_ldnf1bb_r_mte, + 
gen_helper_sve_ldnf1bhu_r_mte, + gen_helper_sve_ldnf1bsu_r_mte, + gen_helper_sve_ldnf1bdu_r_mte, + + gen_helper_sve_ldnf1sds_be_r_mte, + gen_helper_sve_ldnf1hh_be_r_mte, + gen_helper_sve_ldnf1hsu_be_r_mte, + gen_helper_sve_ldnf1hdu_be_r_mte, + + gen_helper_sve_ldnf1hds_be_r_mte, + gen_helper_sve_ldnf1hss_be_r_mte, + gen_helper_sve_ldnf1ss_be_r_mte, + gen_helper_sve_ldnf1sdu_be_r_mte, + + gen_helper_sve_ldnf1bds_r_mte, + gen_helper_sve_ldnf1bss_r_mte, + gen_helper_sve_ldnf1bhs_r_mte, + gen_helper_sve_ldnf1dd_be_r_mte } }, }; if (sve_access_check(s)) { @@ -4788,8 +4966,8 @@ static bool trans_LDNF1_zpri(DisasContext *s, arg_rpri_load *a) TCGv_i64 addr = new_tmp_a64(s); tcg_gen_addi_i64(addr, cpu_reg_sp(s, a->rn), off); - do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, - fns[s->be_data == MO_BE][a->dtype]); + do_mem_zpa(s, a->rd, a->pg, addr, a->dtype, 1, false, + fns[s->mte_active[0]][s->be_data == MO_BE][a->dtype]); } return true; } @@ -4873,16 +5051,18 @@ static bool trans_LD1RQ_zpri(DisasContext *s, arg_rpri_load *a) /* Load and broadcast element. */ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) { - if (!sve_access_check(s)) { - return true; - } - unsigned vsz = vec_full_reg_size(s); unsigned psz = pred_full_reg_size(s); unsigned esz = dtype_esz[a->dtype]; unsigned msz = dtype_msz(a->dtype); - TCGLabel *over = gen_new_label(); - TCGv_i64 temp; + TCGLabel *over; + TCGv_i64 temp, clean_addr; + + if (!sve_access_check(s)) { + return true; + } + + over = gen_new_label(); /* If the guarding predicate has no bits set, no load occurs. */ if (psz <= 8) { @@ -4905,7 +5085,9 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) /* Load the data. */ temp = tcg_temp_new_i64(); tcg_gen_addi_i64(temp, cpu_reg_sp(s, a->rn), a->imm << msz); - tcg_gen_qemu_ld_i64(temp, temp, get_mem_index(s), + clean_addr = gen_mte_check1(s, temp, false, true, msz); + + tcg_gen_qemu_ld_i64(temp, clean_addr, get_mem_index(s), s->be_data | dtype_mop[a->dtype]); /* Broadcast to *all* elements. 
*/ @@ -4922,73 +5104,125 @@ static bool trans_LD1R_zpri(DisasContext *s, arg_rpri_load *a) static void do_st_zpa(DisasContext *s, int zt, int pg, TCGv_i64 addr, int msz, int esz, int nreg) { - static gen_helper_gvec_mem * const fn_single[2][4][4] = { - { { gen_helper_sve_st1bb_r, - gen_helper_sve_st1bh_r, - gen_helper_sve_st1bs_r, - gen_helper_sve_st1bd_r }, - { NULL, - gen_helper_sve_st1hh_le_r, - gen_helper_sve_st1hs_le_r, - gen_helper_sve_st1hd_le_r }, - { NULL, NULL, - gen_helper_sve_st1ss_le_r, - gen_helper_sve_st1sd_le_r }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_le_r } }, - { { gen_helper_sve_st1bb_r, - gen_helper_sve_st1bh_r, - gen_helper_sve_st1bs_r, - gen_helper_sve_st1bd_r }, - { NULL, - gen_helper_sve_st1hh_be_r, - gen_helper_sve_st1hs_be_r, - gen_helper_sve_st1hd_be_r }, - { NULL, NULL, - gen_helper_sve_st1ss_be_r, - gen_helper_sve_st1sd_be_r }, - { NULL, NULL, NULL, - gen_helper_sve_st1dd_be_r } }, + static gen_helper_gvec_mem * const fn_single[2][2][4][4] = { + { { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_le_r, + gen_helper_sve_st1hs_le_r, + gen_helper_sve_st1hd_le_r }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r, + gen_helper_sve_st1sd_le_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r } }, + { { gen_helper_sve_st1bb_r, + gen_helper_sve_st1bh_r, + gen_helper_sve_st1bs_r, + gen_helper_sve_st1bd_r }, + { NULL, + gen_helper_sve_st1hh_be_r, + gen_helper_sve_st1hs_be_r, + gen_helper_sve_st1hd_be_r }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r, + gen_helper_sve_st1sd_be_r }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r } } }, + + { { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_le_r_mte, + gen_helper_sve_st1hs_le_r_mte, + gen_helper_sve_st1hd_le_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_le_r_mte, + gen_helper_sve_st1sd_le_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_le_r_mte } }, + { { gen_helper_sve_st1bb_r_mte, + gen_helper_sve_st1bh_r_mte, + gen_helper_sve_st1bs_r_mte, + gen_helper_sve_st1bd_r_mte }, + { NULL, + gen_helper_sve_st1hh_be_r_mte, + gen_helper_sve_st1hs_be_r_mte, + gen_helper_sve_st1hd_be_r_mte }, + { NULL, NULL, + gen_helper_sve_st1ss_be_r_mte, + gen_helper_sve_st1sd_be_r_mte }, + { NULL, NULL, NULL, + gen_helper_sve_st1dd_be_r_mte } } }, }; - static gen_helper_gvec_mem * const fn_multiple[2][3][4] = { - { { gen_helper_sve_st2bb_r, - gen_helper_sve_st2hh_le_r, - gen_helper_sve_st2ss_le_r, - gen_helper_sve_st2dd_le_r }, - { gen_helper_sve_st3bb_r, - gen_helper_sve_st3hh_le_r, - gen_helper_sve_st3ss_le_r, - gen_helper_sve_st3dd_le_r }, - { gen_helper_sve_st4bb_r, - gen_helper_sve_st4hh_le_r, - gen_helper_sve_st4ss_le_r, - gen_helper_sve_st4dd_le_r } }, - { { gen_helper_sve_st2bb_r, - gen_helper_sve_st2hh_be_r, - gen_helper_sve_st2ss_be_r, - gen_helper_sve_st2dd_be_r }, - { gen_helper_sve_st3bb_r, - gen_helper_sve_st3hh_be_r, - gen_helper_sve_st3ss_be_r, - gen_helper_sve_st3dd_be_r }, - { gen_helper_sve_st4bb_r, - gen_helper_sve_st4hh_be_r, - gen_helper_sve_st4ss_be_r, - gen_helper_sve_st4dd_be_r } }, + static gen_helper_gvec_mem * const fn_multiple[2][2][3][4] = { + { { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_le_r, + gen_helper_sve_st2ss_le_r, + gen_helper_sve_st2dd_le_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_le_r, + gen_helper_sve_st3ss_le_r, + gen_helper_sve_st3dd_le_r }, + { gen_helper_sve_st4bb_r, + 
gen_helper_sve_st4hh_le_r, + gen_helper_sve_st4ss_le_r, + gen_helper_sve_st4dd_le_r } }, + { { gen_helper_sve_st2bb_r, + gen_helper_sve_st2hh_be_r, + gen_helper_sve_st2ss_be_r, + gen_helper_sve_st2dd_be_r }, + { gen_helper_sve_st3bb_r, + gen_helper_sve_st3hh_be_r, + gen_helper_sve_st3ss_be_r, + gen_helper_sve_st3dd_be_r }, + { gen_helper_sve_st4bb_r, + gen_helper_sve_st4hh_be_r, + gen_helper_sve_st4ss_be_r, + gen_helper_sve_st4dd_be_r } } }, + { { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_le_r_mte, + gen_helper_sve_st2ss_le_r_mte, + gen_helper_sve_st2dd_le_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_le_r_mte, + gen_helper_sve_st3ss_le_r_mte, + gen_helper_sve_st3dd_le_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_le_r_mte, + gen_helper_sve_st4ss_le_r_mte, + gen_helper_sve_st4dd_le_r_mte } }, + { { gen_helper_sve_st2bb_r_mte, + gen_helper_sve_st2hh_be_r_mte, + gen_helper_sve_st2ss_be_r_mte, + gen_helper_sve_st2dd_be_r_mte }, + { gen_helper_sve_st3bb_r_mte, + gen_helper_sve_st3hh_be_r_mte, + gen_helper_sve_st3ss_be_r_mte, + gen_helper_sve_st3dd_be_r_mte }, + { gen_helper_sve_st4bb_r_mte, + gen_helper_sve_st4hh_be_r_mte, + gen_helper_sve_st4ss_be_r_mte, + gen_helper_sve_st4dd_be_r_mte } } }, }; gen_helper_gvec_mem *fn; int be = s->be_data == MO_BE; if (nreg == 0) { /* ST1 */ - fn = fn_single[be][msz][esz]; + fn = fn_single[s->mte_active[0]][be][msz][esz]; + nreg = 1; } else { /* ST2, ST3, ST4 -- msz == esz, enforced by encoding */ assert(msz == esz); - fn = fn_multiple[be][nreg - 1][msz]; + fn = fn_multiple[s->mte_active[0]][be][nreg - 1][msz]; } assert(fn != NULL); - do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), fn); + do_mem_zpa(s, zt, pg, addr, msz_dtype(s, msz), nreg, true, fn); } static bool trans_ST_zprr(DisasContext *s, arg_rprr_store *a) @@ -5027,7 +5261,7 @@ static bool trans_ST_zpri(DisasContext *s, arg_rpri_store *a) */ static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, - int scale, TCGv_i64 scalar, int msz, + int scale, TCGv_i64 scalar, int msz, bool is_write, gen_helper_gvec_mem_scatter *fn) { unsigned vsz = vec_full_reg_size(s); @@ -5035,8 +5269,16 @@ static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, TCGv_ptr t_pg = tcg_temp_new_ptr(); TCGv_ptr t_zt = tcg_temp_new_ptr(); TCGv_i32 t_desc; - int desc; + int desc = 0; + if (s->mte_active[0]) { + desc = FIELD_DP32(desc, MTEDESC, MIDX, get_mem_index(s)); + desc = FIELD_DP32(desc, MTEDESC, TBI, s->tbid); + desc = FIELD_DP32(desc, MTEDESC, TCMA, s->tcma); + desc = FIELD_DP32(desc, MTEDESC, WRITE, is_write); + desc = FIELD_DP32(desc, MTEDESC, ESIZE, 1 << msz); + desc <<= SVE_MTEDESC_SHIFT; + } desc = simd_desc(vsz, vsz, scale); t_desc = tcg_const_i32(desc); @@ -5051,176 +5293,339 @@ static void do_mem_zpz(DisasContext *s, int zt, int pg, int zm, tcg_temp_free_i32(t_desc); } -/* Indexed by [be][ff][xs][u][msz]. 
*/ -static gen_helper_gvec_mem_scatter * const gather_load_fn32[2][2][2][2][3] = { - /* Little-endian */ - { { { { gen_helper_sve_ldbss_zsu, - gen_helper_sve_ldhss_le_zsu, - NULL, }, - { gen_helper_sve_ldbsu_zsu, - gen_helper_sve_ldhsu_le_zsu, - gen_helper_sve_ldss_le_zsu, } }, - { { gen_helper_sve_ldbss_zss, - gen_helper_sve_ldhss_le_zss, - NULL, }, - { gen_helper_sve_ldbsu_zss, - gen_helper_sve_ldhsu_le_zss, - gen_helper_sve_ldss_le_zss, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu, - gen_helper_sve_ldffhss_le_zsu, - NULL, }, - { gen_helper_sve_ldffbsu_zsu, - gen_helper_sve_ldffhsu_le_zsu, - gen_helper_sve_ldffss_le_zsu, } }, - { { gen_helper_sve_ldffbss_zss, - gen_helper_sve_ldffhss_le_zss, - NULL, }, - { gen_helper_sve_ldffbsu_zss, - gen_helper_sve_ldffhsu_le_zss, - gen_helper_sve_ldffss_le_zss, } } } }, - - /* Big-endian */ - { { { { gen_helper_sve_ldbss_zsu, - gen_helper_sve_ldhss_be_zsu, - NULL, }, - { gen_helper_sve_ldbsu_zsu, - gen_helper_sve_ldhsu_be_zsu, - gen_helper_sve_ldss_be_zsu, } }, - { { gen_helper_sve_ldbss_zss, - gen_helper_sve_ldhss_be_zss, - NULL, }, - { gen_helper_sve_ldbsu_zss, - gen_helper_sve_ldhsu_be_zss, - gen_helper_sve_ldss_be_zss, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbss_zsu, - gen_helper_sve_ldffhss_be_zsu, - NULL, }, - { gen_helper_sve_ldffbsu_zsu, - gen_helper_sve_ldffhsu_be_zsu, - gen_helper_sve_ldffss_be_zsu, } }, - { { gen_helper_sve_ldffbss_zss, - gen_helper_sve_ldffhss_be_zss, - NULL, }, - { gen_helper_sve_ldffbsu_zss, - gen_helper_sve_ldffhsu_be_zss, - gen_helper_sve_ldffss_be_zss, } } } }, +/* Indexed by [mte][be][ff][xs][u][msz]. */ +static gen_helper_gvec_mem_scatter * const +gather_load_fn32[2][2][2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_le_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_le_zsu, + gen_helper_sve_ldss_le_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_le_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_le_zss, + gen_helper_sve_ldss_le_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_le_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_le_zsu, + gen_helper_sve_ldffss_le_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_le_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_le_zss, + gen_helper_sve_ldffss_le_zss, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu, + gen_helper_sve_ldhss_be_zsu, + NULL, }, + { gen_helper_sve_ldbsu_zsu, + gen_helper_sve_ldhsu_be_zsu, + gen_helper_sve_ldss_be_zsu, } }, + { { gen_helper_sve_ldbss_zss, + gen_helper_sve_ldhss_be_zss, + NULL, }, + { gen_helper_sve_ldbsu_zss, + gen_helper_sve_ldhsu_be_zss, + gen_helper_sve_ldss_be_zss, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu, + gen_helper_sve_ldffhss_be_zsu, + NULL, }, + { gen_helper_sve_ldffbsu_zsu, + gen_helper_sve_ldffhsu_be_zsu, + gen_helper_sve_ldffss_be_zsu, } }, + { { gen_helper_sve_ldffbss_zss, + gen_helper_sve_ldffhss_be_zss, + NULL, }, + { gen_helper_sve_ldffbsu_zss, + gen_helper_sve_ldffhsu_be_zss, + gen_helper_sve_ldffss_be_zss, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_le_zsu_mte, + gen_helper_sve_ldss_le_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + 
gen_helper_sve_ldhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_le_zss_mte, + gen_helper_sve_ldss_le_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_le_zsu_mte, + gen_helper_sve_ldffss_le_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_le_zss_mte, + gen_helper_sve_ldffss_le_zss_mte, } } } }, + + { /* Big-endian */ + { { { gen_helper_sve_ldbss_zsu_mte, + gen_helper_sve_ldhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbsu_zsu_mte, + gen_helper_sve_ldhsu_be_zsu_mte, + gen_helper_sve_ldss_be_zsu_mte, } }, + { { gen_helper_sve_ldbss_zss_mte, + gen_helper_sve_ldhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbsu_zss_mte, + gen_helper_sve_ldhsu_be_zss_mte, + gen_helper_sve_ldss_be_zss_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbss_zsu_mte, + gen_helper_sve_ldffhss_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zsu_mte, + gen_helper_sve_ldffhsu_be_zsu_mte, + gen_helper_sve_ldffss_be_zsu_mte, } }, + { { gen_helper_sve_ldffbss_zss_mte, + gen_helper_sve_ldffhss_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbsu_zss_mte, + gen_helper_sve_ldffhsu_be_zss_mte, + gen_helper_sve_ldffss_be_zss_mte, } } } } }, }; /* Note that we overload xs=2 to indicate 64-bit offset. */ -static gen_helper_gvec_mem_scatter * const gather_load_fn64[2][2][3][2][4] = { - /* Little-endian */ - { { { { gen_helper_sve_ldbds_zsu, - gen_helper_sve_ldhds_le_zsu, - gen_helper_sve_ldsds_le_zsu, - NULL, }, - { gen_helper_sve_ldbdu_zsu, - gen_helper_sve_ldhdu_le_zsu, - gen_helper_sve_ldsdu_le_zsu, - gen_helper_sve_lddd_le_zsu, } }, - { { gen_helper_sve_ldbds_zss, - gen_helper_sve_ldhds_le_zss, - gen_helper_sve_ldsds_le_zss, - NULL, }, - { gen_helper_sve_ldbdu_zss, - gen_helper_sve_ldhdu_le_zss, - gen_helper_sve_ldsdu_le_zss, - gen_helper_sve_lddd_le_zss, } }, - { { gen_helper_sve_ldbds_zd, - gen_helper_sve_ldhds_le_zd, - gen_helper_sve_ldsds_le_zd, - NULL, }, - { gen_helper_sve_ldbdu_zd, - gen_helper_sve_ldhdu_le_zd, - gen_helper_sve_ldsdu_le_zd, - gen_helper_sve_lddd_le_zd, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu, - gen_helper_sve_ldffhds_le_zsu, - gen_helper_sve_ldffsds_le_zsu, - NULL, }, - { gen_helper_sve_ldffbdu_zsu, - gen_helper_sve_ldffhdu_le_zsu, - gen_helper_sve_ldffsdu_le_zsu, - gen_helper_sve_ldffdd_le_zsu, } }, - { { gen_helper_sve_ldffbds_zss, - gen_helper_sve_ldffhds_le_zss, - gen_helper_sve_ldffsds_le_zss, - NULL, }, - { gen_helper_sve_ldffbdu_zss, - gen_helper_sve_ldffhdu_le_zss, - gen_helper_sve_ldffsdu_le_zss, - gen_helper_sve_ldffdd_le_zss, } }, - { { gen_helper_sve_ldffbds_zd, - gen_helper_sve_ldffhds_le_zd, - gen_helper_sve_ldffsds_le_zd, - NULL, }, - { gen_helper_sve_ldffbdu_zd, - gen_helper_sve_ldffhdu_le_zd, - gen_helper_sve_ldffsdu_le_zd, - gen_helper_sve_ldffdd_le_zd, } } } }, - - /* Big-endian */ - { { { { gen_helper_sve_ldbds_zsu, - gen_helper_sve_ldhds_be_zsu, - gen_helper_sve_ldsds_be_zsu, - NULL, }, - { gen_helper_sve_ldbdu_zsu, - gen_helper_sve_ldhdu_be_zsu, - gen_helper_sve_ldsdu_be_zsu, - gen_helper_sve_lddd_be_zsu, } }, - { { gen_helper_sve_ldbds_zss, - gen_helper_sve_ldhds_be_zss, - gen_helper_sve_ldsds_be_zss, - NULL, }, - { gen_helper_sve_ldbdu_zss, - gen_helper_sve_ldhdu_be_zss, - gen_helper_sve_ldsdu_be_zss, - gen_helper_sve_lddd_be_zss, } }, - { { 
gen_helper_sve_ldbds_zd, - gen_helper_sve_ldhds_be_zd, - gen_helper_sve_ldsds_be_zd, - NULL, }, - { gen_helper_sve_ldbdu_zd, - gen_helper_sve_ldhdu_be_zd, - gen_helper_sve_ldsdu_be_zd, - gen_helper_sve_lddd_be_zd, } } }, - - /* First-fault */ - { { { gen_helper_sve_ldffbds_zsu, - gen_helper_sve_ldffhds_be_zsu, - gen_helper_sve_ldffsds_be_zsu, - NULL, }, - { gen_helper_sve_ldffbdu_zsu, - gen_helper_sve_ldffhdu_be_zsu, - gen_helper_sve_ldffsdu_be_zsu, - gen_helper_sve_ldffdd_be_zsu, } }, - { { gen_helper_sve_ldffbds_zss, - gen_helper_sve_ldffhds_be_zss, - gen_helper_sve_ldffsds_be_zss, - NULL, }, - { gen_helper_sve_ldffbdu_zss, - gen_helper_sve_ldffhdu_be_zss, - gen_helper_sve_ldffsdu_be_zss, - gen_helper_sve_ldffdd_be_zss, } }, - { { gen_helper_sve_ldffbds_zd, - gen_helper_sve_ldffhds_be_zd, - gen_helper_sve_ldffsds_be_zd, - NULL, }, - { gen_helper_sve_ldffbdu_zd, - gen_helper_sve_ldffhdu_be_zd, - gen_helper_sve_ldffsdu_be_zd, - gen_helper_sve_ldffdd_be_zd, } } } }, +static gen_helper_gvec_mem_scatter * const +gather_load_fn64[2][2][2][3][2][4] = { + { /* MTE Inactive */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_le_zsu, + gen_helper_sve_ldsds_le_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_le_zsu, + gen_helper_sve_ldsdu_le_zsu, + gen_helper_sve_lddd_le_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_le_zss, + gen_helper_sve_ldsds_le_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_le_zss, + gen_helper_sve_ldsdu_le_zss, + gen_helper_sve_lddd_le_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_le_zd, + gen_helper_sve_ldsds_le_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_le_zd, + gen_helper_sve_ldsdu_le_zd, + gen_helper_sve_lddd_le_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_le_zsu, + gen_helper_sve_ldffsds_le_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_le_zsu, + gen_helper_sve_ldffsdu_le_zsu, + gen_helper_sve_ldffdd_le_zsu, } }, + { { gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_le_zss, + gen_helper_sve_ldffsds_le_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_le_zss, + gen_helper_sve_ldffsdu_le_zss, + gen_helper_sve_ldffdd_le_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_le_zd, + gen_helper_sve_ldffsds_le_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_le_zd, + gen_helper_sve_ldffsdu_le_zd, + gen_helper_sve_ldffdd_le_zd, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu, + gen_helper_sve_ldhds_be_zsu, + gen_helper_sve_ldsds_be_zsu, + NULL, }, + { gen_helper_sve_ldbdu_zsu, + gen_helper_sve_ldhdu_be_zsu, + gen_helper_sve_ldsdu_be_zsu, + gen_helper_sve_lddd_be_zsu, } }, + { { gen_helper_sve_ldbds_zss, + gen_helper_sve_ldhds_be_zss, + gen_helper_sve_ldsds_be_zss, + NULL, }, + { gen_helper_sve_ldbdu_zss, + gen_helper_sve_ldhdu_be_zss, + gen_helper_sve_ldsdu_be_zss, + gen_helper_sve_lddd_be_zss, } }, + { { gen_helper_sve_ldbds_zd, + gen_helper_sve_ldhds_be_zd, + gen_helper_sve_ldsds_be_zd, + NULL, }, + { gen_helper_sve_ldbdu_zd, + gen_helper_sve_ldhdu_be_zd, + gen_helper_sve_ldsdu_be_zd, + gen_helper_sve_lddd_be_zd, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu, + gen_helper_sve_ldffhds_be_zsu, + gen_helper_sve_ldffsds_be_zsu, + NULL, }, + { gen_helper_sve_ldffbdu_zsu, + gen_helper_sve_ldffhdu_be_zsu, + gen_helper_sve_ldffsdu_be_zsu, + gen_helper_sve_ldffdd_be_zsu, } }, + { { 
gen_helper_sve_ldffbds_zss, + gen_helper_sve_ldffhds_be_zss, + gen_helper_sve_ldffsds_be_zss, + NULL, }, + { gen_helper_sve_ldffbdu_zss, + gen_helper_sve_ldffhdu_be_zss, + gen_helper_sve_ldffsdu_be_zss, + gen_helper_sve_ldffdd_be_zss, } }, + { { gen_helper_sve_ldffbds_zd, + gen_helper_sve_ldffhds_be_zd, + gen_helper_sve_ldffsds_be_zd, + NULL, }, + { gen_helper_sve_ldffbdu_zd, + gen_helper_sve_ldffhdu_be_zd, + gen_helper_sve_ldffsdu_be_zd, + gen_helper_sve_ldffdd_be_zd, } } } } }, + { /* MTE Active */ + { /* Little-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_le_zsu_mte, + gen_helper_sve_ldsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_le_zsu_mte, + gen_helper_sve_ldsdu_le_zsu_mte, + gen_helper_sve_lddd_le_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_le_zss_mte, + gen_helper_sve_ldsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_le_zss_mte, + gen_helper_sve_ldsdu_le_zss_mte, + gen_helper_sve_lddd_le_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_le_zd_mte, + gen_helper_sve_ldsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_le_zd_mte, + gen_helper_sve_ldsdu_le_zd_mte, + gen_helper_sve_lddd_le_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_le_zsu_mte, + gen_helper_sve_ldffsds_le_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_le_zsu_mte, + gen_helper_sve_ldffsdu_le_zsu_mte, + gen_helper_sve_ldffdd_le_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_le_zss_mte, + gen_helper_sve_ldffsds_le_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_le_zss_mte, + gen_helper_sve_ldffsdu_le_zss_mte, + gen_helper_sve_ldffdd_le_zss_mte, } }, + { { gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_le_zd_mte, + gen_helper_sve_ldffsds_le_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_le_zd_mte, + gen_helper_sve_ldffsdu_le_zd_mte, + gen_helper_sve_ldffdd_le_zd_mte, } } } }, + { /* Big-endian */ + { { { gen_helper_sve_ldbds_zsu_mte, + gen_helper_sve_ldhds_be_zsu_mte, + gen_helper_sve_ldsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldbdu_zsu_mte, + gen_helper_sve_ldhdu_be_zsu_mte, + gen_helper_sve_ldsdu_be_zsu_mte, + gen_helper_sve_lddd_be_zsu_mte, } }, + { { gen_helper_sve_ldbds_zss_mte, + gen_helper_sve_ldhds_be_zss_mte, + gen_helper_sve_ldsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldbdu_zss_mte, + gen_helper_sve_ldhdu_be_zss_mte, + gen_helper_sve_ldsdu_be_zss_mte, + gen_helper_sve_lddd_be_zss_mte, } }, + { { gen_helper_sve_ldbds_zd_mte, + gen_helper_sve_ldhds_be_zd_mte, + gen_helper_sve_ldsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldbdu_zd_mte, + gen_helper_sve_ldhdu_be_zd_mte, + gen_helper_sve_ldsdu_be_zd_mte, + gen_helper_sve_lddd_be_zd_mte, } } }, + + /* First-fault */ + { { { gen_helper_sve_ldffbds_zsu_mte, + gen_helper_sve_ldffhds_be_zsu_mte, + gen_helper_sve_ldffsds_be_zsu_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zsu_mte, + gen_helper_sve_ldffhdu_be_zsu_mte, + gen_helper_sve_ldffsdu_be_zsu_mte, + gen_helper_sve_ldffdd_be_zsu_mte, } }, + { { gen_helper_sve_ldffbds_zss_mte, + gen_helper_sve_ldffhds_be_zss_mte, + gen_helper_sve_ldffsds_be_zss_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zss_mte, + gen_helper_sve_ldffhdu_be_zss_mte, + gen_helper_sve_ldffsdu_be_zss_mte, + gen_helper_sve_ldffdd_be_zss_mte, } }, + { { 
gen_helper_sve_ldffbds_zd_mte, + gen_helper_sve_ldffhds_be_zd_mte, + gen_helper_sve_ldffsds_be_zd_mte, + NULL, }, + { gen_helper_sve_ldffbdu_zd_mte, + gen_helper_sve_ldffhdu_be_zd_mte, + gen_helper_sve_ldffsdu_be_zd_mte, + gen_helper_sve_ldffdd_be_zd_mte, } } } } }, }; static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) { gen_helper_gvec_mem_scatter *fn = NULL; - int be = s->be_data == MO_BE; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; if (!sve_access_check(s)) { return true; @@ -5228,23 +5633,24 @@ static bool trans_LD1_zprz(DisasContext *s, arg_LD1_zprz *a) switch (a->esz) { case MO_32: - fn = gather_load_fn32[be][a->ff][a->xs][a->u][a->msz]; + fn = gather_load_fn32[mte][be][a->ff][a->xs][a->u][a->msz]; break; case MO_64: - fn = gather_load_fn64[be][a->ff][a->xs][a->u][a->msz]; + fn = gather_load_fn64[mte][be][a->ff][a->xs][a->u][a->msz]; break; } assert(fn != NULL); do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, - cpu_reg_sp(s, a->rn), a->msz, fn); + cpu_reg_sp(s, a->rn), a->msz, false, fn); return true; } static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) { gen_helper_gvec_mem_scatter *fn = NULL; - int be = s->be_data == MO_BE; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; TCGv_i64 imm; if (a->esz < a->msz || (a->esz == a->msz && !a->u)) { @@ -5256,10 +5662,10 @@ static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) switch (a->esz) { case MO_32: - fn = gather_load_fn32[be][a->ff][0][a->u][a->msz]; + fn = gather_load_fn32[mte][be][a->ff][0][a->u][a->msz]; break; case MO_64: - fn = gather_load_fn64[be][a->ff][2][a->u][a->msz]; + fn = gather_load_fn64[mte][be][a->ff][2][a->u][a->msz]; break; } assert(fn != NULL); @@ -5268,63 +5674,108 @@ static bool trans_LD1_zpiz(DisasContext *s, arg_LD1_zpiz *a) * by loading the immediate into the scalar parameter. */ imm = tcg_const_i64(a->imm << a->msz); - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, a->msz, fn); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, a->msz, false, fn); tcg_temp_free_i64(imm); return true; } -/* Indexed by [be][xs][msz]. */ -static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][3] = { - /* Little-endian */ - { { gen_helper_sve_stbs_zsu, - gen_helper_sve_sths_le_zsu, - gen_helper_sve_stss_le_zsu, }, - { gen_helper_sve_stbs_zss, - gen_helper_sve_sths_le_zss, - gen_helper_sve_stss_le_zss, } }, - /* Big-endian */ - { { gen_helper_sve_stbs_zsu, - gen_helper_sve_sths_be_zsu, - gen_helper_sve_stss_be_zsu, }, - { gen_helper_sve_stbs_zss, - gen_helper_sve_sths_be_zss, - gen_helper_sve_stss_be_zss, } }, +/* Indexed by [mte][be][xs][msz]. 
*/ +static gen_helper_gvec_mem_scatter * const scatter_store_fn32[2][2][2][3] = { + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_le_zsu, + gen_helper_sve_stss_le_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_le_zss, + gen_helper_sve_stss_le_zss, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu, + gen_helper_sve_sths_be_zsu, + gen_helper_sve_stss_be_zsu, }, + { gen_helper_sve_stbs_zss, + gen_helper_sve_sths_be_zss, + gen_helper_sve_stss_be_zss, } } }, + { /* MTE Active */ + { /* Little-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_le_zsu_mte, + gen_helper_sve_stss_le_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_le_zss_mte, + gen_helper_sve_stss_le_zss_mte, } }, + { /* Big-endian */ + { gen_helper_sve_stbs_zsu_mte, + gen_helper_sve_sths_be_zsu_mte, + gen_helper_sve_stss_be_zsu_mte, }, + { gen_helper_sve_stbs_zss_mte, + gen_helper_sve_sths_be_zss_mte, + gen_helper_sve_stss_be_zss_mte, } } }, }; /* Note that we overload xs=2 to indicate 64-bit offset. */ -static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][3][4] = { - /* Little-endian */ - { { gen_helper_sve_stbd_zsu, - gen_helper_sve_sthd_le_zsu, - gen_helper_sve_stsd_le_zsu, - gen_helper_sve_stdd_le_zsu, }, - { gen_helper_sve_stbd_zss, - gen_helper_sve_sthd_le_zss, - gen_helper_sve_stsd_le_zss, - gen_helper_sve_stdd_le_zss, }, - { gen_helper_sve_stbd_zd, - gen_helper_sve_sthd_le_zd, - gen_helper_sve_stsd_le_zd, - gen_helper_sve_stdd_le_zd, } }, - /* Big-endian */ - { { gen_helper_sve_stbd_zsu, - gen_helper_sve_sthd_be_zsu, - gen_helper_sve_stsd_be_zsu, - gen_helper_sve_stdd_be_zsu, }, - { gen_helper_sve_stbd_zss, - gen_helper_sve_sthd_be_zss, - gen_helper_sve_stsd_be_zss, - gen_helper_sve_stdd_be_zss, }, - { gen_helper_sve_stbd_zd, - gen_helper_sve_sthd_be_zd, - gen_helper_sve_stsd_be_zd, - gen_helper_sve_stdd_be_zd, } }, +static gen_helper_gvec_mem_scatter * const scatter_store_fn64[2][2][3][4] = { + { /* MTE Inactive */ + { /* Little-endian */ + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_le_zsu, + gen_helper_sve_stsd_le_zsu, + gen_helper_sve_stdd_le_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_le_zss, + gen_helper_sve_stsd_le_zss, + gen_helper_sve_stdd_le_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_le_zd, + gen_helper_sve_stsd_le_zd, + gen_helper_sve_stdd_le_zd, } }, + { /* Big-endian */ + { gen_helper_sve_stbd_zsu, + gen_helper_sve_sthd_be_zsu, + gen_helper_sve_stsd_be_zsu, + gen_helper_sve_stdd_be_zsu, }, + { gen_helper_sve_stbd_zss, + gen_helper_sve_sthd_be_zss, + gen_helper_sve_stsd_be_zss, + gen_helper_sve_stdd_be_zss, }, + { gen_helper_sve_stbd_zd, + gen_helper_sve_sthd_be_zd, + gen_helper_sve_stsd_be_zd, + gen_helper_sve_stdd_be_zd, } } }, + { /* MTE Active */ + { /* Little-endian */ + { gen_helper_sve_stbd_zsu_mte, + gen_helper_sve_sthd_le_zsu_mte, + gen_helper_sve_stsd_le_zsu_mte, + gen_helper_sve_stdd_le_zsu_mte, }, + { gen_helper_sve_stbd_zss_mte, + gen_helper_sve_sthd_le_zss_mte, + gen_helper_sve_stsd_le_zss_mte, + gen_helper_sve_stdd_le_zss_mte, }, + { gen_helper_sve_stbd_zd_mte, + gen_helper_sve_sthd_le_zd_mte, + gen_helper_sve_stsd_le_zd_mte, + gen_helper_sve_stdd_le_zd_mte, } }, + { /* Big-endian */ + { gen_helper_sve_stbd_zsu_mte, + gen_helper_sve_sthd_be_zsu_mte, + gen_helper_sve_stsd_be_zsu_mte, + gen_helper_sve_stdd_be_zsu_mte, }, + { gen_helper_sve_stbd_zss_mte, + gen_helper_sve_sthd_be_zss_mte, + gen_helper_sve_stsd_be_zss_mte, + gen_helper_sve_stdd_be_zss_mte,
}, + { gen_helper_sve_stbd_zd_mte, + gen_helper_sve_sthd_be_zd_mte, + gen_helper_sve_stsd_be_zd_mte, + gen_helper_sve_stdd_be_zd_mte, } } }, }; static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) { gen_helper_gvec_mem_scatter *fn; - int be = s->be_data == MO_BE; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; if (a->esz < a->msz || (a->msz == 0 && a->scale)) { return false; @@ -5334,23 +5785,24 @@ static bool trans_ST1_zprz(DisasContext *s, arg_ST1_zprz *a) } switch (a->esz) { case MO_32: - fn = scatter_store_fn32[be][a->xs][a->msz]; + fn = scatter_store_fn32[mte][be][a->xs][a->msz]; break; case MO_64: - fn = scatter_store_fn64[be][a->xs][a->msz]; + fn = scatter_store_fn64[mte][be][a->xs][a->msz]; break; default: g_assert_not_reached(); } do_mem_zpz(s, a->rd, a->pg, a->rm, a->scale * a->msz, - cpu_reg_sp(s, a->rn), a->msz, fn); + cpu_reg_sp(s, a->rn), a->msz, true, fn); return true; } static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) { gen_helper_gvec_mem_scatter *fn = NULL; - int be = s->be_data == MO_BE; + bool be = s->be_data == MO_BE; + bool mte = s->mte_active[0]; TCGv_i64 imm; if (a->esz < a->msz) { @@ -5362,10 +5814,10 @@ static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) switch (a->esz) { case MO_32: - fn = scatter_store_fn32[be][0][a->msz]; + fn = scatter_store_fn32[mte][be][0][a->msz]; break; case MO_64: - fn = scatter_store_fn64[be][2][a->msz]; + fn = scatter_store_fn64[mte][be][2][a->msz]; break; } assert(fn != NULL); @@ -5374,7 +5826,7 @@ static bool trans_ST1_zpiz(DisasContext *s, arg_ST1_zpiz *a) * by loading the immediate into the scalar parameter. */ imm = tcg_const_i64(a->imm << a->msz); - do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, a->msz, fn); + do_mem_zpz(s, a->rd, a->pg, a->rn, 0, imm, a->msz, true, fn); tcg_temp_free_i64(imm); return true; } diff --git a/target/arm/translate-vfp.inc.c b/target/arm/translate-vfp.inc.c index e1a9017598..afa8a5f888 100644 --- a/target/arm/translate-vfp.inc.c +++ b/target/arm/translate-vfp.inc.c @@ -119,15 +119,14 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled) if (s->v7m_lspact) { /* * Lazy state saving affects external memory and also the NVIC, - * so we must mark it as an IO operation for icount. + * so we must mark it as an IO operation for icount (and cause + * this to be the last insn in the TB). */ if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { + s->base.is_jmp = DISAS_UPDATE_EXIT; gen_io_start(); } gen_helper_v7m_preserve_fp_state(cpu_env); - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - gen_io_end(); - } /* * If the preserve_fp_state helper doesn't throw an exception * then it will clear LSPACT; we don't need to repeat this for @@ -2861,6 +2860,6 @@ static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a) tcg_temp_free_i32(fptr); /* End the TB, because we have updated FP control bits */ - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; return true; } diff --git a/target/arm/translate.c b/target/arm/translate.c index 6d18892ade..c39a929b93 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -378,9 +378,9 @@ static void gen_revsh(TCGv_i32 dest, TCGv_i32 var) } /* Swap low and high halfwords. */ -static void gen_swap_half(TCGv_i32 var) +static void gen_swap_half(TCGv_i32 dest, TCGv_i32 var) { - tcg_gen_rotri_i32(var, var, 16); + tcg_gen_rotri_i32(dest, var, 16); } /* Dual 16-bit add. Result placed in t0 and t1 is marked as dead. 
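The full_vfp_access_check() hunk above shows the new convention for helpers that touch external state while icount is in use: instead of bracketing the call with gen_io_start()/gen_io_end(), the translator starts IO accounting and forces the TB to end after the insn. A sketch of that pattern; gen_helper_do_io_thing() is a hypothetical helper invented for the example, the rest is named as in this file:

static void gen_call_io_helper_example(DisasContext *s)
{
    if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) {
        /* Make this the last insn in the TB so the main loop can
         * re-evaluate the icount deadline right after it. */
        s->base.is_jmp = DISAS_UPDATE_EXIT;
        /* Account the helper's IO access; no gen_io_end() is needed. */
        gen_io_start();
    }
    gen_helper_do_io_thing(cpu_env);    /* hypothetical IO-touching helper */
}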
@@ -1133,25 +1133,6 @@ neon_reg_offset (int reg, int n) return vfp_reg_offset(0, sreg); } -/* Return the offset of a 2**SIZE piece of a NEON register, at index ELE, - * where 0 is the least significant end of the register. - */ -static inline long -neon_element_offset(int reg, int element, MemOp size) -{ - int element_size = 1 << size; - int ofs = element * element_size; -#ifdef HOST_WORDS_BIGENDIAN - /* Calculate the offset assuming fully little-endian, - * then XOR to account for the order of the 8-byte units. - */ - if (element_size < 8) { - ofs ^= 8 - element_size; - } -#endif - return neon_reg_offset(reg, 0) + ofs; -} - static TCGv_i32 neon_load_reg(int reg, int pass) { TCGv_i32 tmp = tcg_temp_new_i32(); @@ -1159,94 +1140,12 @@ static TCGv_i32 neon_load_reg(int reg, int pass) return tmp; } -static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop) -{ - long offset = neon_element_offset(reg, ele, mop & MO_SIZE); - - switch (mop) { - case MO_UB: - tcg_gen_ld8u_i32(var, cpu_env, offset); - break; - case MO_UW: - tcg_gen_ld16u_i32(var, cpu_env, offset); - break; - case MO_UL: - tcg_gen_ld_i32(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop) -{ - long offset = neon_element_offset(reg, ele, mop & MO_SIZE); - - switch (mop) { - case MO_UB: - tcg_gen_ld8u_i64(var, cpu_env, offset); - break; - case MO_UW: - tcg_gen_ld16u_i64(var, cpu_env, offset); - break; - case MO_UL: - tcg_gen_ld32u_i64(var, cpu_env, offset); - break; - case MO_Q: - tcg_gen_ld_i64(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - static void neon_store_reg(int reg, int pass, TCGv_i32 var) { tcg_gen_st_i32(var, cpu_env, neon_reg_offset(reg, pass)); tcg_temp_free_i32(var); } -static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var) -{ - long offset = neon_element_offset(reg, ele, size); - - switch (size) { - case MO_8: - tcg_gen_st8_i32(var, cpu_env, offset); - break; - case MO_16: - tcg_gen_st16_i32(var, cpu_env, offset); - break; - case MO_32: - tcg_gen_st_i32(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - -static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var) -{ - long offset = neon_element_offset(reg, ele, size); - - switch (size) { - case MO_8: - tcg_gen_st8_i64(var, cpu_env, offset); - break; - case MO_16: - tcg_gen_st16_i64(var, cpu_env, offset); - break; - case MO_32: - tcg_gen_st32_i64(var, cpu_env, offset); - break; - case MO_64: - tcg_gen_st_i64(var, cpu_env, offset); - break; - default: - g_assert_not_reached(); - } -} - static inline void neon_load_reg64(TCGv_i64 var, int reg) { tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(1, reg)); @@ -2876,7 +2775,7 @@ static void gen_msr_banked(DisasContext *s, int r, int sysm, int rn) tcg_temp_free_i32(tcg_tgtmode); tcg_temp_free_i32(tcg_regno); tcg_temp_free_i32(tcg_reg); - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn) @@ -2898,7 +2797,7 @@ static void gen_mrs_banked(DisasContext *s, int r, int sysm, int rn) tcg_temp_free_i32(tcg_tgtmode); tcg_temp_free_i32(tcg_regno); store_reg(s, rn, tcg_reg); - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } /* Store value to PC as for an exception return (ie don't @@ -2934,377 +2833,6 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc) gen_rfe(s, pc, load_cpu_field(spsr)); } -#define CPU_V001 
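The neon_element_offset() function removed above rests on one small trick worth spelling out: the offset is computed as if the whole register were little-endian and is then XORed with (8 - element_size) on big-endian hosts, because each 8-byte unit of a Neon register is kept in host byte order. A standalone illustration with a couple of spot checks; the function name and the host_big_endian flag are invented here for the example:

#include <assert.h>

static long element_offset(int element, int element_size, int host_big_endian)
{
    long ofs = element * element_size;          /* little-endian layout */
    if (host_big_endian && element_size < 8) {
        ofs ^= 8 - element_size;                /* reorder within the 8-byte unit */
    }
    return ofs;
}

int main(void)
{
    /* 16-bit elements of one 64-bit unit: on a LE host h0 is at byte 0 ... */
    assert(element_offset(0, 2, 0) == 0);
    assert(element_offset(3, 2, 0) == 6);
    /* ... while a BE host keeps h0 in the last two bytes of the unit. */
    assert(element_offset(0, 2, 1) == 6);
    assert(element_offset(3, 2, 1) == 0);
    return 0;
}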
cpu_V0, cpu_V0, cpu_V1 - -static int gen_neon_unzip(int rd, int rm, int size, int q) -{ - TCGv_ptr pd, pm; - - if (!q && size == 2) { - return 1; - } - pd = vfp_reg_ptr(true, rd); - pm = vfp_reg_ptr(true, rm); - if (q) { - switch (size) { - case 0: - gen_helper_neon_qunzip8(pd, pm); - break; - case 1: - gen_helper_neon_qunzip16(pd, pm); - break; - case 2: - gen_helper_neon_qunzip32(pd, pm); - break; - default: - abort(); - } - } else { - switch (size) { - case 0: - gen_helper_neon_unzip8(pd, pm); - break; - case 1: - gen_helper_neon_unzip16(pd, pm); - break; - default: - abort(); - } - } - tcg_temp_free_ptr(pd); - tcg_temp_free_ptr(pm); - return 0; -} - -static int gen_neon_zip(int rd, int rm, int size, int q) -{ - TCGv_ptr pd, pm; - - if (!q && size == 2) { - return 1; - } - pd = vfp_reg_ptr(true, rd); - pm = vfp_reg_ptr(true, rm); - if (q) { - switch (size) { - case 0: - gen_helper_neon_qzip8(pd, pm); - break; - case 1: - gen_helper_neon_qzip16(pd, pm); - break; - case 2: - gen_helper_neon_qzip32(pd, pm); - break; - default: - abort(); - } - } else { - switch (size) { - case 0: - gen_helper_neon_zip8(pd, pm); - break; - case 1: - gen_helper_neon_zip16(pd, pm); - break; - default: - abort(); - } - } - tcg_temp_free_ptr(pd); - tcg_temp_free_ptr(pm); - return 0; -} - -static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 rd, tmp; - - rd = tcg_temp_new_i32(); - tmp = tcg_temp_new_i32(); - - tcg_gen_shli_i32(rd, t0, 8); - tcg_gen_andi_i32(rd, rd, 0xff00ff00); - tcg_gen_andi_i32(tmp, t1, 0x00ff00ff); - tcg_gen_or_i32(rd, rd, tmp); - - tcg_gen_shri_i32(t1, t1, 8); - tcg_gen_andi_i32(t1, t1, 0x00ff00ff); - tcg_gen_andi_i32(tmp, t0, 0xff00ff00); - tcg_gen_or_i32(t1, t1, tmp); - tcg_gen_mov_i32(t0, rd); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(rd); -} - -static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1) -{ - TCGv_i32 rd, tmp; - - rd = tcg_temp_new_i32(); - tmp = tcg_temp_new_i32(); - - tcg_gen_shli_i32(rd, t0, 16); - tcg_gen_andi_i32(tmp, t1, 0xffff); - tcg_gen_or_i32(rd, rd, tmp); - tcg_gen_shri_i32(t1, t1, 16); - tcg_gen_andi_i32(tmp, t0, 0xffff0000); - tcg_gen_or_i32(t1, t1, tmp); - tcg_gen_mov_i32(t0, rd); - - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(rd); -} - -static inline void gen_neon_narrow(int size, TCGv_i32 dest, TCGv_i64 src) -{ - switch (size) { - case 0: gen_helper_neon_narrow_u8(dest, src); break; - case 1: gen_helper_neon_narrow_u16(dest, src); break; - case 2: tcg_gen_extrl_i64_i32(dest, src); break; - default: abort(); - } -} - -static inline void gen_neon_narrow_sats(int size, TCGv_i32 dest, TCGv_i64 src) -{ - switch (size) { - case 0: gen_helper_neon_narrow_sat_s8(dest, cpu_env, src); break; - case 1: gen_helper_neon_narrow_sat_s16(dest, cpu_env, src); break; - case 2: gen_helper_neon_narrow_sat_s32(dest, cpu_env, src); break; - default: abort(); - } -} - -static inline void gen_neon_narrow_satu(int size, TCGv_i32 dest, TCGv_i64 src) -{ - switch (size) { - case 0: gen_helper_neon_narrow_sat_u8(dest, cpu_env, src); break; - case 1: gen_helper_neon_narrow_sat_u16(dest, cpu_env, src); break; - case 2: gen_helper_neon_narrow_sat_u32(dest, cpu_env, src); break; - default: abort(); - } -} - -static inline void gen_neon_unarrow_sats(int size, TCGv_i32 dest, TCGv_i64 src) -{ - switch (size) { - case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break; - case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break; - case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break; - default: abort(); - } -} - -static inline void 
gen_neon_widen(TCGv_i64 dest, TCGv_i32 src, int size, int u) -{ - if (u) { - switch (size) { - case 0: gen_helper_neon_widen_u8(dest, src); break; - case 1: gen_helper_neon_widen_u16(dest, src); break; - case 2: tcg_gen_extu_i32_i64(dest, src); break; - default: abort(); - } - } else { - switch (size) { - case 0: gen_helper_neon_widen_s8(dest, src); break; - case 1: gen_helper_neon_widen_s16(dest, src); break; - case 2: tcg_gen_ext_i32_i64(dest, src); break; - default: abort(); - } - } - tcg_temp_free_i32(src); -} - -static inline void gen_neon_addl(int size) -{ - switch (size) { - case 0: gen_helper_neon_addl_u16(CPU_V001); break; - case 1: gen_helper_neon_addl_u32(CPU_V001); break; - case 2: tcg_gen_add_i64(CPU_V001); break; - default: abort(); - } -} - -static void gen_neon_narrow_op(int op, int u, int size, - TCGv_i32 dest, TCGv_i64 src) -{ - if (op) { - if (u) { - gen_neon_unarrow_sats(size, dest, src); - } else { - gen_neon_narrow(size, dest, src); - } - } else { - if (u) { - gen_neon_narrow_satu(size, dest, src); - } else { - gen_neon_narrow_sats(size, dest, src); - } - } -} - -/* Symbolic constants for op fields for Neon 2-register miscellaneous. - * The values correspond to bits [17:16,10:7]; see the ARM ARM DDI0406B - * table A7-13. - */ -#define NEON_2RM_VREV64 0 -#define NEON_2RM_VREV32 1 -#define NEON_2RM_VREV16 2 -#define NEON_2RM_VPADDL 4 -#define NEON_2RM_VPADDL_U 5 -#define NEON_2RM_AESE 6 /* Includes AESD */ -#define NEON_2RM_AESMC 7 /* Includes AESIMC */ -#define NEON_2RM_VCLS 8 -#define NEON_2RM_VCLZ 9 -#define NEON_2RM_VCNT 10 -#define NEON_2RM_VMVN 11 -#define NEON_2RM_VPADAL 12 -#define NEON_2RM_VPADAL_U 13 -#define NEON_2RM_VQABS 14 -#define NEON_2RM_VQNEG 15 -#define NEON_2RM_VCGT0 16 -#define NEON_2RM_VCGE0 17 -#define NEON_2RM_VCEQ0 18 -#define NEON_2RM_VCLE0 19 -#define NEON_2RM_VCLT0 20 -#define NEON_2RM_SHA1H 21 -#define NEON_2RM_VABS 22 -#define NEON_2RM_VNEG 23 -#define NEON_2RM_VCGT0_F 24 -#define NEON_2RM_VCGE0_F 25 -#define NEON_2RM_VCEQ0_F 26 -#define NEON_2RM_VCLE0_F 27 -#define NEON_2RM_VCLT0_F 28 -#define NEON_2RM_VABS_F 30 -#define NEON_2RM_VNEG_F 31 -#define NEON_2RM_VSWP 32 -#define NEON_2RM_VTRN 33 -#define NEON_2RM_VUZP 34 -#define NEON_2RM_VZIP 35 -#define NEON_2RM_VMOVN 36 /* Includes VQMOVN, VQMOVUN */ -#define NEON_2RM_VQMOVN 37 /* Includes VQMOVUN */ -#define NEON_2RM_VSHLL 38 -#define NEON_2RM_SHA1SU1 39 /* Includes SHA256SU0 */ -#define NEON_2RM_VRINTN 40 -#define NEON_2RM_VRINTX 41 -#define NEON_2RM_VRINTA 42 -#define NEON_2RM_VRINTZ 43 -#define NEON_2RM_VCVT_F16_F32 44 -#define NEON_2RM_VRINTM 45 -#define NEON_2RM_VCVT_F32_F16 46 -#define NEON_2RM_VRINTP 47 -#define NEON_2RM_VCVTAU 48 -#define NEON_2RM_VCVTAS 49 -#define NEON_2RM_VCVTNU 50 -#define NEON_2RM_VCVTNS 51 -#define NEON_2RM_VCVTPU 52 -#define NEON_2RM_VCVTPS 53 -#define NEON_2RM_VCVTMU 54 -#define NEON_2RM_VCVTMS 55 -#define NEON_2RM_VRECPE 56 -#define NEON_2RM_VRSQRTE 57 -#define NEON_2RM_VRECPE_F 58 -#define NEON_2RM_VRSQRTE_F 59 -#define NEON_2RM_VCVT_FS 60 -#define NEON_2RM_VCVT_FU 61 -#define NEON_2RM_VCVT_SF 62 -#define NEON_2RM_VCVT_UF 63 - -static bool neon_2rm_is_v8_op(int op) -{ - /* Return true if this neon 2reg-misc op is ARMv8 and up */ - switch (op) { - case NEON_2RM_VRINTN: - case NEON_2RM_VRINTA: - case NEON_2RM_VRINTM: - case NEON_2RM_VRINTP: - case NEON_2RM_VRINTZ: - case NEON_2RM_VRINTX: - case NEON_2RM_VCVTAU: - case NEON_2RM_VCVTAS: - case NEON_2RM_VCVTNU: - case NEON_2RM_VCVTNS: - case NEON_2RM_VCVTPU: - case NEON_2RM_VCVTPS: - case NEON_2RM_VCVTMU: - 
case NEON_2RM_VCVTMS: - return true; - default: - return false; - } -} - -/* Each entry in this array has bit n set if the insn allows - * size value n (otherwise it will UNDEF). Since unallocated - * op values will have no bits set they always UNDEF. - */ -static const uint8_t neon_2rm_sizes[] = { - [NEON_2RM_VREV64] = 0x7, - [NEON_2RM_VREV32] = 0x3, - [NEON_2RM_VREV16] = 0x1, - [NEON_2RM_VPADDL] = 0x7, - [NEON_2RM_VPADDL_U] = 0x7, - [NEON_2RM_AESE] = 0x1, - [NEON_2RM_AESMC] = 0x1, - [NEON_2RM_VCLS] = 0x7, - [NEON_2RM_VCLZ] = 0x7, - [NEON_2RM_VCNT] = 0x1, - [NEON_2RM_VMVN] = 0x1, - [NEON_2RM_VPADAL] = 0x7, - [NEON_2RM_VPADAL_U] = 0x7, - [NEON_2RM_VQABS] = 0x7, - [NEON_2RM_VQNEG] = 0x7, - [NEON_2RM_VCGT0] = 0x7, - [NEON_2RM_VCGE0] = 0x7, - [NEON_2RM_VCEQ0] = 0x7, - [NEON_2RM_VCLE0] = 0x7, - [NEON_2RM_VCLT0] = 0x7, - [NEON_2RM_SHA1H] = 0x4, - [NEON_2RM_VABS] = 0x7, - [NEON_2RM_VNEG] = 0x7, - [NEON_2RM_VCGT0_F] = 0x4, - [NEON_2RM_VCGE0_F] = 0x4, - [NEON_2RM_VCEQ0_F] = 0x4, - [NEON_2RM_VCLE0_F] = 0x4, - [NEON_2RM_VCLT0_F] = 0x4, - [NEON_2RM_VABS_F] = 0x4, - [NEON_2RM_VNEG_F] = 0x4, - [NEON_2RM_VSWP] = 0x1, - [NEON_2RM_VTRN] = 0x7, - [NEON_2RM_VUZP] = 0x7, - [NEON_2RM_VZIP] = 0x7, - [NEON_2RM_VMOVN] = 0x7, - [NEON_2RM_VQMOVN] = 0x7, - [NEON_2RM_VSHLL] = 0x7, - [NEON_2RM_SHA1SU1] = 0x4, - [NEON_2RM_VRINTN] = 0x4, - [NEON_2RM_VRINTX] = 0x4, - [NEON_2RM_VRINTA] = 0x4, - [NEON_2RM_VRINTZ] = 0x4, - [NEON_2RM_VCVT_F16_F32] = 0x2, - [NEON_2RM_VRINTM] = 0x4, - [NEON_2RM_VCVT_F32_F16] = 0x2, - [NEON_2RM_VRINTP] = 0x4, - [NEON_2RM_VCVTAU] = 0x4, - [NEON_2RM_VCVTAS] = 0x4, - [NEON_2RM_VCVTNU] = 0x4, - [NEON_2RM_VCVTNS] = 0x4, - [NEON_2RM_VCVTPU] = 0x4, - [NEON_2RM_VCVTPS] = 0x4, - [NEON_2RM_VCVTMU] = 0x4, - [NEON_2RM_VCVTMS] = 0x4, - [NEON_2RM_VRECPE] = 0x4, - [NEON_2RM_VRSQRTE] = 0x4, - [NEON_2RM_VRECPE_F] = 0x4, - [NEON_2RM_VRSQRTE_F] = 0x4, - [NEON_2RM_VCVT_FS] = 0x4, - [NEON_2RM_VCVT_FU] = 0x4, - [NEON_2RM_VCVT_SF] = 0x4, - [NEON_2RM_VCVT_UF] = 0x4, -}; - static void gen_gvec_fn3_qc(uint32_t rd_ofs, uint32_t rn_ofs, uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz, gen_helper_gvec_3_ptr *fn) @@ -5016,573 +4544,6 @@ void gen_gvec_uaba(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]); } -/* Translate a NEON data processing instruction. Return nonzero if the - instruction is invalid. - We process data in a mixture of 32-bit and 64-bit chunks. - Mostly we use 32-bit chunks so we can use normal scalar instructions. */ - -static int disas_neon_data_insn(DisasContext *s, uint32_t insn) -{ - int op; - int q; - int rd, rm, rd_ofs, rm_ofs; - int size; - int pass; - int u; - int vec_size; - TCGv_i32 tmp, tmp2, tmp3; - - if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { - return 1; - } - - /* FIXME: this access check should not take precedence over UNDEF - * for invalid encodings; we will generate incorrect syndrome information - * for attempts to execute invalid vfp/neon encodings with FP disabled. - */ - if (s->fp_excp_el) { - gen_exception_insn(s, s->pc_curr, EXCP_UDEF, - syn_simd_access_trap(1, 0xe, false), s->fp_excp_el); - return 0; - } - - if (!s->vfp_enabled) - return 1; - q = (insn & (1 << 6)) != 0; - u = (insn >> 24) & 1; - VFP_DREG_D(rd, insn); - VFP_DREG_M(rm, insn); - size = (insn >> 20) & 3; - vec_size = q ? 
16 : 8; - rd_ofs = neon_reg_offset(rd, 0); - rm_ofs = neon_reg_offset(rm, 0); - - if ((insn & (1 << 23)) == 0) { - /* Three register same length: handled by decodetree */ - return 1; - } else if (insn & (1 << 4)) { - /* Two registers and shift or reg and imm: handled by decodetree */ - return 1; - } else { /* (insn & 0x00800010 == 0x00800000) */ - if (size != 3) { - /* - * Three registers of different lengths, or two registers and - * a scalar: handled by decodetree - */ - return 1; - } else { /* size == 3 */ - if (!u) { - /* Extract: handled by decodetree */ - return 1; - } else if ((insn & (1 << 11)) == 0) { - /* Two register misc. */ - op = ((insn >> 12) & 0x30) | ((insn >> 7) & 0xf); - size = (insn >> 18) & 3; - /* UNDEF for unknown op values and bad op-size combinations */ - if ((neon_2rm_sizes[op] & (1 << size)) == 0) { - return 1; - } - if (neon_2rm_is_v8_op(op) && - !arm_dc_feature(s, ARM_FEATURE_V8)) { - return 1; - } - if ((op != NEON_2RM_VMOVN && op != NEON_2RM_VQMOVN) && - q && ((rm | rd) & 1)) { - return 1; - } - switch (op) { - case NEON_2RM_VREV64: - for (pass = 0; pass < (q ? 2 : 1); pass++) { - tmp = neon_load_reg(rm, pass * 2); - tmp2 = neon_load_reg(rm, pass * 2 + 1); - switch (size) { - case 0: tcg_gen_bswap32_i32(tmp, tmp); break; - case 1: gen_swap_half(tmp); break; - case 2: /* no-op */ break; - default: abort(); - } - neon_store_reg(rd, pass * 2 + 1, tmp); - if (size == 2) { - neon_store_reg(rd, pass * 2, tmp2); - } else { - switch (size) { - case 0: tcg_gen_bswap32_i32(tmp2, tmp2); break; - case 1: gen_swap_half(tmp2); break; - default: abort(); - } - neon_store_reg(rd, pass * 2, tmp2); - } - } - break; - case NEON_2RM_VPADDL: case NEON_2RM_VPADDL_U: - case NEON_2RM_VPADAL: case NEON_2RM_VPADAL_U: - for (pass = 0; pass < q + 1; pass++) { - tmp = neon_load_reg(rm, pass * 2); - gen_neon_widen(cpu_V0, tmp, size, op & 1); - tmp = neon_load_reg(rm, pass * 2 + 1); - gen_neon_widen(cpu_V1, tmp, size, op & 1); - switch (size) { - case 0: gen_helper_neon_paddl_u16(CPU_V001); break; - case 1: gen_helper_neon_paddl_u32(CPU_V001); break; - case 2: tcg_gen_add_i64(CPU_V001); break; - default: abort(); - } - if (op >= NEON_2RM_VPADAL) { - /* Accumulate. */ - neon_load_reg64(cpu_V1, rd + pass); - gen_neon_addl(size); - } - neon_store_reg64(cpu_V0, rd + pass); - } - break; - case NEON_2RM_VTRN: - if (size == 2) { - int n; - for (n = 0; n < (q ? 
4 : 2); n += 2) { - tmp = neon_load_reg(rm, n); - tmp2 = neon_load_reg(rd, n + 1); - neon_store_reg(rm, n, tmp2); - neon_store_reg(rd, n + 1, tmp); - } - } else { - goto elementwise; - } - break; - case NEON_2RM_VUZP: - if (gen_neon_unzip(rd, rm, size, q)) { - return 1; - } - break; - case NEON_2RM_VZIP: - if (gen_neon_zip(rd, rm, size, q)) { - return 1; - } - break; - case NEON_2RM_VMOVN: case NEON_2RM_VQMOVN: - /* also VQMOVUN; op field and mnemonics don't line up */ - if (rm & 1) { - return 1; - } - tmp2 = NULL; - for (pass = 0; pass < 2; pass++) { - neon_load_reg64(cpu_V0, rm + pass); - tmp = tcg_temp_new_i32(); - gen_neon_narrow_op(op == NEON_2RM_VMOVN, q, size, - tmp, cpu_V0); - if (pass == 0) { - tmp2 = tmp; - } else { - neon_store_reg(rd, 0, tmp2); - neon_store_reg(rd, 1, tmp); - } - } - break; - case NEON_2RM_VSHLL: - if (q || (rd & 1)) { - return 1; - } - tmp = neon_load_reg(rm, 0); - tmp2 = neon_load_reg(rm, 1); - for (pass = 0; pass < 2; pass++) { - if (pass == 1) - tmp = tmp2; - gen_neon_widen(cpu_V0, tmp, size, 1); - tcg_gen_shli_i64(cpu_V0, cpu_V0, 8 << size); - neon_store_reg64(cpu_V0, rd + pass); - } - break; - case NEON_2RM_VCVT_F16_F32: - { - TCGv_ptr fpst; - TCGv_i32 ahp; - - if (!dc_isar_feature(aa32_fp16_spconv, s) || - q || (rm & 1)) { - return 1; - } - fpst = get_fpstatus_ptr(true); - ahp = get_ahp_flag(); - tmp = neon_load_reg(rm, 0); - gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); - tmp2 = neon_load_reg(rm, 1); - gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp); - tcg_gen_shli_i32(tmp2, tmp2, 16); - tcg_gen_or_i32(tmp2, tmp2, tmp); - tcg_temp_free_i32(tmp); - tmp = neon_load_reg(rm, 2); - gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp); - tmp3 = neon_load_reg(rm, 3); - neon_store_reg(rd, 0, tmp2); - gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp); - tcg_gen_shli_i32(tmp3, tmp3, 16); - tcg_gen_or_i32(tmp3, tmp3, tmp); - neon_store_reg(rd, 1, tmp3); - tcg_temp_free_i32(tmp); - tcg_temp_free_i32(ahp); - tcg_temp_free_ptr(fpst); - break; - } - case NEON_2RM_VCVT_F32_F16: - { - TCGv_ptr fpst; - TCGv_i32 ahp; - if (!dc_isar_feature(aa32_fp16_spconv, s) || - q || (rd & 1)) { - return 1; - } - fpst = get_fpstatus_ptr(true); - ahp = get_ahp_flag(); - tmp3 = tcg_temp_new_i32(); - tmp = neon_load_reg(rm, 0); - tmp2 = neon_load_reg(rm, 1); - tcg_gen_ext16u_i32(tmp3, tmp); - gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); - neon_store_reg(rd, 0, tmp3); - tcg_gen_shri_i32(tmp, tmp, 16); - gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp); - neon_store_reg(rd, 1, tmp); - tmp3 = tcg_temp_new_i32(); - tcg_gen_ext16u_i32(tmp3, tmp2); - gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp); - neon_store_reg(rd, 2, tmp3); - tcg_gen_shri_i32(tmp2, tmp2, 16); - gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp); - neon_store_reg(rd, 3, tmp2); - tcg_temp_free_i32(ahp); - tcg_temp_free_ptr(fpst); - break; - } - case NEON_2RM_AESE: case NEON_2RM_AESMC: - if (!dc_isar_feature(aa32_aes, s) || ((rm | rd) & 1)) { - return 1; - } - /* - * Bit 6 is the lowest opcode bit; it distinguishes - * between encryption (AESE/AESMC) and decryption - * (AESD/AESIMC). 
- */ - if (op == NEON_2RM_AESE) { - tcg_gen_gvec_3_ool(vfp_reg_offset(true, rd), - vfp_reg_offset(true, rd), - vfp_reg_offset(true, rm), - 16, 16, extract32(insn, 6, 1), - gen_helper_crypto_aese); - } else { - tcg_gen_gvec_2_ool(vfp_reg_offset(true, rd), - vfp_reg_offset(true, rm), - 16, 16, extract32(insn, 6, 1), - gen_helper_crypto_aesmc); - } - break; - case NEON_2RM_SHA1H: - if (!dc_isar_feature(aa32_sha1, s) || ((rm | rd) & 1)) { - return 1; - } - tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, 16, 16, 0, - gen_helper_crypto_sha1h); - break; - case NEON_2RM_SHA1SU1: - if ((rm | rd) & 1) { - return 1; - } - /* bit 6 (q): set -> SHA256SU0, cleared -> SHA1SU1 */ - if (q) { - if (!dc_isar_feature(aa32_sha2, s)) { - return 1; - } - } else if (!dc_isar_feature(aa32_sha1, s)) { - return 1; - } - tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, 16, 16, 0, - q ? gen_helper_crypto_sha256su0 - : gen_helper_crypto_sha1su1); - break; - case NEON_2RM_VMVN: - tcg_gen_gvec_not(0, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VNEG: - tcg_gen_gvec_neg(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VABS: - tcg_gen_gvec_abs(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - - case NEON_2RM_VCEQ0: - gen_gvec_ceq0(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VCGT0: - gen_gvec_cgt0(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VCLE0: - gen_gvec_cle0(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VCGE0: - gen_gvec_cge0(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - case NEON_2RM_VCLT0: - gen_gvec_clt0(size, rd_ofs, rm_ofs, vec_size, vec_size); - break; - - default: - elementwise: - for (pass = 0; pass < (q ? 4 : 2); pass++) { - tmp = neon_load_reg(rm, pass); - switch (op) { - case NEON_2RM_VREV32: - switch (size) { - case 0: tcg_gen_bswap32_i32(tmp, tmp); break; - case 1: gen_swap_half(tmp); break; - default: abort(); - } - break; - case NEON_2RM_VREV16: - gen_rev16(tmp, tmp); - break; - case NEON_2RM_VCLS: - switch (size) { - case 0: gen_helper_neon_cls_s8(tmp, tmp); break; - case 1: gen_helper_neon_cls_s16(tmp, tmp); break; - case 2: gen_helper_neon_cls_s32(tmp, tmp); break; - default: abort(); - } - break; - case NEON_2RM_VCLZ: - switch (size) { - case 0: gen_helper_neon_clz_u8(tmp, tmp); break; - case 1: gen_helper_neon_clz_u16(tmp, tmp); break; - case 2: tcg_gen_clzi_i32(tmp, tmp, 32); break; - default: abort(); - } - break; - case NEON_2RM_VCNT: - gen_helper_neon_cnt_u8(tmp, tmp); - break; - case NEON_2RM_VQABS: - switch (size) { - case 0: - gen_helper_neon_qabs_s8(tmp, cpu_env, tmp); - break; - case 1: - gen_helper_neon_qabs_s16(tmp, cpu_env, tmp); - break; - case 2: - gen_helper_neon_qabs_s32(tmp, cpu_env, tmp); - break; - default: abort(); - } - break; - case NEON_2RM_VQNEG: - switch (size) { - case 0: - gen_helper_neon_qneg_s8(tmp, cpu_env, tmp); - break; - case 1: - gen_helper_neon_qneg_s16(tmp, cpu_env, tmp); - break; - case 2: - gen_helper_neon_qneg_s32(tmp, cpu_env, tmp); - break; - default: abort(); - } - break; - case NEON_2RM_VCGT0_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - tmp2 = tcg_const_i32(0); - gen_helper_neon_cgt_f32(tmp, tmp, tmp2, fpstatus); - tcg_temp_free_i32(tmp2); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCGE0_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - tmp2 = tcg_const_i32(0); - gen_helper_neon_cge_f32(tmp, tmp, tmp2, fpstatus); - tcg_temp_free_i32(tmp2); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCEQ0_F: - { - TCGv_ptr fpstatus = 
get_fpstatus_ptr(1); - tmp2 = tcg_const_i32(0); - gen_helper_neon_ceq_f32(tmp, tmp, tmp2, fpstatus); - tcg_temp_free_i32(tmp2); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCLE0_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - tmp2 = tcg_const_i32(0); - gen_helper_neon_cge_f32(tmp, tmp2, tmp, fpstatus); - tcg_temp_free_i32(tmp2); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCLT0_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - tmp2 = tcg_const_i32(0); - gen_helper_neon_cgt_f32(tmp, tmp2, tmp, fpstatus); - tcg_temp_free_i32(tmp2); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VABS_F: - gen_helper_vfp_abss(tmp, tmp); - break; - case NEON_2RM_VNEG_F: - gen_helper_vfp_negs(tmp, tmp); - break; - case NEON_2RM_VSWP: - tmp2 = neon_load_reg(rd, pass); - neon_store_reg(rm, pass, tmp2); - break; - case NEON_2RM_VTRN: - tmp2 = neon_load_reg(rd, pass); - switch (size) { - case 0: gen_neon_trn_u8(tmp, tmp2); break; - case 1: gen_neon_trn_u16(tmp, tmp2); break; - default: abort(); - } - neon_store_reg(rm, pass, tmp2); - break; - case NEON_2RM_VRINTN: - case NEON_2RM_VRINTA: - case NEON_2RM_VRINTM: - case NEON_2RM_VRINTP: - case NEON_2RM_VRINTZ: - { - TCGv_i32 tcg_rmode; - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - int rmode; - - if (op == NEON_2RM_VRINTZ) { - rmode = FPROUNDING_ZERO; - } else { - rmode = fp_decode_rm[((op & 0x6) >> 1) ^ 1]; - } - - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, - cpu_env); - gen_helper_rints(tmp, tmp, fpstatus); - gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, - cpu_env); - tcg_temp_free_ptr(fpstatus); - tcg_temp_free_i32(tcg_rmode); - break; - } - case NEON_2RM_VRINTX: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_rints_exact(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCVTAU: - case NEON_2RM_VCVTAS: - case NEON_2RM_VCVTNU: - case NEON_2RM_VCVTNS: - case NEON_2RM_VCVTPU: - case NEON_2RM_VCVTPS: - case NEON_2RM_VCVTMU: - case NEON_2RM_VCVTMS: - { - bool is_signed = !extract32(insn, 7, 1); - TCGv_ptr fpst = get_fpstatus_ptr(1); - TCGv_i32 tcg_rmode, tcg_shift; - int rmode = fp_decode_rm[extract32(insn, 8, 2)]; - - tcg_shift = tcg_const_i32(0); - tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); - gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, - cpu_env); - - if (is_signed) { - gen_helper_vfp_tosls(tmp, tmp, - tcg_shift, fpst); - } else { - gen_helper_vfp_touls(tmp, tmp, - tcg_shift, fpst); - } - - gen_helper_set_neon_rmode(tcg_rmode, tcg_rmode, - cpu_env); - tcg_temp_free_i32(tcg_rmode); - tcg_temp_free_i32(tcg_shift); - tcg_temp_free_ptr(fpst); - break; - } - case NEON_2RM_VRECPE: - gen_helper_recpe_u32(tmp, tmp); - break; - case NEON_2RM_VRSQRTE: - gen_helper_rsqrte_u32(tmp, tmp); - break; - case NEON_2RM_VRECPE_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_recpe_f32(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VRSQRTE_F: - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_rsqrte_f32(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCVT_FS: /* VCVT.F32.S32 */ - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_vfp_sitos(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCVT_FU: /* VCVT.F32.U32 */ - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_vfp_uitos(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCVT_SF: /* VCVT.S32.F32 */ - { - 
TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_vfp_tosizs(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - case NEON_2RM_VCVT_UF: /* VCVT.U32.F32 */ - { - TCGv_ptr fpstatus = get_fpstatus_ptr(1); - gen_helper_vfp_touizs(tmp, tmp, fpstatus); - tcg_temp_free_ptr(fpstatus); - break; - } - default: - /* Reserved op values were caught by the - * neon_2rm_sizes[] check earlier. - */ - abort(); - } - neon_store_reg(rd, pass, tmp); - } - break; - } - } else { - /* VTBL, VTBX, VDUP: handled by decodetree */ - return 1; - } - } - } - return 0; -} - static int disas_coproc_insn(DisasContext *s, uint32_t insn) { int cpnum, is64, crn, crm, opc1, opc2, isread, rt, rt2; @@ -6153,7 +5114,7 @@ static void gen_srs(DisasContext *s, tcg_temp_free_i32(tmp); } tcg_temp_free_i32(addr); - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } /* Generate a label used for skipping this instruction */ @@ -8417,7 +7378,7 @@ static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) t1 = load_reg(s, a->rn); t2 = load_reg(s, a->rm); if (m_swap) { - gen_swap_half(t2); + gen_swap_half(t2, t2); } gen_smul_dual(t1, t2); @@ -8475,7 +7436,7 @@ static bool op_smlald(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub) t1 = load_reg(s, a->rn); t2 = load_reg(s, a->rm); if (m_swap) { - gen_swap_half(t2); + gen_swap_half(t2, t2); } gen_smul_dual(t1, t2); @@ -8824,9 +7785,6 @@ static bool do_ldm(DisasContext *s, arg_ldst_block *a, int min_n) gen_io_start(); } gen_helper_cpsr_write_eret(cpu_env, tmp); - if (tb_cflags(s->base.tb) & CF_USE_ICOUNT) { - gen_io_end(); - } tcg_temp_free_i32(tmp); /* Must exit loop to check un-masked IRQs */ s->base.is_jmp = DISAS_EXIT; @@ -9202,7 +8160,7 @@ static bool trans_SETEND(DisasContext *s, arg_SETEND *a) } if (a->E != (s->be_data == MO_BE)) { gen_helper_setend(cpu_env); - s->base.is_jmp = DISAS_UPDATE; + s->base.is_jmp = DISAS_UPDATE_EXIT; } return true; } @@ -9283,13 +8241,6 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn) } /* fall back to legacy decoder */ - if (((insn >> 25) & 7) == 1) { - /* NEON Data processing. */ - if (disas_neon_data_insn(s, insn)) { - goto illegal_op; - } - return; - } if ((insn & 0x0e000f00) == 0x0c000100) { if (arm_dc_feature(s, ARM_FEATURE_IWMMXT)) { /* iWMMXt register transfer. */ @@ -9477,11 +8428,8 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) break; } if (((insn >> 24) & 3) == 3) { - /* Translate into the equivalent ARM encoding. */ - insn = (insn & 0xe2ffffff) | ((insn & (1 << 28)) >> 4) | (1 << 28); - if (disas_neon_data_insn(s, insn)) { - goto illegal_op; - } + /* Neon DP, but failed disas_neon_dp() */ + goto illegal_op; } else if (((insn >> 8) & 0xe) == 10) { /* VFP, but failed disas_vfp. 
*/ goto illegal_op; @@ -9925,7 +8873,8 @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) break; case DISAS_NEXT: case DISAS_TOO_MANY: - case DISAS_UPDATE: + case DISAS_UPDATE_EXIT: + case DISAS_UPDATE_NOCHAIN: gen_set_pc_im(dc, dc->base.pc_next); /* fall through */ default: @@ -9949,10 +8898,13 @@ static void arm_tr_tb_stop(DisasContextBase *dcbase, CPUState *cpu) case DISAS_TOO_MANY: gen_goto_tb(dc, 1, dc->base.pc_next); break; + case DISAS_UPDATE_NOCHAIN: + gen_set_pc_im(dc, dc->base.pc_next); + /* fall through */ case DISAS_JUMP: gen_goto_ptr(); break; - case DISAS_UPDATE: + case DISAS_UPDATE_EXIT: gen_set_pc_im(dc, dc->base.pc_next); /* fall through */ default: diff --git a/target/arm/translate.h b/target/arm/translate.h index 62ed5c4780..16f2699ad7 100644 --- a/target/arm/translate.h +++ b/target/arm/translate.h @@ -30,6 +30,7 @@ typedef struct DisasContext { ARMMMUIdx mmu_idx; /* MMU index to use for normal loads/stores */ uint8_t tbii; /* TBI1|TBI0 for insns */ uint8_t tbid; /* TBI1|TBI0 for data */ + uint8_t tcma; /* TCMA1|TCMA0 for MTE */ bool ns; /* Use non-secure CPREG bank on access */ int fp_excp_el; /* FP exception EL or 0 if enabled */ int sve_excp_el; /* SVE exception EL or 0 if enabled */ @@ -77,6 +78,10 @@ typedef struct DisasContext { bool unpriv; /* True if v8.3-PAuth is active. */ bool pauth_active; + /* True if v8.5-MTE access to tags is enabled. */ + bool ata; + /* True if v8.5-MTE tag checks affect the PE; index with is_unpriv. */ + bool mte_active[2]; /* True with v8.5-BTI and SCTLR_ELx.BT* set. */ bool bt; /* True if any CP15 access is trapped by HSTR_EL2 */ @@ -86,6 +91,8 @@ typedef struct DisasContext { * < 0, set by the current instruction. */ int8_t btype; + /* A copy of cpu->dcz_blocksize. */ + uint8_t dcz_blocksize; /* True if this page is guarded. */ bool guarded_page; /* Bottom two bits of XScale c15_cpar coprocessor access control reg */ @@ -148,7 +155,8 @@ static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn) /* is_jmp field values */ #define DISAS_JUMP DISAS_TARGET_0 /* only pc was modified dynamically */ -#define DISAS_UPDATE DISAS_TARGET_1 /* cpu state was modified dynamically */ +/* CPU state was modified dynamically; exit to main loop for interrupts. */ +#define DISAS_UPDATE_EXIT DISAS_TARGET_1 /* These instructions trap after executing, so the A32/T32 decoder must * defer them until after the conditional execution state has been updated. * WFI also needs special handling when single-stepping. @@ -164,13 +172,16 @@ static inline void disas_set_insn_syndrome(DisasContext *s, uint32_t syn) * custom end-of-TB code) */ #define DISAS_BX_EXCRET DISAS_TARGET_8 -/* For instructions which want an immediate exit to the main loop, - * as opposed to attempting to use lookup_and_goto_ptr. Unlike - * DISAS_UPDATE this doesn't write the PC on exiting the translation - * loop so you need to ensure something (gen_a64_set_pc_im or runtime - * helper) has done so before we reach return from cpu_tb_exec. +/* + * For instructions which want an immediate exit to the main loop, as opposed + * to attempting to use lookup_and_goto_ptr. Unlike DISAS_UPDATE_EXIT, this + * doesn't write the PC on exiting the translation loop so you need to ensure + * something (gen_a64_set_pc_im or runtime helper) has done so before we reach + * return from cpu_tb_exec. */ #define DISAS_EXIT DISAS_TARGET_9 +/* CPU state was modified dynamically; no need to exit, but do not chain. 
*/ +#define DISAS_UPDATE_NOCHAIN DISAS_TARGET_10 #ifdef TARGET_AARCH64 void a64_translate_init(void); @@ -363,6 +374,7 @@ typedef void GVecGen4Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); /* Function prototype for gen_ functions for calling Neon helpers */ +typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32); typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32); typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32); typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); @@ -372,9 +384,10 @@ typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64); typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64); typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32); typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32); -typedef void NeonGenTwoSingleOPFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); -typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); -typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64); +typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); +typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); +typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64); typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); diff --git a/target/i386/cpu.c b/target/i386/cpu.c index b1b311baa2..36cbd3d027 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1404,6 +1404,10 @@ static FeatureDep feature_dependencies[] = { .from = { FEAT_VMX_SECONDARY_CTLS, VMX_SECONDARY_EXEC_ENABLE_VMFUNC }, .to = { FEAT_VMX_VMFUNC, ~0ull }, }, + { + .from = { FEAT_8000_0001_ECX, CPUID_EXT3_SVM }, + .to = { FEAT_SVM, ~0ull }, + }, }; typedef struct X86RegisterInfo32 { @@ -3135,6 +3139,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", .props = (PropValue[]) { { "arch-capabilities", "on" }, { "rdctl-no", "on" }, @@ -3146,6 +3151,7 @@ static X86CPUDefinition builtin_x86_defs[] = { }, { .version = 3, .alias = "Cascadelake-Server-noTSX", + .note = "ARCH_CAPABILITIES, no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, @@ -3367,6 +3373,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, + .note = "no TSX", .alias = "Icelake-Client-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3484,6 +3491,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, + .note = "no TSX", .alias = "Icelake-Server-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3604,6 +3612,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, + .note = "no MPX, no MONITOR", .props = (PropValue[]) { { "monitor", "off" }, { "mpx", "off" }, diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c index 8ef5b463ea..71cec3962f 100644 --- a/target/i386/fpu_helper.c +++ b/target/i386/fpu_helper.c @@ -25,6 +25,7 @@ #include "exec/exec-all.h" #include "exec/cpu_ldst.h" #include "fpu/softfloat.h" +#include "fpu/softfloat-macros.h" #ifdef CONFIG_SOFTMMU #include "hw/irq.h" @@ -836,27 +837,390 @@ void helper_fbst_ST0(CPUX86State *env, target_ulong ptr) merge_exception_flags(env, old_flags); } -void helper_f2xm1(CPUX86State *env) -{ - double val = floatx80_to_double(env, ST0); +/* 128-bit significand of log(2). 
*/ +#define ln2_sig_high 0xb17217f7d1cf79abULL +#define ln2_sig_low 0xc9e3b39803f2f6afULL - val = pow(2.0, val) - 1.0; - ST0 = double_to_floatx80(env, val); -} +/* + * Polynomial coefficients for an approximation to (2^x - 1) / x, on + * the interval [-1/64, 1/64]. + */ +#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL) +#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL) +#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL) +#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL) +#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL) +#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL) +#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL) +#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL) +#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL) + +struct f2xm1_data { + /* + * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1 + * are very close to exact floatx80 values. + */ + floatx80 t; + /* The value of 2^t. */ + floatx80 exp2; + /* The value of 2^t - 1. */ + floatx80 exp2m1; +}; + +static const struct f2xm1_data f2xm1_table[65] = { + { make_floatx80(0xbfff, 0x8000000000000000ULL), + make_floatx80(0x3ffe, 0x8000000000000000ULL), + make_floatx80(0xbffe, 0x8000000000000000ULL) }, + { make_floatx80(0xbffe, 0xf800000000002e7eULL), + make_floatx80(0x3ffe, 0x82cd8698ac2b9160ULL), + make_floatx80(0xbffd, 0xfa64f2cea7a8dd40ULL) }, + { make_floatx80(0xbffe, 0xefffffffffffe960ULL), + make_floatx80(0x3ffe, 0x85aac367cc488345ULL), + make_floatx80(0xbffd, 0xf4aa7930676ef976ULL) }, + { make_floatx80(0xbffe, 0xe800000000006f10ULL), + make_floatx80(0x3ffe, 0x88980e8092da5c14ULL), + make_floatx80(0xbffd, 0xeecfe2feda4b47d8ULL) }, + { make_floatx80(0xbffe, 0xe000000000008a45ULL), + make_floatx80(0x3ffe, 0x8b95c1e3ea8ba2a5ULL), + make_floatx80(0xbffd, 0xe8d47c382ae8bab6ULL) }, + { make_floatx80(0xbffe, 0xd7ffffffffff8a9eULL), + make_floatx80(0x3ffe, 0x8ea4398b45cd8116ULL), + make_floatx80(0xbffd, 0xe2b78ce97464fdd4ULL) }, + { make_floatx80(0xbffe, 0xd0000000000019a0ULL), + make_floatx80(0x3ffe, 0x91c3d373ab11b919ULL), + make_floatx80(0xbffd, 0xdc785918a9dc8dceULL) }, + { make_floatx80(0xbffe, 0xc7ffffffffff14dfULL), + make_floatx80(0x3ffe, 0x94f4efa8fef76836ULL), + make_floatx80(0xbffd, 0xd61620ae02112f94ULL) }, + { make_floatx80(0xbffe, 0xc000000000006530ULL), + make_floatx80(0x3ffe, 0x9837f0518db87fbbULL), + make_floatx80(0xbffd, 0xcf901f5ce48f008aULL) }, + { make_floatx80(0xbffe, 0xb7ffffffffff1723ULL), + make_floatx80(0x3ffe, 0x9b8d39b9d54eb74cULL), + make_floatx80(0xbffd, 0xc8e58c8c55629168ULL) }, + { make_floatx80(0xbffe, 0xb00000000000b5e1ULL), + make_floatx80(0x3ffe, 0x9ef5326091a0c366ULL), + make_floatx80(0xbffd, 0xc2159b3edcbe7934ULL) }, + { make_floatx80(0xbffe, 0xa800000000006f8aULL), + make_floatx80(0x3ffe, 0xa27043030c49370aULL), + make_floatx80(0xbffd, 0xbb1f79f9e76d91ecULL) }, + { make_floatx80(0xbffe, 0x9fffffffffff816aULL), + make_floatx80(0x3ffe, 0xa5fed6a9b15171cfULL), + make_floatx80(0xbffd, 0xb40252ac9d5d1c62ULL) }, + { make_floatx80(0xbffe, 0x97ffffffffffb621ULL), + make_floatx80(0x3ffe, 0xa9a15ab4ea7c30e6ULL), + make_floatx80(0xbffd, 0xacbd4a962b079e34ULL) }, + { make_floatx80(0xbffe, 0x8fffffffffff162bULL), + make_floatx80(0x3ffe, 0xad583eea42a1b886ULL), + make_floatx80(0xbffd, 0xa54f822b7abc8ef4ULL) }, + { make_floatx80(0xbffe, 0x87ffffffffff4d34ULL), + make_floatx80(0x3ffe, 0xb123f581d2ac7b51ULL), + make_floatx80(0xbffd, 
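Read together, the f2xm1_coeff_* constants (approximating (2^x - 1) / x on [-1/64, 1/64]) and the table rows pairing a near-multiple of 1/32 t with 2^t and 2^t - 1 point to the usual argument-reduction identity; this is inferred from the comments above rather than stated by the patch. Writing x = t + r with |r| <= 1/64:

    2^x - 1 = (2^t - 1) + 2^t * (2^r - 1)

where 2^t and 2^t - 1 come straight from the table row and 2^r - 1 is evaluated as r * P(r), with

    P(r) ~= (2^r - 1) / r = ln2 + (ln2)^2 * r / 2! + (ln2)^3 * r^2 / 3! + ...

The leading constants agree with that series: f2xm1_coeff_0 ~= 0.6931 ~= ln2 and f2xm1_coeff_1 ~= 0.2402 ~= (ln2)^2 / 2.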
0x9db814fc5aa7095eULL) }, + { make_floatx80(0xbffe, 0x800000000000227dULL), + make_floatx80(0x3ffe, 0xb504f333f9de539dULL), + make_floatx80(0xbffd, 0x95f619980c4358c6ULL) }, + { make_floatx80(0xbffd, 0xefffffffffff3978ULL), + make_floatx80(0x3ffe, 0xb8fbaf4762fbd0a1ULL), + make_floatx80(0xbffd, 0x8e08a1713a085ebeULL) }, + { make_floatx80(0xbffd, 0xe00000000000df81ULL), + make_floatx80(0x3ffe, 0xbd08a39f580bfd8cULL), + make_floatx80(0xbffd, 0x85eeb8c14fe804e8ULL) }, + { make_floatx80(0xbffd, 0xd00000000000bccfULL), + make_floatx80(0x3ffe, 0xc12c4cca667062f6ULL), + make_floatx80(0xbffc, 0xfb4eccd6663e7428ULL) }, + { make_floatx80(0xbffd, 0xc00000000000eff0ULL), + make_floatx80(0x3ffe, 0xc5672a1155069abeULL), + make_floatx80(0xbffc, 0xea6357baabe59508ULL) }, + { make_floatx80(0xbffd, 0xb000000000000fe6ULL), + make_floatx80(0x3ffe, 0xc9b9bd866e2f234bULL), + make_floatx80(0xbffc, 0xd91909e6474372d4ULL) }, + { make_floatx80(0xbffd, 0x9fffffffffff2172ULL), + make_floatx80(0x3ffe, 0xce248c151f84bf00ULL), + make_floatx80(0xbffc, 0xc76dcfab81ed0400ULL) }, + { make_floatx80(0xbffd, 0x8fffffffffffafffULL), + make_floatx80(0x3ffe, 0xd2a81d91f12afb2bULL), + make_floatx80(0xbffc, 0xb55f89b83b541354ULL) }, + { make_floatx80(0xbffc, 0xffffffffffff81a3ULL), + make_floatx80(0x3ffe, 0xd744fccad69d7d5eULL), + make_floatx80(0xbffc, 0xa2ec0cd4a58a0a88ULL) }, + { make_floatx80(0xbffc, 0xdfffffffffff1568ULL), + make_floatx80(0x3ffe, 0xdbfbb797daf25a44ULL), + make_floatx80(0xbffc, 0x901121a0943696f0ULL) }, + { make_floatx80(0xbffc, 0xbfffffffffff68daULL), + make_floatx80(0x3ffe, 0xe0ccdeec2a94f811ULL), + make_floatx80(0xbffb, 0xf999089eab583f78ULL) }, + { make_floatx80(0xbffc, 0x9fffffffffff4690ULL), + make_floatx80(0x3ffe, 0xe5b906e77c83657eULL), + make_floatx80(0xbffb, 0xd237c8c41be4d410ULL) }, + { make_floatx80(0xbffb, 0xffffffffffff8aeeULL), + make_floatx80(0x3ffe, 0xeac0c6e7dd24427cULL), + make_floatx80(0xbffb, 0xa9f9c8c116ddec20ULL) }, + { make_floatx80(0xbffb, 0xbfffffffffff2d18ULL), + make_floatx80(0x3ffe, 0xefe4b99bdcdb06ebULL), + make_floatx80(0xbffb, 0x80da33211927c8a8ULL) }, + { make_floatx80(0xbffa, 0xffffffffffff8ccbULL), + make_floatx80(0x3ffe, 0xf5257d152486d0f4ULL), + make_floatx80(0xbffa, 0xada82eadb792f0c0ULL) }, + { make_floatx80(0xbff9, 0xffffffffffff11feULL), + make_floatx80(0x3ffe, 0xfa83b2db722a0846ULL), + make_floatx80(0xbff9, 0xaf89a491babef740ULL) }, + { floatx80_zero, + make_floatx80(0x3fff, 0x8000000000000000ULL), + floatx80_zero }, + { make_floatx80(0x3ff9, 0xffffffffffff2680ULL), + make_floatx80(0x3fff, 0x82cd8698ac2b9f6fULL), + make_floatx80(0x3ff9, 0xb361a62b0ae7dbc0ULL) }, + { make_floatx80(0x3ffb, 0x800000000000b500ULL), + make_floatx80(0x3fff, 0x85aac367cc488345ULL), + make_floatx80(0x3ffa, 0xb5586cf9891068a0ULL) }, + { make_floatx80(0x3ffb, 0xbfffffffffff4b67ULL), + make_floatx80(0x3fff, 0x88980e8092da7cceULL), + make_floatx80(0x3ffb, 0x8980e8092da7cce0ULL) }, + { make_floatx80(0x3ffb, 0xffffffffffffff57ULL), + make_floatx80(0x3fff, 0x8b95c1e3ea8bd6dfULL), + make_floatx80(0x3ffb, 0xb95c1e3ea8bd6df0ULL) }, + { make_floatx80(0x3ffc, 0x9fffffffffff811fULL), + make_floatx80(0x3fff, 0x8ea4398b45cd4780ULL), + make_floatx80(0x3ffb, 0xea4398b45cd47800ULL) }, + { make_floatx80(0x3ffc, 0xbfffffffffff9980ULL), + make_floatx80(0x3fff, 0x91c3d373ab11b919ULL), + make_floatx80(0x3ffc, 0x8e1e9b9d588dc8c8ULL) }, + { make_floatx80(0x3ffc, 0xdffffffffffff631ULL), + make_floatx80(0x3fff, 0x94f4efa8fef70864ULL), + make_floatx80(0x3ffc, 0xa7a77d47f7b84320ULL) }, + { make_floatx80(0x3ffc, 
0xffffffffffff2499ULL), + make_floatx80(0x3fff, 0x9837f0518db892d4ULL), + make_floatx80(0x3ffc, 0xc1bf828c6dc496a0ULL) }, + { make_floatx80(0x3ffd, 0x8fffffffffff80fbULL), + make_floatx80(0x3fff, 0x9b8d39b9d54e3a79ULL), + make_floatx80(0x3ffc, 0xdc69cdceaa71d3c8ULL) }, + { make_floatx80(0x3ffd, 0x9fffffffffffbc23ULL), + make_floatx80(0x3fff, 0x9ef5326091a10313ULL), + make_floatx80(0x3ffc, 0xf7a993048d081898ULL) }, + { make_floatx80(0x3ffd, 0xafffffffffff20ecULL), + make_floatx80(0x3fff, 0xa27043030c49370aULL), + make_floatx80(0x3ffd, 0x89c10c0c3124dc28ULL) }, + { make_floatx80(0x3ffd, 0xc00000000000fd2cULL), + make_floatx80(0x3fff, 0xa5fed6a9b15171cfULL), + make_floatx80(0x3ffd, 0x97fb5aa6c545c73cULL) }, + { make_floatx80(0x3ffd, 0xd0000000000093beULL), + make_floatx80(0x3fff, 0xa9a15ab4ea7c30e6ULL), + make_floatx80(0x3ffd, 0xa6856ad3a9f0c398ULL) }, + { make_floatx80(0x3ffd, 0xe00000000000c2aeULL), + make_floatx80(0x3fff, 0xad583eea42a17876ULL), + make_floatx80(0x3ffd, 0xb560fba90a85e1d8ULL) }, + { make_floatx80(0x3ffd, 0xefffffffffff1e3fULL), + make_floatx80(0x3fff, 0xb123f581d2abef6cULL), + make_floatx80(0x3ffd, 0xc48fd6074aafbdb0ULL) }, + { make_floatx80(0x3ffd, 0xffffffffffff1c23ULL), + make_floatx80(0x3fff, 0xb504f333f9de2cadULL), + make_floatx80(0x3ffd, 0xd413cccfe778b2b4ULL) }, + { make_floatx80(0x3ffe, 0x8800000000006344ULL), + make_floatx80(0x3fff, 0xb8fbaf4762fbd0a1ULL), + make_floatx80(0x3ffd, 0xe3eebd1d8bef4284ULL) }, + { make_floatx80(0x3ffe, 0x9000000000005d67ULL), + make_floatx80(0x3fff, 0xbd08a39f580c668dULL), + make_floatx80(0x3ffd, 0xf4228e7d60319a34ULL) }, + { make_floatx80(0x3ffe, 0x9800000000009127ULL), + make_floatx80(0x3fff, 0xc12c4cca6670e042ULL), + make_floatx80(0x3ffe, 0x82589994cce1c084ULL) }, + { make_floatx80(0x3ffe, 0x9fffffffffff06f9ULL), + make_floatx80(0x3fff, 0xc5672a11550655c3ULL), + make_floatx80(0x3ffe, 0x8ace5422aa0cab86ULL) }, + { make_floatx80(0x3ffe, 0xa7fffffffffff80dULL), + make_floatx80(0x3fff, 0xc9b9bd866e2f234bULL), + make_floatx80(0x3ffe, 0x93737b0cdc5e4696ULL) }, + { make_floatx80(0x3ffe, 0xafffffffffff1470ULL), + make_floatx80(0x3fff, 0xce248c151f83fd69ULL), + make_floatx80(0x3ffe, 0x9c49182a3f07fad2ULL) }, + { make_floatx80(0x3ffe, 0xb800000000000e0aULL), + make_floatx80(0x3fff, 0xd2a81d91f12aec5cULL), + make_floatx80(0x3ffe, 0xa5503b23e255d8b8ULL) }, + { make_floatx80(0x3ffe, 0xc00000000000b7faULL), + make_floatx80(0x3fff, 0xd744fccad69dd630ULL), + make_floatx80(0x3ffe, 0xae89f995ad3bac60ULL) }, + { make_floatx80(0x3ffe, 0xc800000000003aa6ULL), + make_floatx80(0x3fff, 0xdbfbb797daf25a44ULL), + make_floatx80(0x3ffe, 0xb7f76f2fb5e4b488ULL) }, + { make_floatx80(0x3ffe, 0xd00000000000a6aeULL), + make_floatx80(0x3fff, 0xe0ccdeec2a954685ULL), + make_floatx80(0x3ffe, 0xc199bdd8552a8d0aULL) }, + { make_floatx80(0x3ffe, 0xd800000000004165ULL), + make_floatx80(0x3fff, 0xe5b906e77c837155ULL), + make_floatx80(0x3ffe, 0xcb720dcef906e2aaULL) }, + { make_floatx80(0x3ffe, 0xe00000000000582cULL), + make_floatx80(0x3fff, 0xeac0c6e7dd24713aULL), + make_floatx80(0x3ffe, 0xd5818dcfba48e274ULL) }, + { make_floatx80(0x3ffe, 0xe800000000001a5dULL), + make_floatx80(0x3fff, 0xefe4b99bdcdb06ebULL), + make_floatx80(0x3ffe, 0xdfc97337b9b60dd6ULL) }, + { make_floatx80(0x3ffe, 0xefffffffffffc1efULL), + make_floatx80(0x3fff, 0xf5257d152486a2faULL), + make_floatx80(0x3ffe, 0xea4afa2a490d45f4ULL) }, + { make_floatx80(0x3ffe, 0xf800000000001069ULL), + make_floatx80(0x3fff, 0xfa83b2db722a0e5cULL), + make_floatx80(0x3ffe, 0xf50765b6e4541cb8ULL) }, + { make_floatx80(0x3fff, 
0x8000000000000000ULL), + make_floatx80(0x4000, 0x8000000000000000ULL), + make_floatx80(0x3fff, 0x8000000000000000ULL) }, +}; -void helper_fyl2x(CPUX86State *env) +void helper_f2xm1(CPUX86State *env) { - double fptemp = floatx80_to_double(env, ST0); + uint8_t old_flags = save_exception_flags(env); + uint64_t sig = extractFloatx80Frac(ST0); + int32_t exp = extractFloatx80Exp(ST0); + bool sign = extractFloatx80Sign(ST0); - if (fptemp > 0.0) { - fptemp = log(fptemp) / log(2.0); /* log2(ST) */ - fptemp *= floatx80_to_double(env, ST1); - ST1 = double_to_floatx80(env, fptemp); - fpop(env); + if (floatx80_invalid_encoding(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST0 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + if (floatx80_is_signaling_nan(ST0, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST0 = floatx80_silence_nan(ST0, &env->fp_status); + } + } else if (exp > 0x3fff || + (exp == 0x3fff && sig != (0x8000000000000000ULL))) { + /* Out of range for the instruction, treat as invalid. */ + float_raise(float_flag_invalid, &env->fp_status); + ST0 = floatx80_default_nan(&env->fp_status); + } else if (exp == 0x3fff) { + /* Argument 1 or -1, exact result 1 or -0.5. */ + if (sign) { + ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL); + } + } else if (exp < 0x3fb0) { + if (!floatx80_is_zero(ST0)) { + /* + * Multiplying the argument by an extra-precision version + * of log(2) is sufficiently precise. Zero arguments are + * returned unchanged. + */ + uint64_t sig0, sig1, sig2; + if (exp == 0) { + normalizeFloatx80Subnormal(sig, &exp, &sig); + } + mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1, + &sig2); + /* This result is inexact. */ + sig1 |= 1; + ST0 = normalizeRoundAndPackFloatx80(80, sign, exp, sig0, sig1, + &env->fp_status); + } } else { - env->fpus &= ~0x4700; - env->fpus |= 0x400; + floatx80 tmp, y, accum; + bool asign, bsign; + int32_t n, aexp, bexp; + uint64_t asig0, asig1, asig2, bsig0, bsig1; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = 80; + + /* Find the nearest multiple of 1/32 to the argument. */ + tmp = floatx80_scalbn(ST0, 5, &env->fp_status); + n = 32 + floatx80_to_int32(tmp, &env->fp_status); + y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status); + + if (floatx80_is_zero(y)) { + /* + * Use the value of 2^t - 1 from the table, to avoid + * needing to special-case zero as a result of + * multiplication below. + */ + ST0 = f2xm1_table[n].exp2m1; + set_float_exception_flags(float_flag_inexact, &env->fp_status); + env->fp_status.float_rounding_mode = save_mode; + } else { + /* + * Compute the lower parts of a polynomial expansion for + * (2^y - 1) / y. 
+ */ + accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status); + accum = floatx80_mul(accum, y, &env->fp_status); + accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status); + + /* + * The full polynomial expansion is f2xm1_coeff_0 + accum + * (where accum has much lower magnitude, and so, in + * particular, carry out of the addition is not possible). + * (This expansion is only accurate to about 70 bits, not + * 128 bits.) + */ + aexp = extractFloatx80Exp(f2xm1_coeff_0); + asign = extractFloatx80Sign(f2xm1_coeff_0); + shift128RightJamming(extractFloatx80Frac(accum), 0, + aexp - extractFloatx80Exp(accum), + &asig0, &asig1); + bsig0 = extractFloatx80Frac(f2xm1_coeff_0); + bsig1 = 0; + if (asign == extractFloatx80Sign(accum)) { + add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } else { + sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } + /* And thus compute an approximation to 2^y - 1. */ + mul128By64To192(asig0, asig1, extractFloatx80Frac(y), + &asig0, &asig1, &asig2); + aexp += extractFloatx80Exp(y) - 0x3ffe; + asign ^= extractFloatx80Sign(y); + if (n != 32) { + /* + * Multiply this by the precomputed value of 2^t and + * add that of 2^t - 1. + */ + mul128By64To192(asig0, asig1, + extractFloatx80Frac(f2xm1_table[n].exp2), + &asig0, &asig1, &asig2); + aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe; + bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1); + bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1); + bsig1 = 0; + if (bexp < aexp) { + shift128RightJamming(bsig0, bsig1, aexp - bexp, + &bsig0, &bsig1); + } else if (aexp < bexp) { + shift128RightJamming(asig0, asig1, bexp - aexp, + &asig0, &asig1); + aexp = bexp; + } + /* The sign of 2^t - 1 is always that of the result. */ + bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1); + if (asign == bsign) { + /* Avoid possible carry out of the addition. */ + shift128RightJamming(asig0, asig1, 1, + &asig0, &asig1); + shift128RightJamming(bsig0, bsig1, 1, + &bsig0, &bsig1); + ++aexp; + add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1); + } else { + sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + asign = bsign; + } + } + env->fp_status.float_rounding_mode = save_mode; + /* This result is inexact. */ + asig1 |= 1; + ST0 = normalizeRoundAndPackFloatx80(80, asign, aexp, asig0, asig1, + &env->fp_status); + } + + env->fp_status.floatx80_rounding_precision = save_prec; } + merge_exception_flags(env, old_flags); } void helper_fptan(CPUX86State *env) @@ -875,14 +1239,493 @@ void helper_fptan(CPUX86State *env) } } +/* Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision. 
*/ +#define pi_4_exp 0x3ffe +#define pi_4_sig_high 0xc90fdaa22168c234ULL +#define pi_4_sig_low 0xc4c6628b80dc1cd1ULL +#define pi_2_exp 0x3fff +#define pi_2_sig_high 0xc90fdaa22168c234ULL +#define pi_2_sig_low 0xc4c6628b80dc1cd1ULL +#define pi_34_exp 0x4000 +#define pi_34_sig_high 0x96cbe3f9990e91a7ULL +#define pi_34_sig_low 0x9394c9e8a0a5159dULL +#define pi_exp 0x4000 +#define pi_sig_high 0xc90fdaa22168c234ULL +#define pi_sig_low 0xc4c6628b80dc1cd1ULL + +/* + * Polynomial coefficients for an approximation to atan(x), with only + * odd powers of x used, for x in the interval [-1/16, 1/16]. (Unlike + * for some other approximations, no low part is needed for the first + * coefficient here to achieve a sufficiently accurate result, because + * the coefficient in this minimax approximation is very close to + * exactly 1.) + */ +#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL) +#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) +#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) +#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL) +#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) +#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) +#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) + +struct fpatan_data { + /* High and low parts of atan(x). */ + floatx80 atan_high, atan_low; +}; + +static const struct fpatan_data fpatan_table[9] = { + { floatx80_zero, + floatx80_zero }, + { make_floatx80(0x3ffb, 0xfeadd4d5617b6e33ULL), + make_floatx80(0xbfb9, 0xdda19d8305ddc420ULL) }, + { make_floatx80(0x3ffc, 0xfadbafc96406eb15ULL), + make_floatx80(0x3fbb, 0xdb8f3debef442fccULL) }, + { make_floatx80(0x3ffd, 0xb7b0ca0f26f78474ULL), + make_floatx80(0xbfbc, 0xeab9bdba460376faULL) }, + { make_floatx80(0x3ffd, 0xed63382b0dda7b45ULL), + make_floatx80(0x3fbc, 0xdfc88bd978751a06ULL) }, + { make_floatx80(0x3ffe, 0x8f005d5ef7f59f9bULL), + make_floatx80(0x3fbd, 0xb906bc2ccb886e90ULL) }, + { make_floatx80(0x3ffe, 0xa4bc7d1934f70924ULL), + make_floatx80(0x3fbb, 0xcd43f9522bed64f8ULL) }, + { make_floatx80(0x3ffe, 0xb8053e2bc2319e74ULL), + make_floatx80(0xbfbc, 0xd3496ab7bd6eef0cULL) }, + { make_floatx80(0x3ffe, 0xc90fdaa22168c235ULL), + make_floatx80(0xbfbc, 0xece675d1fc8f8cbcULL) }, +}; + void helper_fpatan(CPUX86State *env) { - double fptemp, fpsrcop; + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0, &env->fp_status); + } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1, &env->fp_status); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (floatx80_is_zero(ST1) && !arg0_sign) { + /* Pass this zero through. 
*/ + } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) || + arg0_exp - arg1_exp >= 80) && + !arg0_sign) { + /* + * Dividing ST1 by ST0 gives the correct result up to + * rounding, and avoids spurious underflow exceptions that + * might result from passing some small values through the + * polynomial approximation, but if a finite nonzero result of + * division is exact, the result of fpatan is still inexact + * (and underflowing where appropriate). + */ + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.floatx80_rounding_precision = 80; + ST1 = floatx80_div(ST1, ST0, &env->fp_status); + env->fp_status.floatx80_rounding_precision = save_prec; + if (!floatx80_is_zero(ST1) && + !(get_float_exception_flags(&env->fp_status) & + float_flag_inexact)) { + /* + * The mathematical result is very slightly closer to zero + * than this exact result. Round a value with the + * significand adjusted accordingly to get the correct + * exceptions, and possibly an adjusted result depending + * on the rounding mode. + */ + uint64_t sig = extractFloatx80Frac(ST1); + int32_t exp = extractFloatx80Exp(ST1); + bool sign = extractFloatx80Sign(ST1); + if (exp == 0) { + normalizeFloatx80Subnormal(sig, &exp, &sig); + } + ST1 = normalizeRoundAndPackFloatx80(80, sign, exp, sig - 1, + -1, &env->fp_status); + } + } else { + /* The result is inexact. */ + bool rsign = arg1_sign; + int32_t rexp; + uint64_t rsig0, rsig1; + if (floatx80_is_zero(ST1)) { + /* + * ST0 is negative. The result is pi with the sign of + * ST1. + */ + rexp = pi_exp; + rsig0 = pi_sig_high; + rsig1 = pi_sig_low; + } else if (floatx80_is_infinity(ST1)) { + if (floatx80_is_infinity(ST0)) { + if (arg0_sign) { + rexp = pi_34_exp; + rsig0 = pi_34_sig_high; + rsig1 = pi_34_sig_low; + } else { + rexp = pi_4_exp; + rsig0 = pi_4_sig_high; + rsig1 = pi_4_sig_low; + } + } else { + rexp = pi_2_exp; + rsig0 = pi_2_sig_high; + rsig1 = pi_2_sig_low; + } + } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) { + rexp = pi_2_exp; + rsig0 = pi_2_sig_high; + rsig1 = pi_2_sig_low; + } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) { + /* ST0 is negative. */ + rexp = pi_exp; + rsig0 = pi_sig_high; + rsig1 = pi_sig_low; + } else { + /* + * ST0 and ST1 are finite, nonzero and with exponents not + * too far apart. + */ + int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp; + int32_t azexp, axexp; + bool adj_sub, ysign, zsign; + uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1; + uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2; + uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1; + uint64_t azsig0, azsig1; + uint64_t azsig2, azsig3, axsig0, axsig1; + floatx80 x8; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = 80; + + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); + } + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + if (arg0_exp > arg1_exp || + (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) { + /* Work with abs(ST1) / abs(ST0). */ + num_exp = arg1_exp; + num_sig = arg1_sig; + den_exp = arg0_exp; + den_sig = arg0_sig; + if (arg0_sign) { + /* The result is subtracted from pi. 
*/ + adj_exp = pi_exp; + adj_sig0 = pi_sig_high; + adj_sig1 = pi_sig_low; + adj_sub = true; + } else { + /* The result is used as-is. */ + adj_exp = 0; + adj_sig0 = 0; + adj_sig1 = 0; + adj_sub = false; + } + } else { + /* Work with abs(ST0) / abs(ST1). */ + num_exp = arg0_exp; + num_sig = arg0_sig; + den_exp = arg1_exp; + den_sig = arg1_sig; + /* The result is added to or subtracted from pi/2. */ + adj_exp = pi_2_exp; + adj_sig0 = pi_2_sig_high; + adj_sig1 = pi_2_sig_low; + adj_sub = !arg0_sign; + } + + /* + * Compute x = num/den, where 0 < x <= 1 and x is not too + * small. + */ + xexp = num_exp - den_exp + 0x3ffe; + remsig0 = num_sig; + remsig1 = 0; + if (den_sig <= remsig0) { + shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1); + ++xexp; + } + xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig); + mul64To128(den_sig, xsig0, &msig0, &msig1); + sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1); + while ((int64_t) remsig0 < 0) { + --xsig0; + add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1); + } + xsig1 = estimateDiv128To64(remsig1, 0, den_sig); + /* + * No need to correct any estimation error in xsig1; even + * with such error, it is accurate enough. + */ + + /* + * Split x as x = t + y, where t = n/8 is the nearest + * multiple of 1/8 to x. + */ + x8 = normalizeRoundAndPackFloatx80(80, false, xexp + 3, xsig0, + xsig1, &env->fp_status); + n = floatx80_to_int32(x8, &env->fp_status); + if (n == 0) { + ysign = false; + yexp = xexp; + ysig0 = xsig0; + ysig1 = xsig1; + texp = 0; + tsig = 0; + } else { + int shift = clz32(n) + 32; + texp = 0x403b - shift; + tsig = n; + tsig <<= shift; + if (texp == xexp) { + sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1); + if ((int64_t) ysig0 >= 0) { + ysign = false; + if (ysig0 == 0) { + if (ysig1 == 0) { + yexp = 0; + } else { + shift = clz64(ysig1) + 64; + yexp = xexp - shift; + shift128Left(ysig0, ysig1, shift, + &ysig0, &ysig1); + } + } else { + shift = clz64(ysig0); + yexp = xexp - shift; + shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1); + } + } else { + ysign = true; + sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1); + if (ysig0 == 0) { + shift = clz64(ysig1) + 64; + } else { + shift = clz64(ysig0); + } + yexp = xexp - shift; + shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1); + } + } else { + /* + * t's exponent must be greater than x's because t + * is positive and the nearest multiple of 1/8 to + * x, and if x has a greater exponent, the power + * of 2 with that exponent is also a multiple of + * 1/8. + */ + uint64_t usig0, usig1; + shift128RightJamming(xsig0, xsig1, texp - xexp, + &usig0, &usig1); + ysign = true; + sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1); + if (ysig0 == 0) { + shift = clz64(ysig1) + 64; + } else { + shift = clz64(ysig0); + } + yexp = texp - shift; + shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1); + } + } + + /* + * Compute z = y/(1+tx), so arctan(x) = arctan(t) + + * arctan(z). + */ + zsign = ysign; + if (texp == 0 || yexp == 0) { + zexp = yexp; + zsig0 = ysig0; + zsig1 = ysig1; + } else { + /* + * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1. + */ + int32_t dexp = texp + xexp - 0x3ffe; + uint64_t dsig0, dsig1, dsig2; + mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2); + /* + * dexp <= 0x3fff (and if equal, dsig0 has a leading 0 + * bit). Add 1 to produce the denominator 1+tx. 
+ */ + shift128RightJamming(dsig0, dsig1, 0x3fff - dexp, + &dsig0, &dsig1); + dsig0 |= 0x8000000000000000ULL; + zexp = yexp - 1; + remsig0 = ysig0; + remsig1 = ysig1; + remsig2 = 0; + if (dsig0 <= remsig0) { + shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1); + ++zexp; + } + zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0); + mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2); + sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2, + &remsig0, &remsig1, &remsig2); + while ((int64_t) remsig0 < 0) { + --zsig0; + add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1, + &remsig0, &remsig1, &remsig2); + } + zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0); + /* No need to correct any estimation error in zsig1. */ + } + + if (zexp == 0) { + azexp = 0; + azsig0 = 0; + azsig1 = 0; + } else { + floatx80 z2, accum; + uint64_t z2sig0, z2sig1, z2sig2, z2sig3; + /* Compute z^2. */ + mul128To256(zsig0, zsig1, zsig0, zsig1, + &z2sig0, &z2sig1, &z2sig2, &z2sig3); + z2 = normalizeRoundAndPackFloatx80(80, false, + zexp + zexp - 0x3ffe, + z2sig0, z2sig1, + &env->fp_status); + + /* Compute the lower parts of the polynomial expansion. */ + accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status); + accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status); + accum = floatx80_mul(accum, z2, &env->fp_status); + accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status); + accum = floatx80_mul(accum, z2, &env->fp_status); + accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status); + accum = floatx80_mul(accum, z2, &env->fp_status); + accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status); + accum = floatx80_mul(accum, z2, &env->fp_status); + accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status); + accum = floatx80_mul(accum, z2, &env->fp_status); + + /* + * The full polynomial expansion is z*(fpatan_coeff_0 + accum). + * fpatan_coeff_0 is 1, and accum is negative and much smaller. + */ + aexp = extractFloatx80Exp(fpatan_coeff_0); + shift128RightJamming(extractFloatx80Frac(accum), 0, + aexp - extractFloatx80Exp(accum), + &asig0, &asig1); + sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1, + &asig0, &asig1); + /* Multiply by z to compute arctan(z). */ + azexp = aexp + zexp - 0x3ffe; + mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1, + &azsig2, &azsig3); + } + + /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign). */ + if (texp == 0) { + /* z is positive. 
*/ + axexp = azexp; + axsig0 = azsig0; + axsig1 = azsig1; + } else { + bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low); + int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low); + uint64_t low_sig0 = + extractFloatx80Frac(fpatan_table[n].atan_low); + uint64_t low_sig1 = 0; + axexp = extractFloatx80Exp(fpatan_table[n].atan_high); + axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high); + axsig1 = 0; + shift128RightJamming(low_sig0, low_sig1, axexp - low_exp, + &low_sig0, &low_sig1); + if (low_sign) { + sub128(axsig0, axsig1, low_sig0, low_sig1, + &axsig0, &axsig1); + } else { + add128(axsig0, axsig1, low_sig0, low_sig1, + &axsig0, &axsig1); + } + if (azexp >= axexp) { + shift128RightJamming(axsig0, axsig1, azexp - axexp + 1, + &axsig0, &axsig1); + axexp = azexp + 1; + shift128RightJamming(azsig0, azsig1, 1, + &azsig0, &azsig1); + } else { + shift128RightJamming(axsig0, axsig1, 1, + &axsig0, &axsig1); + shift128RightJamming(azsig0, azsig1, axexp - azexp + 1, + &azsig0, &azsig1); + ++axexp; + } + if (zsign) { + sub128(axsig0, axsig1, azsig0, azsig1, + &axsig0, &axsig1); + } else { + add128(axsig0, axsig1, azsig0, azsig1, + &axsig0, &axsig1); + } + } + + if (adj_exp == 0) { + rexp = axexp; + rsig0 = axsig0; + rsig1 = axsig1; + } else { + /* + * Add or subtract arctan(x) (exponent axexp, + * significand axsig0 and axsig1, positive, not + * necessarily normalized) to the number given by + * adj_exp, adj_sig0 and adj_sig1, according to + * adj_sub. + */ + if (adj_exp >= axexp) { + shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1, + &axsig0, &axsig1); + rexp = adj_exp + 1; + shift128RightJamming(adj_sig0, adj_sig1, 1, + &adj_sig0, &adj_sig1); + } else { + shift128RightJamming(axsig0, axsig1, 1, + &axsig0, &axsig1); + shift128RightJamming(adj_sig0, adj_sig1, + axexp - adj_exp + 1, + &adj_sig0, &adj_sig1); + rexp = axexp + 1; + } + if (adj_sub) { + sub128(adj_sig0, adj_sig1, axsig0, axsig1, + &rsig0, &rsig1); + } else { + add128(adj_sig0, adj_sig1, axsig0, axsig1, + &rsig0, &rsig1); + } + } + + env->fp_status.float_rounding_mode = save_mode; + env->fp_status.floatx80_rounding_precision = save_prec; + } + /* This result is inexact. */ + rsig1 |= 1; + ST1 = normalizeRoundAndPackFloatx80(80, rsign, rexp, + rsig0, rsig1, &env->fp_status); + } - fpsrcop = floatx80_to_double(env, ST1); - fptemp = floatx80_to_double(env, ST0); - ST1 = double_to_floatx80(env, atan2(fpsrcop, fptemp)); fpop(env); + merge_exception_flags(env, old_flags); } void helper_fxtract(CPUX86State *env) @@ -934,139 +1777,438 @@ void helper_fxtract(CPUX86State *env) merge_exception_flags(env, old_flags); } -void helper_fprem1(CPUX86State *env) +static void helper_fprem_common(CPUX86State *env, bool mod) { - double st0, st1, dblq, fpsrcop, fptemp; - CPU_LDoubleU fpsrcop1, fptemp1; - int expdif; - signed long long int q; - - st0 = floatx80_to_double(env, ST0); - st1 = floatx80_to_double(env, ST1); - - if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) { - ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */ - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - return; - } - - fpsrcop = st0; - fptemp = st1; - fpsrcop1.d = ST0; - fptemp1.d = ST1; - expdif = EXPD(fpsrcop1) - EXPD(fptemp1); - - if (expdif < 0) { - /* optimisation? 
taken from the AMD docs */ - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - /* ST0 is unchanged */ - return; - } + uint8_t old_flags = save_exception_flags(env); + uint64_t quotient; + CPU_LDoubleU temp0, temp1; + int exp0, exp1, expdiff; - if (expdif < 53) { - dblq = fpsrcop / fptemp; - /* round dblq towards nearest integer */ - dblq = rint(dblq); - st0 = fpsrcop - fptemp * dblq; + temp0.d = ST0; + temp1.d = ST1; + exp0 = EXPD(temp0); + exp1 = EXPD(temp1); - /* convert dblq to q by truncating towards zero */ - if (dblq < 0.0) { - q = (signed long long int)(-dblq); + env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ + if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) || + exp0 == 0x7fff || exp1 == 0x7fff || + floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) { + ST0 = floatx80_modrem(ST0, ST1, mod, "ient, &env->fp_status); + } else { + if (exp0 == 0) { + exp0 = 1 - clz64(temp0.l.lower); + } + if (exp1 == 0) { + exp1 = 1 - clz64(temp1.l.lower); + } + expdiff = exp0 - exp1; + if (expdiff < 64) { + ST0 = floatx80_modrem(ST0, ST1, mod, "ient, &env->fp_status); + env->fpus |= (quotient & 0x4) << (8 - 2); /* (C0) <-- q2 */ + env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */ + env->fpus |= (quotient & 0x1) << (9 - 0); /* (C1) <-- q0 */ } else { - q = (signed long long int)dblq; + /* + * Partial remainder. This choice of how many bits to + * process at once is specified in AMD instruction set + * manuals, and empirically is followed by Intel + * processors as well; it ensures that the final remainder + * operation in a loop does produce the correct low three + * bits of the quotient. AMD manuals specify that the + * flags other than C2 are cleared, and empirically Intel + * processors clear them as well. + */ + int n = 32 + (expdiff % 32); + temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status); + ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status); + env->fpus |= 0x400; /* C2 <-- 1 */ } - - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - /* (C0,C3,C1) <-- (q2,q1,q0) */ - env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ - env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ - env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ - } else { - env->fpus |= 0x400; /* C2 <-- 1 */ - fptemp = pow(2.0, expdif - 50); - fpsrcop = (st0 / st1) / fptemp; - /* fpsrcop = integer obtained by chopping */ - fpsrcop = (fpsrcop < 0.0) ? - -(floor(fabs(fpsrcop))) : floor(fpsrcop); - st0 -= (st1 * fpsrcop * fptemp); } - ST0 = double_to_floatx80(env, st0); + merge_exception_flags(env, old_flags); +} + +void helper_fprem1(CPUX86State *env) +{ + helper_fprem_common(env, false); } void helper_fprem(CPUX86State *env) { - double st0, st1, dblq, fpsrcop, fptemp; - CPU_LDoubleU fpsrcop1, fptemp1; - int expdif; - signed long long int q; + helper_fprem_common(env, true); +} - st0 = floatx80_to_double(env, ST0); - st1 = floatx80_to_double(env, ST1); +/* 128-bit significand of log2(e). */ +#define log2_e_sig_high 0xb8aa3b295c17f0bbULL +#define log2_e_sig_low 0xbe87fed0691d3e89ULL - if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) { - ST0 = double_to_floatx80(env, 0.0 / 0.0); /* NaN */ - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - return; - } +/* + * Polynomial coefficients for an approximation to log2((1+x)/(1-x)), + * with only odd powers of x used, for x in the interval [2*sqrt(2)-3, + * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the + * interval [sqrt(2)/2, sqrt(2)]. 
+ */ +#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL) +#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL) +#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL) +#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL) +#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL) +#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL) +#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL) +#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL) +#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL) +#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL) +#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL) - fpsrcop = st0; - fptemp = st1; - fpsrcop1.d = ST0; - fptemp1.d = ST1; - expdif = EXPD(fpsrcop1) - EXPD(fptemp1); +/* + * Compute an approximation of log2(1+arg), where 1+arg is in the + * interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this + * function is called, rounding precision is set to 80 and the + * round-to-nearest mode is in effect. arg must not be exactly zero, + * and must not be so close to zero that underflow might occur. + */ +static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp, + uint64_t *sig0, uint64_t *sig1) +{ + uint64_t arg0_sig = extractFloatx80Frac(arg); + int32_t arg0_exp = extractFloatx80Exp(arg); + bool arg0_sign = extractFloatx80Sign(arg); + bool asign; + int32_t dexp, texp, aexp; + uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2; + uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3; + uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1; + floatx80 t2, accum; - if (expdif < 0) { - /* optimisation? taken from the AMD docs */ - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - /* ST0 is unchanged */ - return; - } + /* + * Compute an approximation of arg/(2+arg), with extra precision, + * as the argument to a polynomial approximation. The extra + * precision is only needed for the first term of the + * approximation, with subsequent terms being significantly + * smaller; the approximation only uses odd exponents, and the + * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029.... + */ + if (arg0_sign) { + dexp = 0x3fff; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1); + } else { + dexp = 0x4000; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + dsig0 |= 0x8000000000000000ULL; + } + texp = arg0_exp - dexp + 0x3ffe; + rsig0 = arg0_sig; + rsig1 = 0; + rsig2 = 0; + if (dsig0 <= rsig0) { + shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1); + ++texp; + } + tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0); + mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2); + sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2, + &rsig0, &rsig1, &rsig2); + while ((int64_t) rsig0 < 0) { + --tsig0; + add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1, + &rsig0, &rsig1, &rsig2); + } + tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0); + /* + * No need to correct any estimation error in tsig1; even with + * such error, it is accurate enough. Now compute the square of + * that approximation. + */ + mul128To256(tsig0, tsig1, tsig0, tsig1, + &t2sig0, &t2sig1, &t2sig2, &t2sig3); + t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe, + t2sig0, t2sig1, &env->fp_status); + + /* Compute the lower parts of the polynomial expansion. 
*/ + accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status); - if (expdif < 53) { - dblq = fpsrcop / fptemp; /* ST0 / ST1 */ - /* round dblq towards zero */ - dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq); - st0 = fpsrcop - fptemp * dblq; /* fpsrcop is ST0 */ + /* + * The full polynomial expansion is fyl2x_coeff_0 + accum (where + * accum has much lower magnitude, and so, in particular, carry + * out of the addition is not possible), multiplied by t. (This + * expansion is only accurate to about 70 bits, not 128 bits.) + */ + aexp = extractFloatx80Exp(fyl2x_coeff_0); + asign = extractFloatx80Sign(fyl2x_coeff_0); + shift128RightJamming(extractFloatx80Frac(accum), 0, + aexp - extractFloatx80Exp(accum), + &asig0, &asig1); + bsig0 = extractFloatx80Frac(fyl2x_coeff_0); + bsig1 = 0; + if (asign == extractFloatx80Sign(accum)) { + add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } else { + sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } + /* Multiply by t to compute the required result. */ + mul128To256(asig0, asig1, tsig0, tsig1, + &asig0, &asig1, &asig2, &asig3); + aexp += texp - 0x3ffe; + *exp = aexp; + *sig0 = asig0; + *sig1 = asig1; +} - /* convert dblq to q by truncating towards zero */ - if (dblq < 0.0) { - q = (signed long long int)(-dblq); - } else { - q = (signed long long int)dblq; +void helper_fyl2xp1(CPUX86State *env) +{ + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0, &env->fp_status); + } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1, &env->fp_status); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (arg0_exp > 0x3ffd || + (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ? 
+ 0x95f619980c4336f7ULL : + 0xd413cccfe7799211ULL))) { + /* + * Out of range for the instruction (ST0 must have absolute + * value less than 1 - sqrt(2)/2 = 0.292..., according to + * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1 + * to sqrt(2) - 1, which we allow here), treat as invalid. + */ + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) || + arg1_exp == 0x7fff) { + /* + * One argument is zero, or multiplying by infinity; correct + * result is exact and can be obtained by multiplying the + * arguments. + */ + ST1 = floatx80_mul(ST0, ST1, &env->fp_status); + } else if (arg0_exp < 0x3fb0) { + /* + * Multiplying both arguments and an extra-precision version + * of log2(e) is sufficiently precise. + */ + uint64_t sig0, sig1, sig2; + int32_t exp; + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); } - - env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */ - /* (C0,C3,C1) <-- (q2,q1,q0) */ - env->fpus |= (q & 0x4) << (8 - 2); /* (C0) <-- q2 */ - env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */ - env->fpus |= (q & 0x1) << (9 - 0); /* (C1) <-- q0 */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig, + &sig0, &sig1, &sig2); + exp = arg0_exp + 1; + mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2); + exp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + sig1 |= 1; + ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp, + sig0, sig1, &env->fp_status); } else { - int N = 32 + (expdif % 32); /* as per AMD docs */ + int32_t aexp; + uint64_t asig0, asig1, asig2; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = 80; - env->fpus |= 0x400; /* C2 <-- 1 */ - fptemp = pow(2.0, (double)(expdif - N)); - fpsrcop = (st0 / st1) / fptemp; - /* fpsrcop = integer obtained by chopping */ - fpsrcop = (fpsrcop < 0.0) ? - -(floor(fabs(fpsrcop))) : floor(fpsrcop); - st0 -= (st1 * fpsrcop * fptemp); + helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1); + /* + * Multiply by the second argument to compute the required + * result. + */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2); + aexp += arg1_exp - 0x3ffe; + /* This result is inexact. 
*/ + asig1 |= 1; + env->fp_status.float_rounding_mode = save_mode; + ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp, + asig0, asig1, &env->fp_status); + env->fp_status.floatx80_rounding_precision = save_prec; } - ST0 = double_to_floatx80(env, st0); + fpop(env); + merge_exception_flags(env, old_flags); } -void helper_fyl2xp1(CPUX86State *env) +void helper_fyl2x(CPUX86State *env) { - double fptemp = floatx80_to_double(env, ST0); - - if ((fptemp + 1.0) > 0.0) { - fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */ - fptemp *= floatx80_to_double(env, ST1); - ST1 = double_to_floatx80(env, fptemp); - fpop(env); + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0, &env->fp_status); + } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1, &env->fp_status); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (arg0_sign && !floatx80_is_zero(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_infinity(ST1)) { + FloatRelation cmp = floatx80_compare(ST0, floatx80_one, + &env->fp_status); + switch (cmp) { + case float_relation_less: + ST1 = floatx80_chs(ST1); + break; + case float_relation_greater: + /* Result is infinity of the same sign as ST1. */ + break; + default: + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + break; + } + } else if (floatx80_is_infinity(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (arg1_sign) { + ST1 = floatx80_chs(ST0); + } else { + ST1 = ST0; + } + } else if (floatx80_is_zero(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else { + /* Result is infinity with opposite sign to ST1. */ + float_raise(float_flag_divbyzero, &env->fp_status); + ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff, + 0x8000000000000000ULL); + } + } else if (floatx80_is_zero(ST1)) { + if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) { + ST1 = floatx80_chs(ST1); + } + /* Otherwise, ST1 is already the correct result. 
*/ + } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) { + if (arg1_sign) { + ST1 = floatx80_chs(floatx80_zero); + } else { + ST1 = floatx80_zero; + } } else { - env->fpus &= ~0x4700; - env->fpus |= 0x400; + int32_t int_exp; + floatx80 arg0_m1; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = 80; + + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); + } + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + int_exp = arg0_exp - 0x3fff; + if (arg0_sig > 0xb504f333f9de6484ULL) { + ++int_exp; + } + arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp, + &env->fp_status), + floatx80_one, &env->fp_status); + if (floatx80_is_zero(arg0_m1)) { + /* Exact power of 2; multiply by ST1. */ + env->fp_status.float_rounding_mode = save_mode; + ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status), + ST1, &env->fp_status); + } else { + bool asign = extractFloatx80Sign(arg0_m1); + int32_t aexp; + uint64_t asig0, asig1, asig2; + helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1); + if (int_exp != 0) { + bool isign = (int_exp < 0); + int32_t iexp; + uint64_t isig; + int shift; + int_exp = isign ? -int_exp : int_exp; + shift = clz32(int_exp) + 32; + isig = int_exp; + isig <<= shift; + iexp = 0x403e - shift; + shift128RightJamming(asig0, asig1, iexp - aexp, + &asig0, &asig1); + if (asign == isign) { + add128(isig, 0, asig0, asig1, &asig0, &asig1); + } else { + sub128(isig, 0, asig0, asig1, &asig0, &asig1); + } + aexp = iexp; + asign = isign; + } + /* + * Multiply by the second argument to compute the required + * result. + */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2); + aexp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + asig1 |= 1; + env->fp_status.float_rounding_mode = save_mode; + ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp, + asig0, asig1, &env->fp_status); + } + + env->fp_status.floatx80_rounding_precision = save_prec; } + fpop(env); + merge_exception_flags(env, old_flags); } void helper_fsqrt(CPUX86State *env) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index b3c13cb898..6adbff3d74 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -740,26 +740,62 @@ static bool hyperv_enabled(X86CPU *cpu) cpu->hyperv_features || cpu->hyperv_passthrough); } +/* + * Check whether target_freq is within conservative + * ntp correctable bounds (250ppm) of freq + */ +static inline bool freq_within_bounds(int freq, int target_freq) +{ + int max_freq = freq + (freq * 250 / 1000000); + int min_freq = freq - (freq * 250 / 1000000); + + if (target_freq >= min_freq && target_freq <= max_freq) { + return true; + } + + return false; +} + static int kvm_arch_set_tsc_khz(CPUState *cs) { X86CPU *cpu = X86_CPU(cs); CPUX86State *env = &cpu->env; - int r; + int r, cur_freq; + bool set_ioctl = false; if (!env->tsc_khz) { return 0; } - r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ? + cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? + kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP; + + /* + * If TSC scaling is supported, attempt to set TSC frequency. 
+ */ + if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) { + set_ioctl = true; + } + + /* + * If desired TSC frequency is within bounds of NTP correction, + * attempt to set TSC frequency. + */ + if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) { + set_ioctl = true; + } + + r = set_ioctl ? kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : -ENOTSUP; + if (r < 0) { /* When KVM_SET_TSC_KHZ fails, it's an error only if the current * TSC frequency doesn't match the one we want. */ - int cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? - kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : - -ENOTSUP; + cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? + kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : + -ENOTSUP; if (cur_freq <= 0 || cur_freq != env->tsc_khz) { warn_report("TSC frequency mismatch between " "VM (%" PRId64 " kHz) and host (%d kHz), " diff --git a/target/m68k/softfloat.c b/target/m68k/softfloat.c index 9f120cf15e..b6d0ed7acf 100644 --- a/target/m68k/softfloat.c +++ b/target/m68k/softfloat.c @@ -43,89 +43,6 @@ static floatx80 propagateFloatx80NaNOneArg(floatx80 a, float_status *status) } /* - * Returns the modulo remainder of the extended double-precision floating-point - * value `a' with respect to the corresponding value `b'. - */ - -floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status) -{ - bool aSign, zSign; - int32_t aExp, bExp, expDiff; - uint64_t aSig0, aSig1, bSig; - uint64_t qTemp, term0, term1; - - aSig0 = extractFloatx80Frac(a); - aExp = extractFloatx80Exp(a); - aSign = extractFloatx80Sign(a); - bSig = extractFloatx80Frac(b); - bExp = extractFloatx80Exp(b); - - if (aExp == 0x7FFF) { - if ((uint64_t) (aSig0 << 1) - || ((bExp == 0x7FFF) && (uint64_t) (bSig << 1))) { - return propagateFloatx80NaN(a, b, status); - } - goto invalid; - } - if (bExp == 0x7FFF) { - if ((uint64_t) (bSig << 1)) { - return propagateFloatx80NaN(a, b, status); - } - return a; - } - if (bExp == 0) { - if (bSig == 0) { - invalid: - float_raise(float_flag_invalid, status); - return floatx80_default_nan(status); - } - normalizeFloatx80Subnormal(bSig, &bExp, &bSig); - } - if (aExp == 0) { - if ((uint64_t) (aSig0 << 1) == 0) { - return a; - } - normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); - } - bSig |= UINT64_C(0x8000000000000000); - zSign = aSign; - expDiff = aExp - bExp; - aSig1 = 0; - if (expDiff < 0) { - return a; - } - qTemp = (bSig <= aSig0); - if (qTemp) { - aSig0 -= bSig; - } - expDiff -= 64; - while (0 < expDiff) { - qTemp = estimateDiv128To64(aSig0, aSig1, bSig); - qTemp = (2 < qTemp) ? qTemp - 2 : 0; - mul64To128(bSig, qTemp, &term0, &term1); - sub128(aSig0, aSig1, term0, term1, &aSig0, &aSig1); - shortShift128Left(aSig0, aSig1, 62, &aSig0, &aSig1); - expDiff -= 62; - } - expDiff += 64; - if (0 < expDiff) { - qTemp = estimateDiv128To64(aSig0, aSig1, bSig); - qTemp = (2 < qTemp) ? qTemp - 2 : 0; - qTemp >>= 64 - expDiff; - mul64To128(bSig, qTemp << (64 - expDiff), &term0, &term1); - sub128(aSig0, aSig1, term0, term1, &aSig0, &aSig1); - shortShift128Left(0, bSig, 64 - expDiff, &term0, &term1); - while (le128(term0, term1, aSig0, aSig1)) { - ++qTemp; - sub128(aSig0, aSig1, term0, term1, &aSig0, &aSig1); - } - } - return - normalizeRoundAndPackFloatx80( - 80, zSign, bExp + expDiff, aSig0, aSig1, status); -} - -/* * Returns the mantissa of the extended double-precision floating-point * value `a'. 
*/ diff --git a/target/m68k/softfloat.h b/target/m68k/softfloat.h index 365ef6ac7a..4bb9567134 100644 --- a/target/m68k/softfloat.h +++ b/target/m68k/softfloat.h @@ -23,7 +23,6 @@ #define TARGET_M68K_SOFTFLOAT_H #include "fpu/softfloat.h" -floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status); floatx80 floatx80_getman(floatx80 a, float_status *status); floatx80 floatx80_getexp(floatx80 a, float_status *status); floatx80 floatx80_scale(floatx80 a, floatx80 b, float_status *status); diff --git a/target/mips/kvm.c b/target/mips/kvm.c index 96cfa10cf2..72637a1e02 100644 --- a/target/mips/kvm.c +++ b/target/mips/kvm.c @@ -21,10 +21,12 @@ #include "qemu/main-loop.h" #include "qemu/timer.h" #include "sysemu/kvm.h" +#include "sysemu/kvm_int.h" #include "sysemu/runstate.h" #include "sysemu/cpus.h" #include "kvm_mips.h" #include "exec/memattrs.h" +#include "hw/boards.h" #define DEBUG_KVM 0 @@ -1270,3 +1272,27 @@ int kvm_arch_msi_data_to_gsi(uint32_t data) { abort(); } + +int mips_kvm_type(MachineState *machine, const char *vm_type) +{ +#if defined(KVM_CAP_MIPS_VZ) || defined(KVM_CAP_MIPS_TE) + int r; + KVMState *s = KVM_STATE(machine->accelerator); +#endif + +#if defined(KVM_CAP_MIPS_VZ) + r = kvm_check_extension(s, KVM_CAP_MIPS_VZ); + if (r > 0) { + return KVM_VM_MIPS_VZ; + } +#endif + +#if defined(KVM_CAP_MIPS_TE) + r = kvm_check_extension(s, KVM_CAP_MIPS_TE); + if (r > 0) { + return KVM_VM_MIPS_TE; + } +#endif + + return -1; +} diff --git a/target/mips/kvm_mips.h b/target/mips/kvm_mips.h index 1e4014792d..171d53dbe1 100644 --- a/target/mips/kvm_mips.h +++ b/target/mips/kvm_mips.h @@ -12,6 +12,8 @@ #ifndef KVM_MIPS_H #define KVM_MIPS_H +#include "cpu.h" + /** * kvm_mips_reset_vcpu: * @cpu: MIPSCPU @@ -23,4 +25,13 @@ void kvm_mips_reset_vcpu(MIPSCPU *cpu); int kvm_mips_set_interrupt(MIPSCPU *cpu, int irq, int level); int kvm_mips_set_ipi_interrupt(MIPSCPU *cpu, int irq, int level); +#ifdef CONFIG_KVM +int mips_kvm_type(MachineState *machine, const char *vm_type); +#else +static inline int mips_kvm_type(MachineState *machine, const char *vm_type) +{ + return 0; +} +#endif + #endif /* KVM_MIPS_H */ diff --git a/target/ppc/translate_init.inc.c b/target/ppc/translate_init.inc.c index a40888411c..49212bfd90 100644 --- a/target/ppc/translate_init.inc.c +++ b/target/ppc/translate_init.inc.c @@ -9086,11 +9086,6 @@ static void init_proc_POWER10(CPUPPCState *env) gen_spr_power8_rpr(env); gen_spr_power9_mmu(env); - /* POWER9 Specific registers */ - spr_register_kvm(env, SPR_TIDR, "TIDR", NULL, NULL, - spr_read_generic, spr_write_generic, - KVM_REG_PPC_TIDR, 0); - /* FIXME: Filter fields properly based on privilege level */ spr_register_kvm_hv(env, SPR_PSSCR, "PSSCR", NULL, NULL, NULL, NULL, spr_read_generic, spr_write_generic, diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h index 7a46dccbe1..32749378bf 100644 --- a/target/xtensa/cpu.h +++ b/target/xtensa/cpu.h @@ -464,6 +464,7 @@ struct XtensaConfig { XtensaMemory sysrom; XtensaMemory sysram; + unsigned hw_version; uint32_t configid[2]; void *isa_internal; diff --git a/target/xtensa/overlay_tool.h b/target/xtensa/overlay_tool.h index cab532095c..a994e69b6e 100644 --- a/target/xtensa/overlay_tool.h +++ b/target/xtensa/overlay_tool.h @@ -60,8 +60,9 @@ #define XCHAL_RESET_VECTOR1_VADDR XCHAL_RESET_VECTOR_VADDR #endif -#ifndef XCHAL_HW_MIN_VERSION -#define XCHAL_HW_MIN_VERSION 0 +#ifndef XCHAL_HW_VERSION +#define XCHAL_HW_VERSION (XCHAL_HW_VERSION_MAJOR * 100 \ + + XCHAL_HW_VERSION_MINOR) #endif #ifndef XCHAL_LOOP_BUFFER_SIZE @@ -100,7 +101,7 
@@ XCHAL_OPTION(XCHAL_HAVE_FP, XTENSA_OPTION_FP_COPROCESSOR) | \ XCHAL_OPTION(XCHAL_HAVE_RELEASE_SYNC, XTENSA_OPTION_MP_SYNCHRO) | \ XCHAL_OPTION(XCHAL_HAVE_S32C1I, XTENSA_OPTION_CONDITIONAL_STORE) | \ - XCHAL_OPTION(((XCHAL_HAVE_S32C1I && XCHAL_HW_MIN_VERSION >= 230000) || \ + XCHAL_OPTION(((XCHAL_HAVE_S32C1I && XCHAL_HW_VERSION >= 230000) || \ XCHAL_HAVE_EXCLUSIVE), XTENSA_OPTION_ATOMCTL) | \ XCHAL_OPTION(XCHAL_HAVE_DEPBITS, XTENSA_OPTION_DEPBITS) | \ /* Interrupts and exceptions */ \ @@ -498,6 +499,7 @@ } #define CONFIG_SECTION \ + .hw_version = XCHAL_HW_VERSION, \ .configid = { \ XCHAL_HW_CONFIGID0, \ XCHAL_HW_CONFIGID1, \ diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c index e0beaf7abb..6346b2eef0 100644 --- a/target/xtensa/translate.c +++ b/target/xtensa/translate.c @@ -595,9 +595,6 @@ static int gen_postprocess(DisasContext *dc, int slot) gen_io_start(); } gen_helper_check_interrupts(cpu_env); - if (tb_cflags(dc->base.tb) & CF_USE_ICOUNT) { - gen_io_end(); - } } #endif if (op_flags & XTENSA_OP_SYNC_REGISTER_WINDOW) { @@ -2191,7 +2188,11 @@ static void translate_rsil(DisasContext *dc, const OpcodeArg arg[], static void translate_rsr(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); + if (sr_name[par[0]]) { + tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); + } else { + tcg_gen_movi_i32(arg[0].out, 0); + } } static void translate_rsr_ccount(DisasContext *dc, const OpcodeArg arg[], @@ -2363,9 +2364,10 @@ static bool test_ill_simcall(DisasContext *dc, const OpcodeArg arg[], #ifdef CONFIG_USER_ONLY bool ill = true; #else - bool ill = !semihosting_enabled(); + /* Between RE.2 and RE.3 simcall opcode's become nop for the hardware. */ + bool ill = dc->config->hw_version <= 250002 && !semihosting_enabled(); #endif - if (ill) { + if (ill || !semihosting_enabled()) { qemu_log_mask(LOG_GUEST_ERROR, "SIMCALL but semihosting is disabled\n"); } return ill; @@ -2375,7 +2377,9 @@ static void translate_simcall(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { #ifndef CONFIG_USER_ONLY - gen_helper_simcall(cpu_env); + if (semihosting_enabled()) { + gen_helper_simcall(cpu_env); + } #endif } @@ -2563,13 +2567,17 @@ static void translate_wrmsk_expstate(DisasContext *dc, const OpcodeArg arg[], static void translate_wsr(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - tcg_gen_mov_i32(cpu_SR[par[0]], arg[0].in); + if (sr_name[par[0]]) { + tcg_gen_mov_i32(cpu_SR[par[0]], arg[0].in); + } } static void translate_wsr_mask(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - tcg_gen_andi_i32(cpu_SR[par[0]], arg[0].in, par[2]); + if (sr_name[par[0]]) { + tcg_gen_andi_i32(cpu_SR[par[0]], arg[0].in, par[2]); + } } static void translate_wsr_acchi(DisasContext *dc, const OpcodeArg arg[], @@ -2775,23 +2783,31 @@ static void translate_xor(DisasContext *dc, const OpcodeArg arg[], static void translate_xsr(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - TCGv_i32 tmp = tcg_temp_new_i32(); + if (sr_name[par[0]]) { + TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_mov_i32(tmp, arg[0].in); - tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); - tcg_gen_mov_i32(cpu_SR[par[0]], tmp); - tcg_temp_free(tmp); + tcg_gen_mov_i32(tmp, arg[0].in); + tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); + tcg_gen_mov_i32(cpu_SR[par[0]], tmp); + tcg_temp_free(tmp); + } else { + tcg_gen_movi_i32(arg[0].out, 0); + } } static void translate_xsr_mask(DisasContext *dc, const OpcodeArg arg[], const uint32_t par[]) { - 
TCGv_i32 tmp = tcg_temp_new_i32(); + if (sr_name[par[0]]) { + TCGv_i32 tmp = tcg_temp_new_i32(); - tcg_gen_mov_i32(tmp, arg[0].in); - tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); - tcg_gen_andi_i32(cpu_SR[par[0]], tmp, par[2]); - tcg_temp_free(tmp); + tcg_gen_mov_i32(tmp, arg[0].in); + tcg_gen_mov_i32(arg[0].out, cpu_SR[par[0]]); + tcg_gen_andi_i32(cpu_SR[par[0]], tmp, par[2]); + tcg_temp_free(tmp); + } else { + tcg_gen_movi_i32(arg[0].out, 0); + } } static void translate_xsr_ccount(DisasContext *dc, const OpcodeArg arg[], @@ -2819,7 +2835,11 @@ static void translate_xsr_ccount(DisasContext *dc, const OpcodeArg arg[], { \ TCGv_i32 tmp = tcg_temp_new_i32(); \ \ - tcg_gen_mov_i32(tmp, cpu_SR[par[0]]); \ + if (sr_name[par[0]]) { \ + tcg_gen_mov_i32(tmp, cpu_SR[par[0]]); \ + } else { \ + tcg_gen_movi_i32(tmp, 0); \ + } \ translate_wsr_##name(dc, arg, par); \ tcg_gen_mov_i32(arg[0].out, tmp); \ tcg_temp_free(tmp); \ |
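
The new helper_f2xm1 above splits its argument as x = t + y, where t is the nearest multiple of 1/32 (looked up in f2xm1_table[] together with 2^t and 2^t - 1) and |y| <= 1/64, then reconstructs 2^x - 1 as 2^t * (2^y - 1) + (2^t - 1). The standalone double-precision sketch below only illustrates that identity; it is not the extended-precision code from the patch, exp2() stands in for both the table lookup and the f2xm1_coeff_* polynomial, the name f2xm1_sketch is invented here, and it assumes the default round-to-nearest mode for rint(). Compile with -lm.

#include <math.h>
#include <stdio.h>

/* Illustrative only: mirrors the f2xm1 range reduction with doubles. */
static double f2xm1_sketch(double x)
{
    double t = rint(x * 32.0) / 32.0;  /* nearest multiple of 1/32, as in the patch */
    double y = x - t;                  /* |y| <= 1/64, the polynomial argument */
    double exp2t = exp2(t);            /* table field "exp2" */
    double exp2tm1 = exp2t - 1.0;      /* table field "exp2m1" */
    double exp2ym1 = exp2(y) - 1.0;    /* what the f2xm1_coeff_* polynomial approximates */

    /* 2^x - 1 == 2^t * (2^y - 1) + (2^t - 1) */
    return exp2t * exp2ym1 + exp2tm1;
}

int main(void)
{
    double x = -0.3;
    printf("%.17g vs %.17g\n", f2xm1_sketch(x), exp2(x) - 1.0);
    return 0;
}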
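
helper_fpatan reduces the two-operand arctangent to a single ratio x = min(|ST0|, |ST1|) / max(|ST0|, |ST1|) in [0, 1], splits x at the nearest eighth t = n/8 (fpatan_table[] holds atan(n/8) as high and low parts), evaluates atan(z) for the small correction z = (x - t) / (1 + t*x) with an odd polynomial, and then folds in pi/2 or pi according to which operand was larger and the signs. The double-precision sketch below only shows that reduction for finite nonzero operands; atan() stands in for the table plus polynomial, the zero/infinity/NaN cases handled explicitly in the patch are omitted, and fpatan_sketch is an invented name. Compile with -lm.

#include <math.h>
#include <stdio.h>

/* Illustrative only: the fpatan reduction with doubles, finite nonzero inputs. */
static double fpatan_sketch(double st1, double st0)
{
    double a1 = fabs(st1), a0 = fabs(st0);
    double num = fmin(a1, a0), den = fmax(a1, a0);
    double x = num / den;               /* ratio in [0, 1] */
    double t = rint(x * 8.0) / 8.0;     /* nearest multiple of 1/8 (fpatan_table index) */
    double z = (x - t) / (1.0 + t * x); /* small correction term */
    double r = atan(t) + atan(z);       /* atan(x) by the addition formula */

    if (a1 > a0) {
        r = M_PI_2 - r;                 /* we actually computed atan(|ST0| / |ST1|) */
    }
    if (st0 < 0.0) {
        r = M_PI - r;                   /* left half-plane */
    }
    return st1 < 0.0 ? -r : r;          /* result sign follows ST1 */
}

int main(void)
{
    printf("%.17g vs %.17g\n", fpatan_sketch(1.0, -2.0), atan2(1.0, -2.0));
    return 0;
}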
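
helper_fyl2x and helper_fyl2xp1 both go through helper_fyl2x_common, which maps an argument near 1 onto u = arg / (2 + arg) so that 1 + arg = (1 + u) / (1 - u), evaluates log2((1 + u) / (1 - u)) with the odd polynomial fyl2x_coeff_*, and adds the integer part of the exponent back separately. The sketch below restates that reduction with doubles for a positive finite ST0, using frexp() for the exponent split and log2() in place of the polynomial; fyl2x_sketch is an invented name and the special cases in the patch are omitted. Compile with -lm.

#include <math.h>
#include <stdio.h>

/* Illustrative only: ST1 * log2(ST0) via the fyl2x reduction, st0 > 0. */
static double fyl2x_sketch(double st0, double st1)
{
    int k;
    double m = frexp(st0, &k);        /* st0 = m * 2^k with m in [0.5, 1) */

    if (m < M_SQRT1_2) {              /* move m into [sqrt(2)/2, sqrt(2)) */
        m *= 2.0;
        k -= 1;
    }

    double u = (m - 1.0) / (m + 1.0); /* |u| <= 3 - 2*sqrt(2), the polynomial's range */
    double log2m = log2((1.0 + u) / (1.0 - u));  /* what fyl2x_coeff_* approximates */

    return st1 * ((double)k + log2m); /* ST1 * log2(ST0) */
}

int main(void)
{
    printf("%.17g vs %.17g\n", fyl2x_sketch(10.0, 3.0), 3.0 * log2(10.0));
    return 0;
}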
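
The target/i386/kvm.c change only issues KVM_SET_TSC_KHZ when TSC scaling is available or when the requested frequency is within 250 ppm of the host frequency, the drift that NTP inside the guest can correct. A minimal standalone restatement of that bound, with invented names and sample numbers, is:

#include <stdbool.h>
#include <stdio.h>

/* Same arithmetic as the new freq_within_bounds(): 250 ppm of the host rate. */
static bool freq_within_ntp_bounds(int host_khz, int target_khz)
{
    int max_khz = host_khz + (host_khz * 250 / 1000000);
    int min_khz = host_khz - (host_khz * 250 / 1000000);

    return target_khz >= min_khz && target_khz <= max_khz;
}

int main(void)
{
    /* A 2.5 GHz host TSC tolerates roughly +/- 625 kHz without scaling. */
    printf("%d\n", freq_within_ntp_bounds(2500000, 2500500)); /* 1: accepted */
    printf("%d\n", freq_within_ntp_bounds(2500000, 2600000)); /* 0: needs TSC scaling */
    return 0;
}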