diff options
Diffstat (limited to 'target/arm')
| -rw-r--r-- | target/arm/tcg/mve_helper.c | 16 | ||||
| -rw-r--r-- | target/arm/tcg/translate.c | 58 | ||||
| -rw-r--r-- | target/arm/tcg/vec_helper.c | 102 | ||||
| -rw-r--r-- | target/arm/tcg/vec_internal.h | 11 |
4 files changed, 26 insertions, 161 deletions
diff --git a/target/arm/tcg/mve_helper.c b/target/arm/tcg/mve_helper.c index 403b345ea3..c666a96ba1 100644 --- a/target/arm/tcg/mve_helper.c +++ b/target/arm/tcg/mve_helper.c @@ -26,6 +26,7 @@ #include "exec/exec-all.h" #include "tcg/tcg.h" #include "fpu/softfloat.h" +#include "crypto/clmul.h" static uint16_t mve_eci_mask(CPUARMState *env) { @@ -984,17 +985,10 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL) * Polynomial multiply. We can always do this generating 64 bits * of the result at a time, so we don't need to use DO_2OP_L. */ -#define VMULLPH_MASK 0x00ff00ff00ff00ffULL -#define VMULLPW_MASK 0x0000ffff0000ffffULL -#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK) -#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8) -#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK) -#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16) - -DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH) -DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH) -DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW) -DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW) +DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even) +DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd) +DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even) +DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd) /* * Because the computation type is at least twice as large as required, diff --git a/target/arm/tcg/translate.c b/target/arm/tcg/translate.c index 976b704200..d83a0e772c 100644 --- a/target/arm/tcg/translate.c +++ b/target/arm/tcg/translate.c @@ -2943,54 +2943,16 @@ void gen_gvec_sqrdmlsh_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, gen_gvec_fn3_qc(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, fns[vece - 1]); } -#define GEN_CMP0(NAME, COND) \ - static void gen_##NAME##0_i32(TCGv_i32 d, TCGv_i32 a) \ - { \ - tcg_gen_negsetcond_i32(COND, d, a, tcg_constant_i32(0)); \ - } \ - static void gen_##NAME##0_i64(TCGv_i64 d, TCGv_i64 a) \ - { \ - tcg_gen_negsetcond_i64(COND, d, a, tcg_constant_i64(0)); \ - } \ - static void gen_##NAME##0_vec(unsigned vece, TCGv_vec d, TCGv_vec a) \ - { \ - TCGv_vec zero = tcg_constant_vec_matching(d, vece, 0); \ - tcg_gen_cmp_vec(COND, vece, d, a, zero); \ - } \ - void gen_gvec_##NAME##0(unsigned vece, uint32_t d, uint32_t m, \ - uint32_t opr_sz, uint32_t max_sz) \ - { \ - const GVecGen2 op[4] = { \ - { .fno = gen_helper_gvec_##NAME##0_b, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_8 }, \ - { .fno = gen_helper_gvec_##NAME##0_h, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_16 }, \ - { .fni4 = gen_##NAME##0_i32, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .vece = MO_32 }, \ - { .fni8 = gen_##NAME##0_i64, \ - .fniv = gen_##NAME##0_vec, \ - .opt_opc = vecop_list_cmp, \ - .prefer_i64 = TCG_TARGET_REG_BITS == 64, \ - .vece = MO_64 }, \ - }; \ - tcg_gen_gvec_2(d, m, opr_sz, max_sz, &op[vece]); \ - } - -static const TCGOpcode vecop_list_cmp[] = { - INDEX_op_cmp_vec, 0 -}; - -GEN_CMP0(ceq, TCG_COND_EQ) -GEN_CMP0(cle, TCG_COND_LE) -GEN_CMP0(cge, TCG_COND_GE) -GEN_CMP0(clt, TCG_COND_LT) -GEN_CMP0(cgt, TCG_COND_GT) +#define GEN_CMP0(NAME, COND) \ + void NAME(unsigned vece, uint32_t d, uint32_t m, \ + uint32_t opr_sz, uint32_t max_sz) \ + { tcg_gen_gvec_cmpi(COND, vece, d, m, 0, opr_sz, max_sz); } + +GEN_CMP0(gen_gvec_ceq0, TCG_COND_EQ) +GEN_CMP0(gen_gvec_cle0, TCG_COND_LE) +GEN_CMP0(gen_gvec_cge0, TCG_COND_GE) +GEN_CMP0(gen_gvec_clt0, TCG_COND_LT) +GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT) #undef GEN_CMP0 diff --git a/target/arm/tcg/vec_helper.c b/target/arm/tcg/vec_helper.c index 6712a2c790..1f93510b85 100644 --- a/target/arm/tcg/vec_helper.c +++ b/target/arm/tcg/vec_helper.c @@ -23,6 +23,7 @@ #include "tcg/tcg-gvec-desc.h" #include "fpu/softfloat.h" #include "qemu/int128.h" +#include "crypto/clmul.h" #include "vec_internal.h" /* @@ -1986,21 +1987,11 @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = n[i]; - uint64_t mm = m[i]; - uint64_t rr = 0; - - for (j = 0; j < 8; ++j) { - uint64_t mask = (nn & 0x0101010101010101ull) * 0xff; - rr ^= mm & mask; - mm = (mm << 1) & 0xfefefefefefefefeull; - nn >>= 1; - } - d[i] = rr; + d[i] = clmul_8x8_low(n[i], m[i]); } clear_tail(d, opr_sz, simd_maxsz(desc)); } @@ -2012,84 +2003,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) */ void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) { - intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t i, opr_sz = simd_oprsz(desc); intptr_t hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; i += 2) { - uint64_t nn = n[i + hi]; - uint64_t mm = m[i + hi]; - uint64_t rhi = 0; - uint64_t rlo = 0; - - /* Bit 0 can only influence the low 64-bit result. */ - if (nn & 1) { - rlo = mm; - } - - for (j = 1; j < 64; ++j) { - uint64_t mask = -((nn >> j) & 1); - rlo ^= (mm << j) & mask; - rhi ^= (mm >> (64 - j)) & mask; - } - d[i] = rlo; - d[i + 1] = rhi; + Int128 r = clmul_64(n[i + hi], m[i + hi]); + d[i] = int128_getlo(r); + d[i + 1] = int128_gethi(r); } clear_tail(d, opr_sz, simd_maxsz(desc)); } -/* - * 8x8->16 polynomial multiply. - * - * The byte inputs are expanded to (or extracted from) half-words. - * Note that neon and sve2 get the inputs from different positions. - * This allows 4 bytes to be processed in parallel with uint64_t. - */ - -static uint64_t expand_byte_to_half(uint64_t x) -{ - return (x & 0x000000ff) - | ((x & 0x0000ff00) << 8) - | ((x & 0x00ff0000) << 16) - | ((x & 0xff000000) << 24); -} - -uint64_t pmull_w(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 16; ++i) { - uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - -uint64_t pmull_h(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - for (i = 0; i < 8; ++i) { - uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff; - result ^= op2 & mask; - op1 >>= 1; - op2 <<= 1; - } - return result; -} - void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) { int hi = simd_data(desc); uint64_t *d = vd, *n = vn, *m = vm; uint64_t nn = n[hi], mm = m[hi]; - d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[0] = clmul_8x4_packed(nn, mm); nn >>= 32; mm >>= 32; - d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm)); + d[1] = clmul_8x4_packed(nn, mm); clear_tail(d, 16, simd_maxsz(desc)); } @@ -2102,23 +2037,8 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd, *n = vn, *m = vm; for (i = 0; i < opr_sz / 8; ++i) { - uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull; - uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull; - - d[i] = pmull_h(nn, mm); - } -} - -static uint64_t pmull_d(uint64_t op1, uint64_t op2) -{ - uint64_t result = 0; - int i; - - for (i = 0; i < 32; ++i) { - uint64_t mask = -((op1 >> i) & 1); - result ^= (op2 << i) & mask; + d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift); } - return result; } void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) @@ -2129,7 +2049,7 @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc) uint64_t *d = vd; for (i = 0; i < opr_sz / 8; ++i) { - d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]); + d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]); } } #endif diff --git a/target/arm/tcg/vec_internal.h b/target/arm/tcg/vec_internal.h index 1f4ed80ff7..3ca1b94ccf 100644 --- a/target/arm/tcg/vec_internal.h +++ b/target/arm/tcg/vec_internal.h @@ -219,17 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *); int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *); int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool); -/* - * 8 x 8 -> 16 vector polynomial multiply where the inputs are - * in the low 8 bits of each 16-bit element -*/ -uint64_t pmull_h(uint64_t op1, uint64_t op2); -/* - * 16 x 16 -> 32 vector polynomial multiply where the inputs are - * in the low 16 bits of each 32-bit element - */ -uint64_t pmull_w(uint64_t op1, uint64_t op2); - /** * bfdotadd: * @sum: addend |