diff options
Diffstat (limited to 'target/s390x/tcg/vec_int_helper.c')
| -rw-r--r-- | target/s390x/tcg/vec_int_helper.c | 186 |
1 files changed, 84 insertions, 102 deletions
diff --git a/target/s390x/tcg/vec_int_helper.c b/target/s390x/tcg/vec_int_helper.c index 53ab5c5eb3..b18d8a6d16 100644 --- a/target/s390x/tcg/vec_int_helper.c +++ b/target/s390x/tcg/vec_int_helper.c @@ -14,19 +14,13 @@ #include "vec.h" #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" +#include "crypto/clmul.h" static bool s390_vec_is_zero(const S390Vector *v) { return !v->doubleword[0] && !v->doubleword[1]; } -static void s390_vec_xor(S390Vector *res, const S390Vector *a, - const S390Vector *b) -{ - res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0]; - res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1]; -} - static void s390_vec_and(S390Vector *res, const S390Vector *a, const S390Vector *b) { @@ -164,117 +158,105 @@ DEF_VCTZ(8) DEF_VCTZ(16) /* like binary multiplication, but XOR instead of addition */ -#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \ -static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \ - uint##TBITS##_t b) \ -{ \ - uint##TBITS##_t res = 0; \ - \ - while (b) { \ - if (b & 0x1) { \ - res = res ^ a; \ - } \ - a = a << 1; \ - b = b >> 1; \ - } \ - return res; \ + +/* + * There is no carry across the two doublewords, so their order does + * not matter. Nor is there partial overlap between registers. + */ +static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a; } -DEF_GALOIS_MULTIPLY(8, 16) -DEF_GALOIS_MULTIPLY(16, 32) -DEF_GALOIS_MULTIPLY(32, 64) -static S390Vector galois_multiply64(uint64_t a, uint64_t b) +void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d) { - S390Vector res = {}; - S390Vector va = { - .doubleword[1] = a, - }; - S390Vector vb = { - .doubleword[1] = b, - }; - - while (!s390_vec_is_zero(&vb)) { - if (vb.doubleword[1] & 0x1) { - s390_vec_xor(&res, &res, &va); - } - s390_vec_shl(&va, &va, 1); - s390_vec_shr(&vb, &vb, 1); - } - return res; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma8(q2[0], q3[0], 0); + q1[1] = do_gfma8(q2[1], q3[1], 0); } -#define DEF_VGFM(BITS, TBITS) \ -void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \ - uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ +void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t desc) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma8(q2[0], q3[0], q4[0]); + q1[1] = do_gfma8(q2[1], q3[1], q4[1]); } -DEF_VGFM(8, 16) -DEF_VGFM(16, 32) -DEF_VGFM(32, 64) -void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, - uint32_t desc) +static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a; +} + +void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d) { - S390Vector tmp1, tmp2; - uint64_t a, b; + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(v1, &tmp1, &tmp2); + q1[0] = do_gfma16(q2[0], q3[0], 0); + q1[1] = do_gfma16(q2[1], q3[1], 0); } -#define DEF_VGFMA(BITS, TBITS) \ -void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \ - const void *v4, uint32_t desc) \ -{ \ - int i; \ - \ - for (i = 0; i < (128 / TBITS); i++) { \ - uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \ - uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \ - uint##TBITS##_t d = galois_multiply##BITS(a, b); \ - \ - a = s390_vec_read_element##BITS(v2, i * 2 + 1); \ - b = s390_vec_read_element##BITS(v3, i * 2 + 1); \ - d = d ^ galois_multiply32(a, b); \ - d = d ^ s390_vec_read_element##TBITS(v4, i); \ - s390_vec_write_element##TBITS(v1, i, d); \ - } \ +void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma16(q2[0], q3[0], q4[0]); + q1[1] = do_gfma16(q2[1], q3[1], q4[1]); +} + +static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a) +{ + return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a; +} + +void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + + q1[0] = do_gfma32(q2[0], q3[0], 0); + q1[1] = do_gfma32(q2[1], q3[1], 0); +} + +void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3, + const void *v4, uint32_t d) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + + q1[0] = do_gfma32(q2[0], q3[0], q4[0]); + q1[1] = do_gfma32(q2[1], q3[1], q4[1]); +} + +void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3, + uint32_t desc) +{ + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3; + Int128 r; + + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = int128_gethi(r); + q1[1] = int128_getlo(r); } -DEF_VGFMA(8, 16) -DEF_VGFMA(16, 32) -DEF_VGFMA(32, 64) void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3, const void *v4, uint32_t desc) { - S390Vector tmp1, tmp2; - uint64_t a, b; - - a = s390_vec_read_element64(v2, 0); - b = s390_vec_read_element64(v3, 0); - tmp1 = galois_multiply64(a, b); - a = s390_vec_read_element64(v2, 1); - b = s390_vec_read_element64(v3, 1); - tmp2 = galois_multiply64(a, b); - s390_vec_xor(&tmp1, &tmp1, &tmp2); - s390_vec_xor(v1, &tmp1, v4); + uint64_t *q1 = v1; + const uint64_t *q2 = v2, *q3 = v3, *q4 = v4; + Int128 r; + + r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1])); + q1[0] = q4[0] ^ int128_gethi(r); + q1[1] = q4[1] ^ int128_getlo(r); } #define DEF_VMAL(BITS) \ |