diff options
| author | ye-yeshun <89620590+ye-yeshun@users.noreply.github.com> | 2025-02-12 17:32:57 +0800 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-02-12 10:32:57 +0100 |
| commit | b8cc8594f6d9cbe4a47b8a98ba9878da803a7243 (patch) | |
| tree | c304e50b1079e89f17f7e1c4fc7c5252d887301c | |
| parent | 6197b70a9a8f4a0d0857d94e67e45bd25d1f3bb0 (diff) | |
| download | box64-b8cc8594f6d9cbe4a47b8a98ba9878da803a7243.tar.gz box64-b8cc8594f6d9cbe4a47b8a98ba9878da803a7243.zip | |
fix VPMASKMOV and VMASKMOVP (#2342)
* fix: 0x2C: 'VMASKMOVPS Gx, Vx, Ex' Log: VMASKMOVPS读取内存时, 如果某些mask位是0则不进行读写避免访问越界 Signed-off-by: YeshunYe <yeyeshun@uniontech.com> Change-Id: I197fc356edcac202b5a329c50c334d0166532e93 * fix: 0x2D: 'VMASKMOVPD Gx, Vx, Ex' Log: VMASKMOVPD读取内存时, 如果某些mask位是0则不进行读写避免访问越界 Signed-off-by: YeshunYe <yeyeshun@uniontech.com> Change-Id: Ie11d93971aa92b141540a37bfdae0b3b060e3aea * fix: 0x2E: 'VMASKMOVPS Ex, Gx, Vx' Log: VMASKMOVPS写入内存时, 如果某些mask位是0则不进行读写避免访问越界 Signed-off-by: YeshunYe <yeyeshun@uniontech.com> Change-Id: Ide5cb36dc03fc56480fdd45e7d96daed8557d849 * fix: 0x2F: 'VMASKMOVPD Ex, Gx, Vx' Log: VMASKMOVPD写入内存时, 如果某些mask位是0则不进行读写避免访问越界 Signed-off-by: YeshunYe <yeyeshun@uniontech.com> Change-Id: I037de8568e9d2d29597fdf08f991d54e3cb2f6d9 * fix: 0x8E: 'VPMASKMOVD/Q Ex, Vx, Gx' Log: VPMASKMOVD/Q写入内存时, 如果某些mask位是0则不进行读写避免访问越界 Signed-off-by: YeshunYe <yeyeshun@uniontech.com> Change-Id: I0dc98a29ed933d953e137e777bc296149d94b10b * tests: add test for VPMASKMOV and VMASKMOVP Log: Signed-off-by: YeshunYe <yeyeshun@uniontech.com> --------- Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
| -rw-r--r-- | src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c | 272 | ||||
| -rw-r--r-- | tests/test30.c | 255 |
2 files changed, 459 insertions, 68 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c index dcc455e1..633f35a9 100644 --- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c +++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c @@ -629,29 +629,112 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip case 0x2C: INST_NAME("VMASKMOVPS Gx, Vx, Ex"); nextop = F8; - GETGX_empty_VXEX(v0, v2, v1, 0); + GETVX(v2, 0); + GETGX_empty(v0); q0 = fpu_get_scratch(dyn, ninst); - // create mask VSSHRQ_32(q0, v2, 31); - VANDQ(v0, v1, q0); + VEORQ(v0, v0, v0); + if (MODREG) { + v1 = sse_get_reg(dyn, ninst, x3, (nextop & 7) + (rex.b << 3), 0); + VANDQ(v0, v1, q0); + } else { + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); + EORx_REG(x4, x4, x4); + VMOVSto(x4, q0, 0); + CBZx(x4, 4+1*4); + VLD1_32(v0, 0, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 1); + CBZx(x4, 4+1*4); + VLD1_32(v0, 1, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 2); + CBZx(x4, 4+1*4); + VLD1_32(v0, 2, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 3); + CBZx(x4, 4+1*4); + VLD1_32(v0, 3, ed); + if(vex.l) + ADDx_U12(ed, ed, 4); + } if(vex.l) { - GETGY_empty_VYEY(v0, v2, v1); + v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); + v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); VSSHRQ_32(q0, v2, 31); - VANDQ(v0, v1, q0); + VEORQ(v0, v0, v0); + if(MODREG) + { + v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1); + VANDQ(v0, v1, q0); + } + else + { + VMOVSto(x4, q0, 0); + CBZx(x4, 4+1*4); + VLD1_32(v0, 0, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 1); + CBZx(x4, 4+1*4); + VLD1_32(v0, 1, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 2); + CBZx(x4, 4+1*4); + VLD1_32(v0, 2, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 3); + CBZx(x4, 4+1*4); + VLD1_32(v0, 3, ed); + } } else YMM0(gd); break; case 0x2D: INST_NAME("VMASKMOVPD Gx, Vx, Ex"); nextop = F8; - GETGX_empty_VXEX(v0, v2, v1, 0); + GETVX(v2, 0); + GETGX_empty(v0); q0 = fpu_get_scratch(dyn, ninst); // create mask VSSHRQ_64(q0, v2, 63); - VANDQ(v0, v1, q0); + VEORQ(v0, v0, v0); + if (MODREG) { + v1 = sse_get_reg(dyn, ninst, x3, (nextop & 7) + (rex.b << 3), 0); + VANDQ(v0, v1, q0); + } else { + SMREAD(); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); + EORx_REG(x4, x4, x4); + VMOVQDto(x4, q0, 0); + CBZx(x4, 4+1*4); + VLD1_64(v0, 0, ed); + ADDx_U12(ed, ed, 8); + VMOVQDto(x4, q0, 1); + CBZx(x4, 4+1*4); + VLD1_64(v0, 1, ed); + if(vex.l) + ADDx_U12(ed, ed, 8); + } if(vex.l) { - GETGY_empty_VYEY(v0, v2, v1); + v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); + v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); VSSHRQ_64(q0, v2, 63); - VANDQ(v0, v1, q0); + VEORQ(v0, v0, v0); + if(MODREG) + { + v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1); + VANDQ(v0, v1, q0); + } + else + { + VMOVQDto(x4, q0, 0); + CBZx(x4, 4+1*4); + VLD1_64(v0, 0, ed); + ADDx_U12(ed, ed, 8); + VMOVQDto(x4, q0, 1); + CBZx(x4, 4+1*4); + VLD1_64(v0, 1, ed); + } } else YMM0(gd); break; case 0x2E: @@ -663,37 +746,55 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip VSSHRQ_32(q0, v2, 31); if(MODREG) { v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1); + VBITQ(v1, v0, q0); } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0); - unscaled = 0; - v1 = fpu_get_scratch(dyn, ninst); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); // check if mask as anything, else skip the whole read/write to avoid a SEGFAULT. // TODO: let a segfault trigger and check if the mask is null instead and ignore the segfault / actually triger: needs to implement SSE reg tracking first! - SQXTN_32(v1, q0); - VMOVQDto(x4, v1, 0); - CBZx(x4, 4+3*4); - VLDR128_U12(v1, ed, fixedaddress); - } - // create mask - VBITQ(v1, v0, q0); - if(!MODREG) { - VSTR128_U12(v1, ed, fixedaddress); + EORx_REG(x4, x4, x4); + VMOVSto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_32(v0, 0, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_32(v0, 1, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 2); + CBZx(x4, 4+1*4); + VST1_32(v0, 2, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 3); + CBZx(x4, 4+1*4); + VST1_32(v0, 3, ed); + if(vex.l) + ADDx_U12(ed, ed, 4); } + if(vex.l && !is_avx_zero(dyn, ninst, vex.v)) { v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); VSSHRQ_32(q0, v2, 31); - if(MODREG) + if(MODREG) { v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1); - else { - SQXTN_32(v1, q0); - VMOVQDto(x4, v1, 0); - CBZx(x4, 4+3*4); - VLDR128_U12(v1, ed, fixedaddress+16); + VBITQ(v1, v0, q0); } - VBITQ(v1, v0, q0); - if(!MODREG) { - VSTR128_U12(v1, ed, fixedaddress+16); + else { + VMOVSto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_32(v0, 0, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_32(v0, 1, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 2); + CBZx(x4, 4+1*4); + VST1_32(v0, 2, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 3); + CBZx(x4, 4+1*4); + VST1_32(v0, 3, ed); } } break; @@ -701,43 +802,46 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip INST_NAME("VMASKMOVPD Ex, Gx, Vx"); nextop = F8; q0 = fpu_get_scratch(dyn, ninst); - q1 = fpu_get_scratch(dyn, ninst); GETVX(v2, 0); GETGX(v0, 0); VSSHRQ_64(q0, v2, 63); if(MODREG) { v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1); + VBITQ(v1, v0, q0); } else { - addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); unscaled = 0; v1 = fpu_get_scratch(dyn, ninst); // check if mask as anything, else skip the whole read/write to avoid a SEGFAULT. // TODO: let a segfault trigger and check if the mask is null instead and ignore the segfault / actually triger: needs to implement SSE reg tracking first! - SQXTN_32(q1, q0); - VMOVQDto(x4, q1, 0); - CBZx(x4, 4+3*4); - VLDR128_U12(v1, ed, fixedaddress); - } - // create mask - VBITQ(v1, v0, q0); - if(!MODREG) { - VSTR128_U12(v1, ed, fixedaddress); + EORx_REG(x4, x4, x4); + VMOVQDto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_64(v0, 0, ed); + ADDx_U12(ed, ed, 8); + VMOVQDto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_64(v0, 1, ed); + if(vex.l) + ADDx_U12(ed, ed, 8); } + if(vex.l && !is_avx_zero(dyn, ninst, vex.v)) { v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1); VSSHRQ_64(q0, v2, 63); - if(MODREG) + if(MODREG) { v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1); - else { - SQXTN_32(q1, q0); - VMOVQDto(x4, q1, 0); - CBZx(x4, 4+3*4); - VLDR128_U12(v1, ed, fixedaddress+16); + VBITQ(v1, v0, q0); } - VBITQ(v1, v0, q0); - if(!MODREG) { - VSTR128_U12(v1, ed, fixedaddress+16); + else { + VMOVQDto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_64(v0, 0, ed); + ADDx_U12(ed, ed, 8); + VMOVQDto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_64(v0, 1, ed); } } break; @@ -1174,31 +1278,65 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip nextop = F8; q0 = fpu_get_scratch(dyn, ninst); for(int l=0; l<1+vex.l; ++l) { - if(!l) { - GETGX(v0, 0); GETVX(v2, 0); - if(MODREG) { + if(MODREG) { + if(!l) { + GETGX(v0, 0); GETVX(v2, 0); s0 = (nextop&7)+(rex.b<<3); v1 = sse_get_reg_empty(dyn, ninst, x1, s0); } else { + GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1); + v1 = ymm_get_reg_empty(dyn, ninst, x1, s0, gd, vex.v, -1); + } + if(rex.w) + VSSHRQ_64(q0, v2, 63); + else + VSSHRQ_32(q0, v2, 31); + VBITQ(v1, v0, q0); + } else { + if(!l) { + GETGX(v0, 0); GETVX(v2, 0); s0 = -1; v1 = fpu_get_scratch(dyn, ninst); - addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0); - VLDR128_U12(v1, ed, fixedaddress); + addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0); + EORx_REG(x4, x4, x4); + } else { + GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1); + } + if(rex.w) + { + VSSHRQ_64(q0, v2, 63); + VMOVQDto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_64(v0, 0, ed); + ADDx_U12(ed, ed, 8); + VMOVQDto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_64(v0, 1, ed); + if(!l && vex.l) + ADDx_U12(ed, ed, 8); } - } else { - GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1); - if(MODREG) - v1 = ymm_get_reg_empty(dyn, ninst, x1, s0, gd, vex.v, -1); else - VLDR128_U12(v1, ed, fixedaddress+16); + { + VSSHRQ_32(q0, v2, 31); + VMOVSto(x4, q0, 0); + CBZx(x4, 4+1*4); + VST1_32(v0, 0, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 1); + CBZx(x4, 4+1*4); + VST1_32(v0, 1, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 2); + CBZx(x4, 4+1*4); + VST1_32(v0, 2, ed); + ADDx_U12(ed, ed, 4); + VMOVSto(x4, q0, 3); + CBZx(x4, 4+1*4); + VST1_32(v0, 3, ed); + if(!l && vex.l) + ADDx_U12(ed, ed, 4); + } } - if(rex.w) - VSSHRQ_64(q0, v2, 63); - else - VSSHRQ_32(q0, v2, 31); - VBITQ(v1, v0, q0); - if(!MODREG) - VSTR128_U12(v1, ed, fixedaddress+16*l); } // no raz of upper ymm break; diff --git a/tests/test30.c b/tests/test30.c index def89c2d..ecc0da57 100644 --- a/tests/test30.c +++ b/tests/test30.c @@ -1,4 +1,4 @@ -// build with gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 -mavx test30.c -o test30 +// build with gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 -mavx test30.c -o test30 -march=native #include <inttypes.h> #include <string.h> #include <stdio.h> @@ -8,6 +8,8 @@ #include <math.h> #include <pmmintrin.h> #include <immintrin.h> +#include <sys/mman.h> +#include <unistd.h> typedef unsigned char u8x16 __attribute__ ((vector_size (16))); typedef unsigned short u16x8 __attribute__ ((vector_size (16))); @@ -15,6 +17,9 @@ typedef unsigned int u32x4 __attribute__ ((vector_size (16))); typedef unsigned long int u64x2 __attribute__ ((vector_size (16))); typedef float f32x4 __attribute__ ((vector_size (16))); typedef double d64x2 __attribute__ ((vector_size (16))); +int testVPMASKMOV(); +int testVMASKMOVP(); +static int ACCESS_TEST = 1; typedef union { __m128i mm; @@ -736,7 +741,255 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r); MULITGO2Cps(dp, dpps, 0x3f) MULITGO2Cps(dp, dpps, 0xf3) MULITGO2Cps(dp, dpps, 0x53) +// open this test must update test30 and ref30.txt +// ACCESS_TEST = 2; +// testVPMASKMOV(); +// testVMASKMOVP(); +// ACCESS_TEST = 1; +// testVPMASKMOV(); +// testVMASKMOVP(); return 0; } +__m256i m256_setr_epi64x(long long a, long long b, long long c, long long d) +{ + union { + long long q[4]; + int r[8]; + } u; + u.q[0] = a; u.q[1] = b; u.q[2] = c; u.q[3] = d; + return _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]); +} + +__m128i m128_setr_epi64x(long long a, long long b) +{ + union { + long long q[2]; + int r[4]; + } u; + u.q[0] = a; u.q[1] = b; + return _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); +} + +int testVPMASKMOV() { + long pageSize = sysconf(_SC_PAGESIZE); + + void *baseAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (baseAddress == MAP_FAILED) { + printf("mmap failed\n"); + return 1; + } + void *resultAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (resultAddress == MAP_FAILED) { + printf("mmap failed\n"); + return 1; + } + + int *intData = (int *)((char *)baseAddress + pageSize - 4 * ACCESS_TEST * sizeof(int)); // 32 bytes for 8 integers + int *intResult = (int *)((char *)resultAddress + pageSize - 4 * ACCESS_TEST * sizeof(int)); // 32 bytes for 8 integers + + for (int i = 0; i < 4 * ACCESS_TEST; i++) { + intData[i] = i + 1; + } + + __m256i mask256_int = _mm256_setr_epi32(-1, -1, -1, -1, 1 - ACCESS_TEST, 0, 1 - ACCESS_TEST, 0); // 32-bit mask + __m128i mask128_int = _mm_setr_epi32(-1, -1, 1 - ACCESS_TEST, 0); // 32-bit mask + __m256i mask256_long = m256_setr_epi64x(-1, -1, 1 - ACCESS_TEST, 0); // 64-bit mask + __m128i mask128_long = m128_setr_epi64x(-1, 0); // 64-bit mask + // ************************************************************** _mm256_maskload_epi32 + __m256i loaded_int256 = _mm256_maskload_epi32(intData, mask256_int); + printf("VPMASKMOV "); + for (int i = 0; i < 8; i++) { + printf("%d ", ((int*)&loaded_int256)[i]); + } + printf("\n"); + + memset(resultAddress, 0, pageSize); + _mm256_maskstore_epi32(intResult, mask256_int, loaded_int256); + printf("VPMASKMOV "); + for (int i = 0; i < 4 * ACCESS_TEST; i++) { + printf("%d ", intResult[i]); + } + printf("\n"); + + // ************************************************************** _mm_maskload_epi32 + __m128i loaded_int128 = _mm_maskload_epi32(intData, mask128_int); + printf("VPMASKMOV "); + for (int i = 0; i < 4; i++) { + printf("%d ", ((int*)&loaded_int128)[i]); + } + printf("\n"); + + memset(resultAddress, 0, pageSize); + _mm_maskstore_epi32(intResult, mask128_int, loaded_int128); + printf("VPMASKMOV "); + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + printf("%d ", intResult[i]); + } + printf("\n"); + + long long *longData = (long long *)((char *)baseAddress + pageSize - 2 * ACCESS_TEST * sizeof(long long)); // 32 bytes for 4 long integers + long long *longResult = (long long *)((char *)resultAddress + pageSize - 2 * ACCESS_TEST * sizeof(long long)); // 32 bytes for 8 integers + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + longData[i] = i + 1; + } + + // ************************************************************** _mm256_maskload_epi64 + __m256i loaded_long256 = _mm256_maskload_epi64(longData, mask256_long); + printf("VPMASKMOV "); + for (int i = 0; i < 4; i++) { + printf("%lld ", ((long long*)&loaded_long256)[i]); + } + printf("\n"); + + memset(resultAddress, 0, pageSize); + _mm256_maskstore_epi64(longResult, mask256_long, loaded_long256); + printf("VPMASKMOV "); + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + printf("%lld ", longResult[i]); + } + printf("\n"); + + // ************************************************************** _mm_maskload_epi64 + __m128i loaded_long128 = _mm_maskload_epi64(longData, mask128_long); + printf("VPMASKMOV "); + for (int i = 0; i < 2; i++) { + printf("%lld ", ((long long*)&loaded_long128)[i]); + } + printf("\n"); + + // _mm_maskstore_epi64 + memset(resultAddress, 0, pageSize); + _mm_maskstore_epi64(longResult, mask128_long, loaded_long128); + printf("VPMASKMOV "); + for (int i = 0; i < 1 * ACCESS_TEST; i++) { + printf("%lld ", longResult[i]); + } + printf("\n"); + + munmap(baseAddress, pageSize); + munmap(resultAddress, pageSize); + + return 0; +} + +int testVMASKMOVP() { + long pageSize = sysconf(_SC_PAGESIZE); + + void *baseAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (baseAddress == MAP_FAILED) { + perror("mmap failed"); + return 1; + } + void *destAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (destAddress == MAP_FAILED) { + perror("mmap failed"); + return 1; + } + + float *floatData = (float *)((char *)baseAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 4 floats + float *floatDest = (float *)((char *)destAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 4 floats + + int mask_data[8] = { -1, 0, -1, -1, 0, 1 - ACCESS_TEST, 0, 0 }; // -1 的二进制表示是 0xFFFFFFFF(最高位为 1) + __m256i mask256ps = _mm256_loadu_si256((__m256i const *)mask_data); + __m256i mask256pd = _mm256_setr_epi64x(-1, -1, 0, 1 - ACCESS_TEST); + __m128i mask128 = _mm_setr_epi32(-1, -1, 0, 1 - ACCESS_TEST); + + //================================================================================= + // _mm256_maskload_ps + for (int i = 0; i < 4 * ACCESS_TEST; i++) { + floatData[i] = (float)(i + 1); + } + + __m256 floatVec = _mm256_maskload_ps(floatData, mask256ps); + printf("VMASKMOVP "); + for (int i = 0; i < 8; i++) { + printf("%f ", ((float*)&floatVec)[i]); + } + printf("\n"); + + // _mm256_maskstore_ps + memset(destAddress, 0, pageSize); + _mm256_maskstore_ps(floatDest, mask256ps, floatVec); + printf("VMASKMOVP "); + for (int i = 0; i < 4 * ACCESS_TEST; i++) { + printf("%f ", floatDest[i]); + } + printf("\n"); + + //================================================================================= + for (int i = 0; i < 4 * ACCESS_TEST; i++) { + floatData[i] = (float)(i + 10); + } + + // _mm_maskload_ps + __m128 floatVec128 = _mm_maskload_ps(floatData, mask128); + printf("VMASKMOVP "); + for (int i = 0; i < 4; i++) { + printf("%f ", ((float*)&floatVec128)[i]); + } + printf("\n"); + + // _mm_maskstore_ps + memset(destAddress, 0, pageSize); + _mm_maskstore_ps(floatDest, mask128, floatVec128); + printf("VMASKMOVP "); + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + printf("%f ", floatDest[i]); + } + printf("\n"); + + //================================================================================= + double *doubleData = (double *)((char *)baseAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 2 doubles + double *doubleDest = (double *)((char *)destAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 2 doubles + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + doubleData[i] = (double)(i + 20); + } + + // _mm256_maskload_pd + __m256d doubleVec = _mm256_maskload_pd(doubleData, mask256pd); + printf("VMASKMOVP "); + for (int i = 0; i < 4; i++) { + printf("%lf ", ((double *)&doubleVec)[i]); + } + printf("\n"); + + // _mm256_maskstore_pd + memset(destAddress, 0, pageSize); + _mm256_maskstore_pd(doubleDest, mask256pd, doubleVec); + printf("VMASKMOVP "); + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + printf("%f ", doubleDest[i]); + } + printf("\n"); + + //================================================================================= + for (int i = 0; i < 2 * ACCESS_TEST; i++) { + doubleData[i] = (double)(i + 30); + } + + // _mm_maskload_pd + __m128d doubleVec128 = _mm_maskload_pd(doubleData, mask128); + printf("VMASKMOVP "); + for (int i = 0; i < 2; i++) { + printf("%lf ", ((double *)&doubleVec128)[i]); + } + printf("\n"); + + // _mm_maskstore_pd + memset(destAddress, 0, pageSize); + _mm_maskstore_pd(doubleDest, mask128, doubleVec128); + printf("VMASKMOVP "); + for (int i = 0; i < 1 * ACCESS_TEST; i++) { + printf("%f ", doubleDest[i]); + } + printf("\n"); + + //================================================================================= + + munmap(baseAddress, pageSize); + munmap(destAddress, pageSize); + + return 0; +} \ No newline at end of file |