about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorye-yeshun <89620590+ye-yeshun@users.noreply.github.com>2025-02-12 17:32:57 +0800
committerGitHub <noreply@github.com>2025-02-12 10:32:57 +0100
commitb8cc8594f6d9cbe4a47b8a98ba9878da803a7243 (patch)
treec304e50b1079e89f17f7e1c4fc7c5252d887301c
parent6197b70a9a8f4a0d0857d94e67e45bd25d1f3bb0 (diff)
downloadbox64-b8cc8594f6d9cbe4a47b8a98ba9878da803a7243.tar.gz
box64-b8cc8594f6d9cbe4a47b8a98ba9878da803a7243.zip
fix VPMASKMOV and VMASKMOVP (#2342)
* fix: 0x2C: 'VMASKMOVPS Gx, Vx, Ex'

Log: VMASKMOVPS读取内存时, 如果某些mask位是0则不进行读写避免访问越界

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
Change-Id: I197fc356edcac202b5a329c50c334d0166532e93

* fix: 0x2D: 'VMASKMOVPD Gx, Vx, Ex'

Log: VMASKMOVPD读取内存时, 如果某些mask位是0则不进行读写避免访问越界

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
Change-Id: Ie11d93971aa92b141540a37bfdae0b3b060e3aea

* fix: 0x2E: 'VMASKMOVPS Ex, Gx, Vx'

Log: VMASKMOVPS写入内存时, 如果某些mask位是0则不进行读写避免访问越界

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
Change-Id: Ide5cb36dc03fc56480fdd45e7d96daed8557d849

* fix: 0x2F: 'VMASKMOVPD Ex, Gx, Vx'

Log: VMASKMOVPD写入内存时, 如果某些mask位是0则不进行读写避免访问越界

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
Change-Id: I037de8568e9d2d29597fdf08f991d54e3cb2f6d9

* fix: 0x8E: 'VPMASKMOVD/Q Ex, Vx, Gx'

Log: VPMASKMOVD/Q写入内存时, 如果某些mask位是0则不进行读写避免访问越界

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
Change-Id: I0dc98a29ed933d953e137e777bc296149d94b10b

* tests: add test for VPMASKMOV and VMASKMOVP

Log:

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>

---------

Signed-off-by: YeshunYe <yeyeshun@uniontech.com>
-rw-r--r--src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c272
-rw-r--r--tests/test30.c255
2 files changed, 459 insertions, 68 deletions
diff --git a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
index dcc455e1..633f35a9 100644
--- a/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
+++ b/src/dynarec/arm64/dynarec_arm64_avx_66_0f38.c
@@ -629,29 +629,112 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
         case 0x2C:
             INST_NAME("VMASKMOVPS Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
+            GETVX(v2, 0);
+            GETGX_empty(v0);
             q0 = fpu_get_scratch(dyn, ninst);
-            // create mask
             VSSHRQ_32(q0, v2, 31);
-            VANDQ(v0, v1, q0);
+            VEORQ(v0, v0, v0);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x3, (nextop & 7) + (rex.b << 3), 0);
+                VANDQ(v0, v1, q0);
+            } else {
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
+                EORx_REG(x4, x4, x4);
+                VMOVSto(x4, q0, 0);
+                CBZx(x4, 4+1*4);
+                VLD1_32(v0, 0, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 1);
+                CBZx(x4, 4+1*4);
+                VLD1_32(v0, 1, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 2);
+                CBZx(x4, 4+1*4);
+                VLD1_32(v0, 2, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 3);
+                CBZx(x4, 4+1*4);
+                VLD1_32(v0, 3, ed);
+                if(vex.l)
+                    ADDx_U12(ed, ed, 4);
+            }
             if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+                v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+                v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 VSSHRQ_32(q0, v2, 31);
-                VANDQ(v0, v1, q0);
+                VEORQ(v0, v0, v0);
+                if(MODREG)
+                {
+                    v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1);
+                    VANDQ(v0, v1, q0);
+                }
+                else
+                {
+                    VMOVSto(x4, q0, 0);
+                    CBZx(x4, 4+1*4);
+                    VLD1_32(v0, 0, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 1);
+                    CBZx(x4, 4+1*4);
+                    VLD1_32(v0, 1, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 2);
+                    CBZx(x4, 4+1*4);
+                    VLD1_32(v0, 2, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 3);
+                    CBZx(x4, 4+1*4);
+                    VLD1_32(v0, 3, ed);
+                }
             } else YMM0(gd);
             break;
         case 0x2D:
             INST_NAME("VMASKMOVPD Gx, Vx, Ex");
             nextop = F8;
-            GETGX_empty_VXEX(v0, v2, v1, 0);
+            GETVX(v2, 0);
+            GETGX_empty(v0);
             q0 = fpu_get_scratch(dyn, ninst);
             // create mask
             VSSHRQ_64(q0, v2, 63);
-            VANDQ(v0, v1, q0);
+            VEORQ(v0, v0, v0);
+            if (MODREG) {
+                v1 = sse_get_reg(dyn, ninst, x3, (nextop & 7) + (rex.b << 3), 0);
+                VANDQ(v0, v1, q0);
+            } else {
+                SMREAD();
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
+                EORx_REG(x4, x4, x4);
+                VMOVQDto(x4, q0, 0);
+                CBZx(x4, 4+1*4);
+                VLD1_64(v0, 0, ed);
+                ADDx_U12(ed, ed, 8);
+                VMOVQDto(x4, q0, 1);
+                CBZx(x4, 4+1*4);
+                VLD1_64(v0, 1, ed);
+                if(vex.l)
+                    ADDx_U12(ed, ed, 8);
+            }
             if(vex.l) {
-                GETGY_empty_VYEY(v0, v2, v1);
+                v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
+                v0 = ymm_get_reg_empty(dyn, ninst, x1, gd, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 VSSHRQ_64(q0, v2, 63);
-                VANDQ(v0, v1, q0);
+                VEORQ(v0, v0, v0);
+                if(MODREG)
+                {
+                    v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 0, gd, vex.v, -1);
+                    VANDQ(v0, v1, q0);
+                }
+                else
+                {
+                    VMOVQDto(x4, q0, 0);
+                    CBZx(x4, 4+1*4);
+                    VLD1_64(v0, 0, ed);
+                    ADDx_U12(ed, ed, 8);
+                    VMOVQDto(x4, q0, 1);
+                    CBZx(x4, 4+1*4);
+                    VLD1_64(v0, 1, ed);
+                }
             } else YMM0(gd);
             break;
         case 0x2E:
@@ -663,37 +746,55 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             VSSHRQ_32(q0, v2, 31);
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1);
+                VBITQ(v1, v0, q0);
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
-                unscaled = 0;
-                v1 = fpu_get_scratch(dyn, ninst);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                // check if mask has anything, else skip the whole read/write to avoid a SEGFAULT.
                // TODO: let a segfault trigger and check if the mask is null instead and ignore the segfault / actually trigger: needs to implement SSE reg tracking first!
-                SQXTN_32(v1, q0);
-                VMOVQDto(x4, v1, 0);
-                CBZx(x4, 4+3*4);
-                VLDR128_U12(v1, ed, fixedaddress);
-            }
-            // create mask
-            VBITQ(v1, v0, q0);
-            if(!MODREG) {
-                VSTR128_U12(v1, ed, fixedaddress);
+                EORx_REG(x4, x4, x4);
+                VMOVSto(x4, q0, 0);
+                CBZx(x4, 4+1*4);
+                VST1_32(v0, 0, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 1);
+                CBZx(x4, 4+1*4);
+                VST1_32(v0, 1, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 2);
+                CBZx(x4, 4+1*4);
+                VST1_32(v0, 2, ed);
+                ADDx_U12(ed, ed, 4);
+                VMOVSto(x4, q0, 3);
+                CBZx(x4, 4+1*4);
+                VST1_32(v0, 3, ed);
+                if(vex.l)
+                    ADDx_U12(ed, ed, 4);
             }
+
             if(vex.l && !is_avx_zero(dyn, ninst, vex.v)) {
                 v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 VSSHRQ_32(q0, v2, 31);
-                if(MODREG)
+                if(MODREG) {
                     v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1);
-                else {
-                    SQXTN_32(v1, q0);
-                    VMOVQDto(x4, v1, 0);
-                    CBZx(x4, 4+3*4);
-                    VLDR128_U12(v1, ed, fixedaddress+16);
+                    VBITQ(v1, v0, q0);
                 }
-                VBITQ(v1, v0, q0);
-                if(!MODREG) {
-                    VSTR128_U12(v1, ed, fixedaddress+16);
+                else {
+                    VMOVSto(x4, q0, 0);
+                    CBZx(x4, 4+1*4);
+                    VST1_32(v0, 0, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 1);
+                    CBZx(x4, 4+1*4);
+                    VST1_32(v0, 1, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 2);
+                    CBZx(x4, 4+1*4);
+                    VST1_32(v0, 2, ed);
+                    ADDx_U12(ed, ed, 4);
+                    VMOVSto(x4, q0, 3);
+                    CBZx(x4, 4+1*4);
+                    VST1_32(v0, 3, ed);
                 }
             }
             break;
@@ -701,43 +802,46 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             INST_NAME("VMASKMOVPD Ex, Gx, Vx");
             nextop = F8;
             q0 = fpu_get_scratch(dyn, ninst);
-            q1 = fpu_get_scratch(dyn, ninst);
             GETVX(v2, 0);
             GETGX(v0, 0);
             VSSHRQ_64(q0, v2, 63);
             if(MODREG) {
                 v1 = sse_get_reg(dyn, ninst, x3, (nextop&7)+(rex.b<<3), 1);
+                VBITQ(v1, v0, q0);
             } else {
-                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
+                addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
                 unscaled = 0;
                 v1 = fpu_get_scratch(dyn, ninst);
                // check if mask has anything, else skip the whole read/write to avoid a SEGFAULT.
                // TODO: let a segfault trigger and check if the mask is null instead and ignore the segfault / actually trigger: needs to implement SSE reg tracking first!
-                SQXTN_32(q1, q0);
-                VMOVQDto(x4, q1, 0);
-                CBZx(x4, 4+3*4);
-                VLDR128_U12(v1, ed, fixedaddress);
-            }
-            // create mask
-            VBITQ(v1, v0, q0);
-            if(!MODREG) {
-                VSTR128_U12(v1, ed, fixedaddress);
+                EORx_REG(x4, x4, x4);
+                VMOVQDto(x4, q0, 0);
+                CBZx(x4, 4+1*4);
+                VST1_64(v0, 0, ed);
+                ADDx_U12(ed, ed, 8);
+                VMOVQDto(x4, q0, 1);
+                CBZx(x4, 4+1*4);
+                VST1_64(v0, 1, ed);
+                if(vex.l)
+                    ADDx_U12(ed, ed, 8);
             }
+
             if(vex.l && !is_avx_zero(dyn, ninst, vex.v)) {
                 v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 v0 = ymm_get_reg(dyn, ninst, x1, gd, 0, vex.v, (MODREG)?((nextop&7)+(rex.b<<3)):-1, -1);
                 VSSHRQ_64(q0, v2, 63);
-                if(MODREG)
+                if(MODREG) {
                     v1 = ymm_get_reg(dyn, ninst, x1, (nextop&7)+(rex.b<<3), 1, gd, vex.v, -1);
-                else {
-                    SQXTN_32(q1, q0);
-                    VMOVQDto(x4, q1, 0);
-                    CBZx(x4, 4+3*4);
-                    VLDR128_U12(v1, ed, fixedaddress+16);
+                    VBITQ(v1, v0, q0);
                 }
-                VBITQ(v1, v0, q0);
-                if(!MODREG) {
-                    VSTR128_U12(v1, ed, fixedaddress+16);
+                else {
+                    VMOVQDto(x4, q0, 0);
+                    CBZx(x4, 4+1*4);
+                    VST1_64(v0, 0, ed);
+                    ADDx_U12(ed, ed, 8);
+                    VMOVQDto(x4, q0, 1);
+                    CBZx(x4, 4+1*4);
+                    VST1_64(v0, 1, ed);
                 }
             }
             break;
@@ -1174,31 +1278,65 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip
             nextop = F8;
             q0 = fpu_get_scratch(dyn, ninst);
             for(int l=0; l<1+vex.l; ++l) {
-                if(!l) {
-                    GETGX(v0, 0); GETVX(v2, 0);
-                    if(MODREG) {
+                if(MODREG) {
+                    if(!l) {
+                        GETGX(v0, 0); GETVX(v2, 0);
                         s0 = (nextop&7)+(rex.b<<3);
                         v1 = sse_get_reg_empty(dyn, ninst, x1, s0);
                     } else {
+                        GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1);
+                        v1 = ymm_get_reg_empty(dyn, ninst, x1, s0, gd, vex.v, -1);
+                    }
+                    if(rex.w)
+                        VSSHRQ_64(q0, v2, 63);
+                    else
+                        VSSHRQ_32(q0, v2, 31);
+                    VBITQ(v1, v0, q0);
+                } else {
+                    if(!l) {
+                        GETGX(v0, 0); GETVX(v2, 0);
                         s0 = -1;
                         v1 = fpu_get_scratch(dyn, ninst);
-                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0xffe<<4, 15, rex, NULL, 0, 0);
-                        VLDR128_U12(v1, ed, fixedaddress);
+                        addr = geted(dyn, addr, ninst, nextop, &ed, x3, &fixedaddress, NULL, 0, 0, rex, NULL, 0, 0);
+                        EORx_REG(x4, x4, x4);
+                    } else {
+                        GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1);
+                    }
+                    if(rex.w)
+                    {
+                        VSSHRQ_64(q0, v2, 63);
+                        VMOVQDto(x4, q0, 0);
+                        CBZx(x4, 4+1*4);
+                        VST1_64(v0, 0, ed);
+                        ADDx_U12(ed, ed, 8);
+                        VMOVQDto(x4, q0, 1);
+                        CBZx(x4, 4+1*4);
+                        VST1_64(v0, 1, ed);
+                        if(!l && vex.l)
+                            ADDx_U12(ed, ed, 8);
                     }
-                } else {
-                    GETGY(v0, 0, vex.v, s0, -1); v2 = ymm_get_reg(dyn, ninst, x1, vex.v, 0, gd, s0, -1);
-                    if(MODREG)
-                        v1 = ymm_get_reg_empty(dyn, ninst, x1, s0, gd, vex.v, -1);
                     else
-                        VLDR128_U12(v1, ed, fixedaddress+16);
+                    {
+                        VSSHRQ_32(q0, v2, 31);
+                        VMOVSto(x4, q0, 0);
+                        CBZx(x4, 4+1*4);
+                        VST1_32(v0, 0, ed);
+                        ADDx_U12(ed, ed, 4);
+                        VMOVSto(x4, q0, 1);
+                        CBZx(x4, 4+1*4);
+                        VST1_32(v0, 1, ed);
+                        ADDx_U12(ed, ed, 4);
+                        VMOVSto(x4, q0, 2);
+                        CBZx(x4, 4+1*4);
+                        VST1_32(v0, 2, ed);
+                        ADDx_U12(ed, ed, 4);
+                        VMOVSto(x4, q0, 3);
+                        CBZx(x4, 4+1*4);
+                        VST1_32(v0, 3, ed);
+                        if(!l && vex.l)
+                            ADDx_U12(ed, ed, 4);
+                    }
                 }
-                if(rex.w)
-                    VSSHRQ_64(q0, v2, 63);
-                else
-                    VSSHRQ_32(q0, v2, 31);
-                VBITQ(v1, v0, q0);
-                if(!MODREG)
-                    VSTR128_U12(v1, ed, fixedaddress+16*l);
             }
             // no raz of upper ymm
             break;
diff --git a/tests/test30.c b/tests/test30.c
index def89c2d..ecc0da57 100644
--- a/tests/test30.c
+++ b/tests/test30.c
@@ -1,4 +1,4 @@
-// build with  gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 -mavx test30.c -o test30
+// build with  gcc -O0 -g -msse -msse2 -mssse3 -msse4.1 -mavx test30.c -o test30 -march=native
 #include <inttypes.h>
 #include <string.h>
 #include <stdio.h>
@@ -8,6 +8,8 @@
 #include <math.h>
 #include <pmmintrin.h>
 #include <immintrin.h> 
+#include <sys/mman.h>
+#include <unistd.h>
 
 typedef unsigned char u8x16 __attribute__ ((vector_size (16)));
 typedef unsigned short u16x8 __attribute__ ((vector_size (16)));
@@ -15,6 +17,9 @@ typedef unsigned int  u32x4 __attribute__ ((vector_size (16)));
 typedef unsigned long int  u64x2 __attribute__ ((vector_size (16)));
 typedef float  f32x4 __attribute__ ((vector_size (16)));
 typedef double d64x2 __attribute__ ((vector_size (16)));
+int testVPMASKMOV();
+int testVMASKMOVP();
+static int ACCESS_TEST = 1;
 
 typedef union {
         __m128i mm;
@@ -736,7 +741,255 @@ printf(N " %g, %g => %g\n", b, a, *(float*)&r);
  MULITGO2Cps(dp, dpps, 0x3f)
  MULITGO2Cps(dp, dpps, 0xf3)
  MULITGO2Cps(dp, dpps, 0x53)
+// enabling this test requires updating test30 and ref30.txt
+//  ACCESS_TEST = 2;
+//  testVPMASKMOV();
+//  testVMASKMOVP();
+//  ACCESS_TEST = 1;
+//  testVPMASKMOV();
+//  testVMASKMOVP();
 
  return 0;
 }
 
+__m256i m256_setr_epi64x(long long a, long long b, long long c, long long d)
+{
+    union {
+        long long q[4];
+        int r[8];
+    } u;
+    u.q[0] = a; u.q[1] = b; u.q[2] = c; u.q[3] = d;
+    return _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]);
+}
+
+__m128i m128_setr_epi64x(long long a, long long b)
+{
+    union {
+        long long q[2];
+        int r[4];
+    } u;
+    u.q[0] = a; u.q[1] = b;
+    return _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]);
+}
+
+int testVPMASKMOV() {
+    long pageSize = sysconf(_SC_PAGESIZE);
+
+    void *baseAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (baseAddress == MAP_FAILED) {
+        printf("mmap failed\n");
+        return 1;
+    }
+    void *resultAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (resultAddress == MAP_FAILED) {
+        printf("mmap failed\n");
+        return 1;
+    }
+
+    int *intData = (int *)((char *)baseAddress + pageSize - 4 * ACCESS_TEST * sizeof(int)); // 32 bytes for 8 integers
+    int *intResult = (int *)((char *)resultAddress + pageSize - 4 * ACCESS_TEST * sizeof(int)); // 32 bytes for 8 integers
+
+    for (int i = 0; i < 4 * ACCESS_TEST; i++) {
+        intData[i] = i + 1;
+    }
+
+    __m256i mask256_int = _mm256_setr_epi32(-1, -1, -1, -1, 1 - ACCESS_TEST, 0, 1 - ACCESS_TEST, 0); // 32-bit mask
+    __m128i mask128_int = _mm_setr_epi32(-1, -1, 1 - ACCESS_TEST, 0); // 32-bit mask
+    __m256i mask256_long = m256_setr_epi64x(-1, -1, 1 - ACCESS_TEST, 0); // 64-bit mask
+    __m128i mask128_long = m128_setr_epi64x(-1, 0); // 64-bit mask
+    // ************************************************************** _mm256_maskload_epi32
+    __m256i loaded_int256 = _mm256_maskload_epi32(intData, mask256_int);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 8; i++) {
+        printf("%d ", ((int*)&loaded_int256)[i]);
+    }
+    printf("\n");
+
+    memset(resultAddress, 0, pageSize);
+    _mm256_maskstore_epi32(intResult, mask256_int, loaded_int256);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 4 * ACCESS_TEST; i++) {
+        printf("%d ", intResult[i]);
+    }
+    printf("\n");
+
+    // ************************************************************** _mm_maskload_epi32
+    __m128i loaded_int128 = _mm_maskload_epi32(intData, mask128_int);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 4; i++) {
+        printf("%d ", ((int*)&loaded_int128)[i]);
+    }
+    printf("\n");
+
+    memset(resultAddress, 0, pageSize);
+    _mm_maskstore_epi32(intResult, mask128_int, loaded_int128);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        printf("%d ", intResult[i]);
+    }
+    printf("\n");
+
+    long long *longData = (long long *)((char *)baseAddress + pageSize - 2 * ACCESS_TEST * sizeof(long long)); // 32 bytes for 4 long integers
+    long long *longResult = (long long *)((char *)resultAddress + pageSize - 2 * ACCESS_TEST * sizeof(long long)); // 32 bytes for 4 long integers
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        longData[i] = i + 1;
+    }
+
+    // ************************************************************** _mm256_maskload_epi64
+    __m256i loaded_long256 = _mm256_maskload_epi64(longData, mask256_long);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 4; i++) {
+        printf("%lld ", ((long long*)&loaded_long256)[i]);
+    }
+    printf("\n");
+
+    memset(resultAddress, 0, pageSize);
+    _mm256_maskstore_epi64(longResult, mask256_long, loaded_long256);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        printf("%lld ", longResult[i]);
+    }
+    printf("\n");
+
+    // ************************************************************** _mm_maskload_epi64
+    __m128i loaded_long128 = _mm_maskload_epi64(longData, mask128_long);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 2; i++) {
+        printf("%lld ", ((long long*)&loaded_long128)[i]);
+    }
+    printf("\n");
+
+    //  _mm_maskstore_epi64
+    memset(resultAddress, 0, pageSize);
+    _mm_maskstore_epi64(longResult, mask128_long, loaded_long128);
+    printf("VPMASKMOV ");
+    for (int i = 0; i < 1 * ACCESS_TEST; i++) {
+        printf("%lld ", longResult[i]);
+    }
+    printf("\n");
+
+    munmap(baseAddress, pageSize);
+    munmap(resultAddress, pageSize);
+
+    return 0;
+}
+
+int testVMASKMOVP() {
+    long pageSize = sysconf(_SC_PAGESIZE);
+
+    void *baseAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (baseAddress == MAP_FAILED) {
+        perror("mmap failed");
+        return 1;
+    }
+    void *destAddress = mmap(NULL, pageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (destAddress == MAP_FAILED) {
+        perror("mmap failed");
+        return 1;
+    }
+
+    float *floatData = (float *)((char *)baseAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 4 floats
+    float *floatDest = (float *)((char *)destAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 4 floats
+
+    int mask_data[8] = { -1, 0, -1, -1, 0, 1 - ACCESS_TEST, 0, 0 };  // -1 is 0xFFFFFFFF in binary (most-significant bit set)
+    __m256i mask256ps = _mm256_loadu_si256((__m256i const *)mask_data);
+    __m256i mask256pd = _mm256_setr_epi64x(-1, -1, 0, 1 - ACCESS_TEST);
+    __m128i mask128 = _mm_setr_epi32(-1, -1, 0, 1 - ACCESS_TEST);
+
+    //=================================================================================
+    //  _mm256_maskload_ps
+    for (int i = 0; i < 4 * ACCESS_TEST; i++) {
+        floatData[i] = (float)(i + 1);
+    }
+
+    __m256 floatVec = _mm256_maskload_ps(floatData, mask256ps);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 8; i++) {
+        printf("%f ", ((float*)&floatVec)[i]);
+    }
+    printf("\n");
+
+    //  _mm256_maskstore_ps
+    memset(destAddress, 0, pageSize);
+    _mm256_maskstore_ps(floatDest, mask256ps, floatVec);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 4 * ACCESS_TEST; i++) {
+        printf("%f ", floatDest[i]);
+    }
+    printf("\n");
+
+    //=================================================================================
+    for (int i = 0; i < 4 * ACCESS_TEST; i++) {
+        floatData[i] = (float)(i + 10);
+    }
+
+    //  _mm_maskload_ps
+    __m128 floatVec128 = _mm_maskload_ps(floatData, mask128);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 4; i++) {
+        printf("%f ", ((float*)&floatVec128)[i]);
+    }
+    printf("\n");
+
+    //  _mm_maskstore_ps
+    memset(destAddress, 0, pageSize);
+    _mm_maskstore_ps(floatDest, mask128, floatVec128);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        printf("%f ", floatDest[i]);
+    }
+    printf("\n");
+
+    //=================================================================================
+    double *doubleData = (double *)((char *)baseAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 2 doubles
+    double *doubleDest = (double *)((char *)destAddress + pageSize - 16 * ACCESS_TEST); // 16 bytes for 2 doubles
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        doubleData[i] = (double)(i + 20);
+    }
+
+    //  _mm256_maskload_pd
+    __m256d doubleVec = _mm256_maskload_pd(doubleData, mask256pd);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 4; i++) {
+        printf("%lf ", ((double *)&doubleVec)[i]);
+    }
+    printf("\n");
+
+    //  _mm256_maskstore_pd
+    memset(destAddress, 0, pageSize);
+    _mm256_maskstore_pd(doubleDest, mask256pd, doubleVec);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        printf("%f ", doubleDest[i]);
+    }
+    printf("\n");
+
+    //=================================================================================
+    for (int i = 0; i < 2 * ACCESS_TEST; i++) {
+        doubleData[i] = (double)(i + 30);
+    }
+
+    //  _mm_maskload_pd
+    __m128d doubleVec128 = _mm_maskload_pd(doubleData, mask128);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 2; i++) {
+        printf("%lf ", ((double *)&doubleVec128)[i]);
+    }
+    printf("\n");
+
+    //  _mm_maskstore_pd
+    memset(destAddress, 0, pageSize);
+    _mm_maskstore_pd(doubleDest, mask128, doubleVec128);
+    printf("VMASKMOVP ");
+    for (int i = 0; i < 1 * ACCESS_TEST; i++) {
+        printf("%f ", doubleDest[i]);
+    }
+    printf("\n");
+
+    //=================================================================================
+
+    munmap(baseAddress, pageSize);
+    munmap(destAddress, pageSize);
+
+    return 0;
+}
\ No newline at end of file