about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Yang Liu <liuyang22@iscas.ac.cn> 2025-08-25 19:36:16 +0800
committer GitHub <noreply@github.com> 2025-08-25 13:36:16 +0200
commit d71200de69bb38248ebeb482cc2366e5d1e0a9c1 (patch)
tree 6cd1589287544e2b5722ad1a87b317cd9977e826 /src
parent 8579ef84bd003de01f64257abbc1ee0544c85682 (diff)
download box64-d71200de69bb38248ebeb482cc2366e5d1e0a9c1.tar.gz
box64-d71200de69bb38248ebeb482cc2366e5d1e0a9c1.zip
[RV64_DYNAREC] Added more scalar avx opcodes (#2971)
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_660f.c 2
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_avx_66_0f.c 546
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_helper.h 27
3 files changed, 571 insertions, 4 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f.c b/src/dynarec/rv64/dynarec_rv64_660f.c
index 0a37495c..d5fb90dd 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f.c
@@ -833,7 +833,7 @@ uintptr_t dynarec64_660F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             GETEX(x2, 0, 8);
             SSE_LOOP_MV_Q(x3);
             break;
-        case 0x70: // TODO: Optimize this!
+        case 0x70:
             INST_NAME("PSHUFD Gx,Ex,Ib");
             nextop = F8;
             GETGX();
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
index 1e3e0227..88005608 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
@@ -707,6 +707,83 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0x63:
+        case 0x67:
+            if (opcode == 0x63)
+                INST_NAME("VPACKSSWB Gx, Vx, Ex");
+            else
+                INST_NAME("VPACKUSWB Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 30 : 14);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, vback, vxoffset + i * 2);
+                if (opcode == 0x63)
+                    SAT8(x3, x6);
+                else
+                    SATU8(x3, x6);
+                SB(x3, gback, gdoffset + i);
+            }
+            if (vex.v == ed) {
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 8; ++i) {
+                    LH(x3, wback, fixedaddress + i * 2);
+                    if (opcode == 0x63)
+                        SAT8(x3, x6);
+                    else
+                        SATU8(x3, x6);
+                    SB(x3, gback, gdoffset + 8 + i);
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 8; ++i) {
+                    LH(x3, vback, vyoffset + i * 2);
+                    if (opcode == 0x63)
+                        SAT8(x3, x6);
+                    else
+                        SATU8(x3, x6);
+                    SB(x3, gback, gyoffset + i);
+                }
+                if (vex.v == ed) {
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 8; ++i) {
+                        LH(x3, wback, fixedaddress + i * 2);
+                        if (opcode == 0x63)
+                            SAT8(x3, x6);
+                        else
+                            SATU8(x3, x6);
+                        SB(x3, gback, gyoffset + 8 + i);
+                    }
+                }
+            } else
+                YMM0(gd);
+            break;
         case 0x64:
             INST_NAME("VPCMPGTB Gx, Vx, Ex");
             nextop = F8;
@@ -788,6 +865,184 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0x68:
+            INST_NAME("VPUNPCKHBW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 31 : 15);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            for (int i = 0; i < 8; ++i) {
+                LBU(x3, vback, vxoffset + i + 8);
+                LBU(x4, wback, fixedaddress + i + 8);
+                SB(x3, gback, gdoffset + i * 2);
+                SB(x4, gback, gdoffset + i * 2 + 1);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 8; ++i) {
+                    LBU(x3, vback, vyoffset + i + 8);
+                    LBU(x4, wback, fixedaddress + i + 8);
+                    SB(x3, gback, gyoffset + i * 2);
+                    SB(x4, gback, gyoffset + i * 2 + 1);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0x69:
+            INST_NAME("VPUNPCKHWD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 30 : 14);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            for (int i = 0; i < 4; ++i) {
+                LHU(x3, vback, vxoffset + i * 2 + 8);
+                LHU(x4, wback, fixedaddress + i * 2 + 8);
+                SH(x3, gback, gdoffset + i * 4);
+                SH(x4, gback, gdoffset + i * 4 + 2);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 4; ++i) {
+                    LHU(x3, vback, vyoffset + i * 2 + 8);
+                    LHU(x4, wback, fixedaddress + i * 2 + 8);
+                    SH(x3, gback, gyoffset + i * 4);
+                    SH(x4, gback, gyoffset + i * 4 + 2);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0x6A:
+            INST_NAME("VPUNPCKHDQ Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 28 : 12);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            for (int i = 0; i < 2; ++i) {
+                LWU(x3, vback, vxoffset + i * 4 + 8);
+                LWU(x4, wback, fixedaddress + i * 4 + 8);
+                SW(x3, gback, gdoffset + i * 8);
+                SW(x4, gback, gdoffset + i * 8 + 4);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 2; ++i) {
+                    LWU(x3, vback, vyoffset + i * 4 + 8);
+                    LWU(x4, wback, fixedaddress + i * 4 + 8);
+                    SW(x3, gback, gyoffset + i * 8);
+                    SW(x4, gback, gyoffset + i * 8 + 4);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0x6B:
+            INST_NAME("VPACKSSDW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 28 : 12);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            if (gd == ed) {
+                ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
+                LD(x3, wback, fixedaddress + 0);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, x5, 0);
+                SD(x4, x5, 8);
+                wback = x5;
+                fixedaddress = 0;
+            }
+            for (int i = 0; i < 4; ++i) {
+                LW(x3, vback, vxoffset + i * 4);
+                SAT16(x3, x6);
+                SH(x3, gback, gdoffset + i * 2);
+            }
+            if (vex.v == ed) {
+                LD(x3, gback, gdoffset + 0);
+                SD(x3, gback, gdoffset + 8);
+            } else {
+                for (int i = 0; i < 4; ++i) {
+                    LW(x3, wback, fixedaddress + i * 4);
+                    SAT16(x3, x6);
+                    SH(x3, gback, gdoffset + (4 + i) * 2);
+                }
+            }
+            if (vex.l) {
+                GETEY();
+                if (gd == ed) {
+                    ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
+                    LD(x3, wback, fixedaddress + 0);
+                    LD(x4, wback, fixedaddress + 8);
+                    SD(x3, x5, 0);
+                    SD(x4, x5, 8);
+                    wback = x5;
+                    fixedaddress = 0;
+                }
+                for (int i = 0; i < 4; ++i) {
+                    LW(x3, vback, vyoffset + i * 4);
+                    SAT16(x3, x6);
+                    SH(x3, gback, gyoffset + i * 2);
+                }
+                if (vex.v == ed) {
+                    LD(x3, gback, gyoffset + 0);
+                    SD(x3, gback, gyoffset + 8);
+                } else {
+                    for (int i = 0; i < 4; ++i) {
+                        LW(x3, wback, fixedaddress + i * 4);
+                        SAT16(x3, x6);
+                        SH(x3, gback, gyoffset + (4 + i) * 2);
+                    }
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0x6C:
+            INST_NAME("VPUNPCKLQDQ Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 16 : 1);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            LD(x3, vback, vxoffset + 0);
+            LD(x4, wback, fixedaddress + 0);
+            SD(x3, gback, gdoffset + 0);
+            SD(x4, gback, gdoffset + 8);
+            if (vex.l) {
+                GETEY();
+                LD(x3, vback, vyoffset + 0);
+                LD(x4, wback, fixedaddress + 0);
+                SD(x3, gback, gyoffset + 0);
+                SD(x4, gback, gyoffset + 8);
+            } else
+                YMM0(gd);
+            break;
+        case 0x6D:
+            INST_NAME("VPUNPCKHQDQ Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 24 : 8);
+            GETGX();
+            GETVX();
+            GETGY();
+            GETVY();
+            LD(x3, vback, vxoffset + 8);
+            LD(x4, wback, fixedaddress + 8);
+            SD(x3, gback, gdoffset + 0);
+            SD(x4, gback, gdoffset + 8);
+            if (vex.l) {
+                GETEY();
+                LD(x3, vback, vyoffset + 8);
+                LD(x4, wback, fixedaddress + 8);
+                SD(x3, gback, gyoffset + 0);
+                SD(x4, gback, gyoffset + 8);
+            } else
+                YMM0(gd);
+            break;
         case 0x6E:
             INST_NAME("VMOVD Gx, Ed");
             nextop = F8;
@@ -821,6 +1076,35 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0x70:
+            INST_NAME("VPSHUFD Gx, Ex, Ib");
+            nextop = F8;
+            GETEX(x2, 1, vex.l ? 28 : 12);
+            GETGX();
+            GETGY();
+            u8 = F8;
+
+            LWU(x3, wback, fixedaddress + ((u8 >> (0 * 2)) & 3) * 4);
+            LWU(x4, wback, fixedaddress + ((u8 >> (1 * 2)) & 3) * 4);
+            LWU(x5, wback, fixedaddress + ((u8 >> (2 * 2)) & 3) * 4);
+            LWU(x6, wback, fixedaddress + ((u8 >> (3 * 2)) & 3) * 4);
+            SW(x3, gback, gdoffset + 0 * 4);
+            SW(x4, gback, gdoffset + 1 * 4);
+            SW(x5, gback, gdoffset + 2 * 4);
+            SW(x6, gback, gdoffset + 3 * 4);
+            if (vex.l) {
+                GETEY();
+                LWU(x3, wback, fixedaddress + ((u8 >> (0 * 2)) & 3) * 4);
+                LWU(x4, wback, fixedaddress + ((u8 >> (1 * 2)) & 3) * 4);
+                LWU(x5, wback, fixedaddress + ((u8 >> (2 * 2)) & 3) * 4);
+                LWU(x6, wback, fixedaddress + ((u8 >> (3 * 2)) & 3) * 4);
+                SW(x3, gback, gyoffset + 0 * 4);
+                SW(x4, gback, gyoffset + 1 * 4);
+                SW(x5, gback, gyoffset + 2 * 4);
+                SW(x6, gback, gyoffset + 3 * 4);
+            } else
+                YMM0(gd);
+            break;
         case 0x7E:
             INST_NAME("VMOVD Ed, Gx");
             nextop = F8;
@@ -1027,6 +1311,163 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0xD1:
+            INST_NAME("VPSRLW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 16);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 8; ++i) {
+                LHU(x5, vback, vxoffset + 2 * i);
+                SRLW(x5, x5, x3);
+                SH(x5, gback, gdoffset + 2 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 8; ++i) {
+                    LHU(x5, vback, vyoffset + 2 * i);
+                    SRLW(x5, x5, x3);
+                    SH(x5, gback, gyoffset + 2 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xD2:
+            INST_NAME("VPSRLD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 32);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 4; ++i) {
+                LWU(x5, vback, vxoffset + 4 * i);
+                SRLW(x5, x5, x3);
+                SW(x5, gback, gdoffset + 4 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 4; ++i) {
+                    LWU(x5, vback, vyoffset + 4 * i);
+                    SRLW(x5, x5, x3);
+                    SW(x5, gback, gyoffset + 4 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xD3:
+            INST_NAME("VPSRLQ Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 64);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 2; ++i) {
+                LD(x5, vback, vxoffset + 8 * i);
+                SRL(x5, x5, x3);
+                SD(x5, gback, gdoffset + 8 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 2; ++i) {
+                    LD(x5, vback, vyoffset + 8 * i);
+                    SRL(x5, x5, x3);
+                    SD(x5, gback, gyoffset + 8 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xE1:
+            INST_NAME("VPSRAW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            ADDI(x4, xZR, 16);
+            LD(x3, wback, fixedaddress);
+            BLTU(x3, x4, 8);
+            SUBI(x3, x4, 1);
+            for (int i = 0; i < 8; ++i) {
+                LH(x5, vback, vxoffset + 2 * i);
+                SRAW(x5, x5, x3);
+                SH(x5, gback, gdoffset + 2 * i);
+            }
+            if (vex.l) {
+                for (int i = 0; i < 8; ++i) {
+                    LH(x5, vback, vyoffset + 2 * i);
+                    SRAW(x5, x5, x3);
+                    SH(x5, gback, gyoffset + 2 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xE2:
+            INST_NAME("VPSRAD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            ADDI(x4, xZR, 32);
+            LD(x3, wback, fixedaddress);
+            BLTU(x3, x4, 8);
+            SUBI(x3, x4, 1);
+            for (int i = 0; i < 4; ++i) {
+                LW(x5, vback, vxoffset + 4 * i);
+                SRAW(x5, x5, x3);
+                SW(x5, gback, gdoffset + 4 * i);
+            }
+            if (vex.l) {
+                for (int i = 0; i < 4; ++i) {
+                    LW(x5, vback, vyoffset + 4 * i);
+                    SRAW(x5, x5, x3);
+                    SW(x5, gback, gyoffset + 4 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
         case 0xEF:
             INST_NAME("VPXOR Gx, Vx, Ex");
             nextop = F8;
@@ -1056,6 +1497,111 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0xF1:
+            INST_NAME("VPSLLW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 16);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 8; ++i) {
+                LHU(x5, vback, vxoffset + 2 * i);
+                SLLW(x5, x5, x3);
+                SH(x5, gback, gdoffset + 2 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 8; ++i) {
+                    LHU(x5, vback, vyoffset + 2 * i);
+                    SLLW(x5, x5, x3);
+                    SH(x5, gback, gyoffset + 2 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xF2:
+            INST_NAME("VPSLLD Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 32);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 4; ++i) {
+                LWU(x5, vback, vxoffset + 4 * i);
+                SLLW(x5, x5, x3);
+                SW(x5, gback, gdoffset + 4 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 4; ++i) {
+                    LWU(x5, vback, vyoffset + 4 * i);
+                    SLLW(x5, x5, x3);
+                    SW(x5, gback, gyoffset + 4 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xF3:
+            INST_NAME("VPSLLQ Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x2, 0, 1);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LD(x3, wback, fixedaddress);
+            ADDI(x4, xZR, 64);
+            BLTU_MARK(x3, x4);
+            SD(xZR, gback, gdoffset + 0);
+            SD(xZR, gback, gdoffset + 8);
+            B_MARK2_nocond;
+            MARK;
+            for (int i = 0; i < 2; ++i) {
+                LD(x5, vback, vxoffset + 8 * i);
+                SLL(x5, x5, x3);
+                SD(x5, gback, gdoffset + 8 * i);
+            }
+            MARK2;
+            if (vex.l) {
+                BLTU_MARK3(x3, x4);
+                SD(xZR, gback, gyoffset + 0);
+                SD(xZR, gback, gyoffset + 8);
+                B_NEXT_nocond;
+                MARK3;
+                for (int i = 0; i < 2; ++i) {
+                    LD(x5, vback, vyoffset + 8 * i);
+                    SLL(x5, x5, x3);
+                    SD(x5, gback, gyoffset + 8 * i);
+                }
+            } else
+                YMM0(gd);
+            break;
         default:
             DEFAULT;
     }
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 9b7cec7a..03bf3cf6 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -770,13 +770,13 @@
 #define B_MARKi_nocond Bxx_geni(__, MARK, 0, 0, i)
 // Branch to MARK if reg1<reg2 (use j64)
 #define BLT_MARK(reg1, reg2)  Bxx_gen(LT, MARK, reg1, reg2)
-#define BLT_MARKi(reg1, reg2) Bxx_geni(LT, MARK, reg1, reg2, i)
+#define BLT_MARKi(reg1, reg2, i) Bxx_geni(LT, MARK, reg1, reg2, i)
 // Branch to MARK if reg1<reg2 (use j64)
 #define BLTU_MARK(reg1, reg2)  Bxx_gen(LTU, MARK, reg1, reg2)
-#define BLTU_MARKi(reg1, reg2) Bxx_geni(LTU, MARK, reg1, reg2, i)
+#define BLTU_MARKi(reg1, reg2, i) Bxx_geni(LTU, MARK, reg1, reg2, i)
 // Branch to MARK if reg1>=reg2 (use j64)
 #define BGE_MARK(reg1, reg2)  Bxx_gen(GE, MARK, reg1, reg2)
-#define BGE_MARKi(reg1, reg2) Bxx_geni(GE, MARK, reg1, reg2, i)
+#define BGE_MARKi(reg1, reg2, i) Bxx_geni(GE, MARK, reg1, reg2, i)
 // Branch to MARK2 if reg1==reg2 (use j64)
 #define BEQ_MARK2(reg1, reg2) Bxx_gen(EQ, MARK2, reg1, reg2)
 // Branch to MARK2 if reg1!=reg2 (use j64)
@@ -793,6 +793,8 @@
 #define BNE_MARK3(reg1, reg2) Bxx_gen(NE, MARK3, reg1, reg2)
 // Branch to MARK3 if reg1!>=reg2 (use j64)
 #define BGE_MARK3(reg1, reg2) Bxx_gen(GE, MARK3, reg1, reg2)
+// Branch to MARK3 if reg1<reg2, unsigned compare (use j64)
+#define BLTU_MARK3(reg1, reg2) Bxx_gen(LTU, MARK3, reg1, reg2)
 // Branch to MARK3 if reg1!=0 (use j64)
 #define BNEZ_MARK3(reg) BNE_MARK3(reg, xZR)
 // Branch to MARK3 if reg1==0 (use j64)
@@ -1955,6 +1957,25 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
 #define PURGE_YMM()
 
 // TODO: zbb?
+#define SAT8(reg, s) /* emit code clamping the signed value in reg to [-128, 127]; s is clobbered as scratch */ \
+    do {                                 \
+        ADDIW(s, xZR, 0xF80); /* -128 */ \
+        BGE(reg, s, 4 + 4); /* reg >= -128: offset 8 presumably skips the next 4-byte instruction */ \
+        MV(reg, s); /* below range: reg = -128 */ \
+        ADDIW(s, xZR, 0x80); /* 128 */   \
+        BLT(reg, s, 4 + 4); /* reg < 128: already in range, skip the upper clamp */ \
+        ADDIW(reg, s, -1); /* above range: reg = 128 - 1 = 127 */ \
+    } while (0)
+
+#define SATU8(reg, s) /* emit code clamping the signed value in reg to [0, 255]; s is clobbered as scratch */ \
+    do {                                \
+        ADDIW(s, xZR, 0x100); /* 256 */ \
+        BGE(reg, xZR, 4 + 4); /* reg >= 0: offset 8 presumably skips the next 4-byte instruction */ \
+        MV(reg, xZR); /* negative input: reg = 0 */ \
+        BLT(reg, s, 4 + 4); /* reg < 256: already in range, skip the upper clamp */ \
+        ADDIW(reg, s, -1); /* too large: reg = 256 - 1 = 255 */ \
+    } while (0)
+
 #define SAT16(reg, s)                 \
     do {                              \
         LUI(s, 0xFFFF8); /* -32768 */ \