about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/dynarec/rv64/dynarec_rv64_660f38.c19
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_66_0f.c144
-rw-r--r--src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c34
-rw-r--r--src/dynarec/rv64/dynarec_rv64_helper.h46
4 files changed, 176 insertions, 67 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_660f38.c b/src/dynarec/rv64/dynarec_rv64_660f38.c
index 6ba960e7..1e9a357a 100644
--- a/src/dynarec/rv64/dynarec_rv64_660f38.c
+++ b/src/dynarec/rv64/dynarec_rv64_660f38.c
@@ -141,13 +141,15 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     INST_NAME("PHADDSW Gx, Ex");
                     nextop = F8;
                     GETGX();
+                    LUI(x6, 0xFFFF8); // -32768
+                    LUI(x7, 0x8);     // 32768
                     for (int i = 0; i < 4; ++i) {
                         // tmp32s = GX->sw[i*2+0]+GX->sw[i*2+1];
                         // GX->sw[i] = sat(tmp32s);
                         LH(x3, gback, gdoffset + 2 * (i * 2 + 0));
                         LH(x4, gback, gdoffset + 2 * (i * 2 + 1));
                         ADDW(x3, x3, x4);
-                        SAT16(x3, x6);
+                        SATw(x3, x6, x7);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     if (MODREG && gd == (nextop & 7) + (rex.b << 3)) {
@@ -162,7 +164,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                             LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                             LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                             ADDW(x3, x3, x4);
-                            SAT16(x3, x6);
+                            SATw(x3, x6, x7);
                             SH(x3, gback, gdoffset + 2 * (4 + i));
                         }
                     }
@@ -172,15 +174,17 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0, 15);
+                    LUI(x6, 0xFFFF8); // -32768
+                    LUI(x7, 0x8);     // 32768
                     for (int i = 0; i < 8; ++i) {
                         LBU(x3, gback, gdoffset + i * 2);
                         LB(x4, wback, fixedaddress + i * 2);
-                        MUL(x7, x3, x4);
+                        MUL(x1, x3, x4);
                         LBU(x3, gback, gdoffset + i * 2 + 1);
                         LB(x4, wback, fixedaddress + i * 2 + 1);
                         MUL(x3, x3, x4);
-                        ADD(x3, x3, x7);
-                        SAT16(x3, x6);
+                        ADD(x3, x3, x1);
+                        SATw(x3, x6, x7);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     break;
@@ -467,9 +471,10 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     nextop = F8;
                     GETGX();
                     GETEX(x2, 0, 12);
+                    LUI(x5, 0x10); // 65536
                     for (int i = 0; i < 4; ++i) {
                         LW(x3, gback, gdoffset + i * 4);
-                        SATU16(x3, x5);
+                        SATUw(x3, x5);
                         SH(x3, gback, gdoffset + i * 2);
                     }
                     if (MODREG && gd == ed) {
@@ -478,7 +483,7 @@ uintptr_t dynarec64_660F38(dynarec_rv64_t* dyn, uintptr_t addr, uint8_t opcode,
                     } else
                         for (int i = 0; i < 4; ++i) {
                             LW(x3, wback, fixedaddress + i * 4);
-                            SATU16(x3, x5);
+                            SATUw(x3, x5);
                             SH(x3, gback, gdoffset + 8 + i * 2);
                         }
                     break;
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
index 372e0faa..f19a3f32 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f.c
@@ -719,6 +719,12 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGY();
             GETVX();
             GETVY();
+            if (opcode == 0x63) {
+                ADDIW(x6, xZR, 0xF80); // -128
+                ADDIW(x7, xZR, 0x80);  // 128
+            } else {
+                ADDIW(x6, xZR, 0x100); // 256
+            }
             if (gd == ed) {
                 ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
                 LD(x3, wback, fixedaddress + 0);
@@ -731,9 +737,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             for (int i = 0; i < 8; ++i) {
                 LH(x3, vback, vxoffset + i * 2);
                 if (opcode == 0x63)
-                    SAT8(x3, x6);
+                    SATw(x3, x6, x7);
                 else
-                    SATU8(x3, x6);
+                    SATUw(x3, x6);
                 SB(x3, gback, gdoffset + i);
             }
             if (vex.v == ed) {
@@ -743,9 +749,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 for (int i = 0; i < 8; ++i) {
                     LH(x3, wback, fixedaddress + i * 2);
                     if (opcode == 0x63)
-                        SAT8(x3, x6);
+                        SATw(x3, x6, x7);
                     else
-                        SATU8(x3, x6);
+                        SATUw(x3, x6);
                     SB(x3, gback, gdoffset + 8 + i);
                 }
             }
@@ -763,9 +769,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 for (int i = 0; i < 8; ++i) {
                     LH(x3, vback, vyoffset + i * 2);
                     if (opcode == 0x63)
-                        SAT8(x3, x6);
+                        SATw(x3, x6, x7);
                     else
-                        SATU8(x3, x6);
+                        SATUw(x3, x6);
                     SB(x3, gback, gyoffset + i);
                 }
                 if (vex.v == ed) {
@@ -775,9 +781,9 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                     for (int i = 0; i < 8; ++i) {
                         LH(x3, wback, fixedaddress + i * 2);
                         if (opcode == 0x63)
-                            SAT8(x3, x6);
+                            SATw(x3, x6, x7);
                         else
-                            SATU8(x3, x6);
+                            SATUw(x3, x6);
                         SB(x3, gback, gyoffset + 8 + i);
                     }
                 }
@@ -948,6 +954,8 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             GETGY();
             GETVX();
             GETVY();
+            LUI(x6, 0xFFFF8); // -32768
+            LUI(x7, 0x8);     // 32768
             if (gd == ed) {
                 ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
                 LD(x3, wback, fixedaddress + 0);
@@ -959,7 +967,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             }
             for (int i = 0; i < 4; ++i) {
                 LW(x3, vback, vxoffset + i * 4);
-                SAT16(x3, x6);
+                SATw(x3, x6, x7);
                 SH(x3, gback, gdoffset + i * 2);
             }
             if (vex.v == ed) {
@@ -968,7 +976,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else {
                 for (int i = 0; i < 4; ++i) {
                     LW(x3, wback, fixedaddress + i * 4);
-                    SAT16(x3, x6);
+                    SATw(x3, x6, x7);
                     SH(x3, gback, gdoffset + (4 + i) * 2);
                 }
             }
@@ -985,7 +993,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 }
                 for (int i = 0; i < 4; ++i) {
                     LW(x3, vback, vyoffset + i * 4);
-                    SAT16(x3, x6);
+                    SATw(x3, x6, x7);
                     SH(x3, gback, gyoffset + i * 2);
                 }
                 if (vex.v == ed) {
@@ -994,7 +1002,7 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
                 } else {
                     for (int i = 0; i < 4; ++i) {
                         LW(x3, wback, fixedaddress + i * 4);
-                        SAT16(x3, x6);
+                        SATw(x3, x6, x7);
                         SH(x3, gback, gyoffset + (4 + i) * 2);
                     }
                 }
@@ -2141,6 +2149,118 @@ uintptr_t dynarec64_AVX_66_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
             } else
                 YMM0(gd);
             break;
+        case 0xE4:
+            INST_NAME("VPMULHUW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 30 : 14);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            for (int i = 0; i < 8; ++i) {
+                LHU(x3, vback, vxoffset + i * 2);
+                LHU(x4, wback, fixedaddress + i * 2);
+                MULW(x3, x3, x4);
+                SRLIW(x3, x3, 16);
+                SH(x3, gback, gdoffset + i * 2);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 8; ++i) {
+                    LHU(x3, vback, vyoffset + i * 2);
+                    LHU(x4, wback, fixedaddress + i * 2);
+                    MULW(x3, x3, x4);
+                    SRLIW(x3, x3, 16);
+                    SH(x3, gback, gyoffset + i * 2);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xE5:
+            INST_NAME("VPMULHW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 30 : 14);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, vback, vxoffset + i * 2);
+                LH(x4, wback, fixedaddress + i * 2);
+                MULW(x3, x3, x4);
+                SRAIW(x3, x3, 16);
+                SH(x3, gback, gdoffset + i * 2);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 8; ++i) {
+                    LH(x3, vback, vyoffset + i * 2);
+                    LH(x4, wback, fixedaddress + i * 2);
+                    MULW(x3, x3, x4);
+                    SRAIW(x3, x3, 16);
+                    SH(x3, gback, gyoffset + i * 2);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xE8:
+            INST_NAME("VPSUBSB Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 31 : 15);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            ADDIW(x6, xZR, 0xF80); // -128
+            ADDIW(x7, xZR, 0x80);  // 128
+            for (int i = 0; i < 16; ++i) {
+                LB(x3, vback, vxoffset + i);
+                LB(x4, wback, fixedaddress + i);
+                SUBW(x3, x3, x4);
+                SATw(x3, x6, x7);
+                SB(x3, gback, gdoffset + i);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 16; ++i) {
+                    LB(x3, vback, vyoffset + i);
+                    LB(x4, wback, fixedaddress + i);
+                    SUBW(x3, x3, x4);
+                    SATw(x3, x6, x7);
+                    SB(x3, gback, gyoffset + i);
+                }
+            } else
+                YMM0(gd);
+            break;
+        case 0xE9:
+            INST_NAME("VPSUBSW Gx, Vx, Ex");
+            nextop = F8;
+            GETEX(x1, 0, vex.l ? 30 : 14);
+            GETGX();
+            GETGY();
+            GETVX();
+            GETVY();
+            LUI(x6, 0xFFFF8); // -32768
+            LUI(x7, 0x8);     // 32768
+            for (int i = 0; i < 8; ++i) {
+                LH(x3, vback, vxoffset + i * 2);
+                LH(x4, wback, fixedaddress + i * 2);
+                SUBW(x3, x3, x4);
+                SATw(x3, x6, x7);
+                SH(x3, gback, gdoffset + i * 2);
+            }
+            if (vex.l) {
+                GETEY();
+                for (int i = 0; i < 8; ++i) {
+                    LH(x3, vback, vyoffset + i * 2);
+                    LH(x4, wback, fixedaddress + i * 2);
+                    SUBW(x3, x3, x4);
+                    SATw(x3, x6, x7);
+                    SH(x3, gback, gyoffset + i * 2);
+                }
+            } else
+                YMM0(gd);
+            break;
         case 0xEF:
             INST_NAME("VPXOR Gx, Vx, Ex");
             nextop = F8;
diff --git a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
index 3c81181a..40ec2518 100644
--- a/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
+++ b/src/dynarec/rv64/dynarec_rv64_avx_66_0f38.c
@@ -110,6 +110,8 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETVX();
             GETGY();
             GETVY();
+            LUI(x6, 0xFFFF8); // -32768
+            LUI(x7, 0x8);     // 32768
             if (gd == ed) {
                 ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
                 LD(x3, wback, fixedaddress + 0);
@@ -124,7 +126,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
                 LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
                 ADDW(x3, x3, x4);
-                if (opcode == 0x03) SAT16(x3, x6);
+                if (opcode == 0x03) SATw(x3, x6, x7);
                 SH(x3, gback, gdoffset + 2 * i);
             }
             if (MODREG && ed == vex.v) {
@@ -137,7 +139,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                     LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                     ADDW(x3, x3, x4);
-                    if (opcode == 0x03) SAT16(x3, x6);
+                    if (opcode == 0x03) SATw(x3, x6, x7);
                     SH(x3, gback, gdoffset + 2 * (4 + i));
                 }
             }
@@ -157,7 +159,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
                     LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
                     ADDW(x3, x3, x4);
-                    if (opcode == 0x03) SAT16(x3, x6);
+                    if (opcode == 0x03) SATw(x3, x6, x7);
                     SH(x3, gback, gyoffset + 2 * i);
                 }
                 if (MODREG && ed == vex.v) {
@@ -170,7 +172,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                         LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                         ADDW(x3, x3, x4);
-                        if (opcode == 0x03) SAT16(x3, x6);
+                        if (opcode == 0x03) SATw(x3, x6, x7);
                         SH(x3, gback, gyoffset + 2 * (4 + i));
                     }
                 }
@@ -256,15 +258,17 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETVX();
             GETGY();
             GETVY();
+            LUI(x6, 0xFFFF8); // -32768
+            LUI(x7, 0x8);     // 32768
             for (int i = 0; i < 8; ++i) {
                 LBU(x3, vback, vxoffset + i * 2);
                 LB(x4, wback, fixedaddress + i * 2);
-                MUL(x7, x3, x4);
+                MUL(x2, x3, x4);
                 LBU(x3, vback, vxoffset + i * 2 + 1);
                 LB(x4, wback, fixedaddress + i * 2 + 1);
                 MUL(x3, x3, x4);
-                ADD(x3, x3, x7);
-                SAT16(x3, x6);
+                ADD(x3, x3, x2);
+                SATw(x3, x6, x7);
                 SH(x3, gback, gdoffset + i * 2);
             }
             if (vex.l) {
@@ -272,12 +276,12 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 for (int i = 0; i < 8; ++i) {
                     LBU(x3, vback, vyoffset + i * 2);
                     LB(x4, wback, fixedaddress + i * 2);
-                    MUL(x7, x3, x4);
+                    MUL(x2, x3, x4);
                     LBU(x3, vback, vyoffset + i * 2 + 1);
                     LB(x4, wback, fixedaddress + i * 2 + 1);
                     MUL(x3, x3, x4);
-                    ADD(x3, x3, x7);
-                    SAT16(x3, x6);
+                    ADD(x3, x3, x2);
+                    SATw(x3, x6, x7);
                     SH(x3, gback, gyoffset + i * 2);
                 }
             } else
@@ -295,6 +299,8 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
             GETVX();
             GETGY();
             GETVY();
+            LUI(x6, 0xFFFF8); // -32768
+            LUI(x7, 0x8);     // 32768
             if (gd == ed) {
                 ADDI(x5, xEmu, offsetof(x64emu_t, scratch));
                 LD(x3, wback, fixedaddress + 0);
@@ -309,7 +315,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                 LH(x3, vback, vxoffset + 2 * (i * 2 + 0));
                 LH(x4, vback, vxoffset + 2 * (i * 2 + 1));
                 SUBW(x3, x3, x4);
-                if (opcode == 0x07) SAT16(x3, x6);
+                if (opcode == 0x07) SATw(x3, x6, x7);
                 SH(x3, gback, gdoffset + 2 * i);
             }
             if (MODREG && ed == vex.v) {
@@ -322,7 +328,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                     LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                     SUBW(x3, x3, x4);
-                    if (opcode == 0x07) SAT16(x3, x6);
+                    if (opcode == 0x07) SATw(x3, x6, x7);
                     SH(x3, gback, gdoffset + 2 * (4 + i));
                 }
             }
@@ -342,7 +348,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                     LH(x3, vback, vyoffset + 2 * (i * 2 + 0));
                     LH(x4, vback, vyoffset + 2 * (i * 2 + 1));
                     SUBW(x3, x3, x4);
-                    if (opcode == 0x07) SAT16(x3, x6);
+                    if (opcode == 0x07) SATw(x3, x6, x7);
                     SH(x3, gback, gyoffset + 2 * i);
                 }
                 if (MODREG && ed == vex.v) {
@@ -355,7 +361,7 @@ uintptr_t dynarec64_AVX_66_0F38(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t i
                         LH(x3, wback, fixedaddress + 2 * (i * 2 + 0));
                         LH(x4, wback, fixedaddress + 2 * (i * 2 + 1));
                         SUBW(x3, x3, x4);
-                        if (opcode == 0x07) SAT16(x3, x6);
+                        if (opcode == 0x07) SATw(x3, x6, x7);
                         SH(x3, gback, gyoffset + 2 * (4 + i));
                     }
                 }
diff --git a/src/dynarec/rv64/dynarec_rv64_helper.h b/src/dynarec/rv64/dynarec_rv64_helper.h
index 03bf3cf6..d59e07f2 100644
--- a/src/dynarec/rv64/dynarec_rv64_helper.h
+++ b/src/dynarec/rv64/dynarec_rv64_helper.h
@@ -1957,42 +1957,20 @@ uintptr_t dynarec64_AVX_F3_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip,
 #define PURGE_YMM()
 
 // TODO: zbb?
-#define SAT8(reg, s)                     \
-    do {                                 \
-        ADDIW(s, xZR, 0xF80); /* -128 */ \
-        BGE(reg, s, 4 + 4);              \
-        MV(reg, s);                      \
-        ADDIW(s, xZR, 0x80); /* 128 */   \
-        BLT(reg, s, 4 + 4);              \
-        ADDIW(reg, s, -1);               \
+#define SATw(reg, min, maxp1)   \
+    do {                        \
+        BGE(reg, min, 4 + 4);   \
+        MV(reg, min);           \
+        BLT(reg, maxp1, 4 + 4); \
+        ADDIW(reg, maxp1, -1);  \
     } while (0)
 
-#define SATU8(reg, s)                   \
-    do {                                \
-        ADDIW(s, xZR, 0x100); /* 256 */ \
-        BGE(reg, xZR, 4 + 4);           \
-        MV(reg, xZR);                   \
-        BLT(reg, s, 4 + 4);             \
-        ADDIW(reg, s, -1);              \
-    } while (0)
-
-#define SAT16(reg, s)                 \
-    do {                              \
-        LUI(s, 0xFFFF8); /* -32768 */ \
-        BGE(reg, s, 4 + 4);           \
-        MV(reg, s);                   \
-        LUI(s, 0x8); /* 32768 */      \
-        BLT(reg, s, 4 + 4);           \
-        ADDIW(reg, s, -1);            \
-    } while (0)
-
-#define SATU16(reg, s)            \
-    do {                          \
-        LUI(s, 0x10); /* 65536 */ \
-        BGE(reg, xZR, 4 + 4);     \
-        MV(reg, xZR);             \
-        BLT(reg, s, 4 + 4);       \
-        ADDIW(reg, s, -1);        \
+#define SATUw(reg, maxu)       \
+    do {                       \
+        BGE(reg, xZR, 4 + 4);  \
+        MV(reg, xZR);          \
+        BLT(reg, maxu, 4 + 4); \
+        ADDIW(reg, maxu, -1);  \
     } while (0)
 
 #define FAST_8BIT_OPERATION(dst, src, s1, OP)                                        \