about summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: Yang Liu <liuyang22@iscas.ac.cn> 2025-05-20 20:45:50 +0800
committer: GitHub <noreply@github.com> 2025-05-20 14:45:50 +0200
commit: e1e3122a1fe834ca12344e0f8a51b995d887b350 (patch)
tree: ad6991f23d8e675351023dbebb1b4110d4635b99 /src
parent: 4f88b2bdd77d7e61c79f6ca632cbfaccc5371a6f (diff)
download: box64-e1e3122a1fe834ca12344e0f8a51b995d887b350.tar.gz
          box64-e1e3122a1fe834ca12344e0f8a51b995d887b350.zip
[RV64_DYNAREC] Improved POPCNT and fixed some scratch conflicts (#2651)
Diffstat (limited to 'src')
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_00_2.c 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_0f.c 8
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_66.c 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f0.c 4
-rw-r--r-- src/dynarec/rv64/dynarec_rv64_f30f.c 105
-rw-r--r-- src/dynarec/rv64/rv64_emitter.h 4
6 files changed, 95 insertions, 34 deletions
diff --git a/src/dynarec/rv64/dynarec_rv64_00_2.c b/src/dynarec/rv64/dynarec_rv64_00_2.c
index 34c805d9..ffc4d43a 100644
--- a/src/dynarec/rv64/dynarec_rv64_00_2.c
+++ b/src/dynarec/rv64/dynarec_rv64_00_2.c
@@ -329,9 +329,9 @@ uintptr_t dynarec64_00_2(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             if (MODREG) {
                 GETGD;
                 GETED(0);
-                MVxw(x1, gd);
+                MVxw(x3, gd);
                 MVxw(gd, ed);
-                MVxw(ed, x1);
+                MVxw(ed, x3);
             } else {
                 GETGD;
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, 0);
diff --git a/src/dynarec/rv64/dynarec_rv64_0f.c b/src/dynarec/rv64/dynarec_rv64_0f.c
index ae66e1b2..82f8f632 100644
--- a/src/dynarec/rv64/dynarec_rv64_0f.c
+++ b/src/dynarec/rv64/dynarec_rv64_0f.c
@@ -2257,7 +2257,7 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             B_NEXT_nocond;
             MARK;
             // gd is undefined if ed is all zeros, don't worry.
-            CTZxw(gd, ed, rex.w, x1, x2);
+            CTZxw(gd, ed, rex.w, x3, x5);
             ANDI(xFlags, xFlags, ~(1 << F_ZF));
             break;
         case 0xBD:
@@ -2276,9 +2276,9 @@ uintptr_t dynarec64_0F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             B_NEXT_nocond;
             MARK;
             ANDI(xFlags, xFlags, ~(1 << F_ZF));
-            CLZxw(gd, ed, rex.w, x1, x2, x3);
-            ADDI(x1, xZR, rex.w ? 63 : 31);
-            SUB(gd, x1, gd);
+            CLZxw(gd, ed, rex.w, x3, x5, x7);
+            ADDI(x3, xZR, rex.w ? 63 : 31);
+            SUB(gd, x3, gd);
             break;
         case 0xBE:
             INST_NAME("MOVSX Gd, Eb");
diff --git a/src/dynarec/rv64/dynarec_rv64_66.c b/src/dynarec/rv64/dynarec_rv64_66.c
index c054c3ba..f7134d6e 100644
--- a/src/dynarec/rv64/dynarec_rv64_66.c
+++ b/src/dynarec/rv64/dynarec_rv64_66.c
@@ -623,9 +623,9 @@ uintptr_t dynarec64_66(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (MODREG) {
                 GETGD;
                 GETED(0);
-                MV(x1, gd);
+                MV(x5, gd);
                 INSHz(gd, ed, x3, x4, 1, 1);
-                INSHz(ed, x1, x3, x4, 0, 1);
+                INSHz(ed, x5, x3, x4, 0, 1);
             } else {
                 GETGD;
                 addr = geted(dyn, addr, ninst, nextop, &ed, x2, x1, &fixedaddress, rex, LOCK_LOCK, 0, 0);
diff --git a/src/dynarec/rv64/dynarec_rv64_f0.c b/src/dynarec/rv64/dynarec_rv64_f0.c
index f51c3bb9..84816cea 100644
--- a/src/dynarec/rv64/dynarec_rv64_f0.c
+++ b/src/dynarec/rv64/dynarec_rv64_f0.c
@@ -788,9 +788,9 @@ uintptr_t dynarec64_F0(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int ni
             if (MODREG) {
                 GETGD;
                 GETED(0);
-                MV(x1, gd);
+                MV(x5, gd);
                 MV(gd, ed);
-                MV(ed, x1);
+                MV(ed, x5);
             } else {
                 SMDMB();
                 GETGD;
diff --git a/src/dynarec/rv64/dynarec_rv64_f30f.c b/src/dynarec/rv64/dynarec_rv64_f30f.c
index 16dc167f..19a8b4b9 100644
--- a/src/dynarec/rv64/dynarec_rv64_f30f.c
+++ b/src/dynarec/rv64/dynarec_rv64_f30f.c
@@ -469,26 +469,87 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             if (rv64_zbb) {
                 CPOPxw(gd, ed);
             } else {
-                TABLE64(x1, 0x5555555555555555uLL);
-                SRLI(x5, ed, 1);
-                AND(x5, x5, x1);
-                SUB(x5, ed, x5);
-                TABLE64(x3, 0x3333333333333333uLL);
-                SRLI(x1, x5, 2);
-                AND(x1, x1, x3);
-                AND(x5, x5, x3);
-                ADD(x5, x5, x1);
-                TABLE64(x3, 0x0F0F0F0F0F0F0F0FuLL);
-                SRLI(x1, x5, 4);
-                ADD(x5, x5, x1);
-                AND(x5, x5, x3);
-                SRLI(x1, x5, 32);
-                ADDW(x5, x5, x1);
-                SRLIW(x1, x5, 16);
-                ADDW(x5, x5, x1);
-                SRLIW(x1, x5, 8);
-                ADDW(x5, x5, x1);
-                ANDI(gd, x5, 0x7F);
+                if (rex.w) {
+                    // x7 = 0x5555555555555555
+                    LUI(x7, 0x55555);
+                    ADDIW(x7, x7, 0x555);
+                    SLLI(x6, x7, 32);
+                    ADD(x7, x7, x6);
+
+                    // v = v - ((v >> 1) & x7)
+                    SRLI(x5, ed, 1);
+                    AND(x5, x5, x7);
+                    SUB(x5, ed, x5);
+
+                    // x3 = 0x3333333333333333
+                    LUI(x3, 0x33333);
+                    ADDIW(x3, x3, 0x333);
+                    SLLI(x6, x3, 32);
+                    ADD(x3, x3, x6);
+
+                    // v = (v & x3) + ((v >> 2) & x3);
+                    SRLI(x7, x5, 2);
+                    AND(x7, x7, x3);
+                    AND(x5, x5, x3);
+                    ADD(x5, x5, x7);
+
+                    // x3 = 0x0F0F0F0F0F0F0F0F
+                    LUI(x3, 0xF0F1);
+                    ADDIW(x3, x3, 0xF0F);
+                    SLLI(x6, x3, 32);
+                    ADD(x3, x3, x6);
+
+                    // v = (v + (v >> 4)) & x3
+                    SRLI(x7, x5, 4);
+                    ADD(x5, x5, x7);
+                    AND(x5, x5, x3);
+
+                    // x3 = 0x0101010101010101
+                    LUI(x3, 0x1010);
+                    ADDIW(x3, x3, 0x101);
+                    SLLI(x6, x3, 32);
+                    ADD(x3, x3, x6);
+
+                    // count = (v * x3) >> 56
+                    MUL(gd, x5, x3);
+                    SRLI(gd, gd, 56);
+                } else {
+                    // x7 = 0x55555555uLL
+                    LUI(x7, 0x55555);
+                    ADDIW(x7, x7, 0x555);
+
+                    // v = v - ((v >> 1) & x7)
+                    SRLI(x5, ed, 1);
+                    AND(x5, x5, x7);
+                    SUB(x5, ed, x5);
+
+                    // x3 = 0x33333333uLL
+                    LUI(x3, 0x33333);
+                    ADDIW(x3, x3, 0x333);
+
+                    // v = (v & x3) + ((v >> 2) & x3);
+                    SRLI(x7, x5, 2);
+                    AND(x7, x7, x3);
+                    AND(x5, x5, x3);
+                    ADD(x5, x5, x7);
+
+                    // x3 = 0x0F0F0F0FuLL
+                    LUI(x3, 0xF0F1);
+                    ADDIW(x3, x3, 0xF0F);
+
+                    // v = (v + (v >> 4)) & x3
+                    SRLI(x7, x5, 4);
+                    ADD(x5, x5, x7);
+                    AND(x5, x5, x3);
+
+                    // x3 = 0x01010101uLL
+                    LUI(x3, 0x1010);
+                    ADDIW(x3, x3, 0x101);
+
+                    // count = (v * x3) >> 24
+                    MULW(gd, x5, x3);
+                    SRLIW(gd, gd, 24);
+                }
             }
             break;
         case 0xBC:
@@ -508,7 +569,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             MOV32w(gd, rex.w ? 64 : 32);
             B_NEXT_nocond;
             MARK;
-            CTZxw(gd, ed, rex.w, x1, x2);
+            CTZxw(gd, ed, rex.w, x3, x5);
             BNE(gd, xZR, 4 + 4);
             ORI(xFlags, xFlags, 1 << F_ZF);
             break;
@@ -529,7 +590,7 @@ uintptr_t dynarec64_F30F(dynarec_rv64_t* dyn, uintptr_t addr, uintptr_t ip, int
             ORI(xFlags, xFlags, 1 << F_CF);
             B_NEXT_nocond;
             MARK;
-            CLZxw(gd, ed, rex.w, x1, x2, x3);
+            CLZxw(gd, ed, rex.w, x5, x2, x3);
             ANDI(xFlags, xFlags, ~((1 << F_ZF) | (1 << F_CF)));
             BNE(gd, xZR, 4 + 4);
             ORI(xFlags, xFlags, 1 << F_ZF);
diff --git a/src/dynarec/rv64/rv64_emitter.h b/src/dynarec/rv64/rv64_emitter.h
index c1dc6fc7..4f656cfc 100644
--- a/src/dynarec/rv64/rv64_emitter.h
+++ b/src/dynarec/rv64/rv64_emitter.h
@@ -862,8 +862,8 @@
             u8 = rd;                         \
         else                                 \
             u8 = s1;                         \
-        ADDI(u8, xZR, rex.w ? 63 : 31);      \
-        if (rex.w) {                         \
+        ADDI(u8, xZR, x ? 63 : 31);          \
+        if (x) {                             \
             MV(s2, rs);                      \
             SRLI(s3, s2, 32);                \
             BEQZ(s3, 4 + 2 * 4);             \