25 files changed, 3997 insertions, 158 deletions
diff --git a/hw/ide/cmd646.c b/hw/ide/cmd646.c
index a68357c1c5..cabe9048b1 100644
--- a/hw/ide/cmd646.c
+++ b/hw/ide/cmd646.c
@@ -144,7 +144,7 @@ static void bmdma_write(void *opaque, hwaddr addr,
         cmd646_update_irq(pci_dev);
         break;
     case 2:
-        bm->status = (val & 0x60) | (bm->status & 1) | (bm->status & ~val & 0x06);
+        bmdma_status_writeb(bm, val);
         break;
     case 3:
         if (bm == &bm->pci_dev->bmdma[0]) {
@@ -297,7 +297,6 @@ static void pci_cmd646_ide_realize(PCIDevice *dev, Error **errp)
         ide_bus_init_output_irq(&d->bus[i], qdev_get_gpio_in(ds, i));
 
         bmdma_init(&d->bus[i], &d->bmdma[i], d);
-        d->bmdma[i].bus = &d->bus[i];
         ide_bus_register_restart_cb(&d->bus[i]);
     }
 }
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index fc9224bbc9..a25b352537 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -318,6 +318,12 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
     bm->cmd = val & 0x09;
 }
 
+void bmdma_status_writeb(BMDMAState *bm, uint32_t val)
+{
+    bm->status = (val & 0x60) | (bm->status & BM_STATUS_DMAING)
+                 | (bm->status & ~val & (BM_STATUS_ERROR | BM_STATUS_INT));
+}
+
 static uint64_t bmdma_addr_read(void *opaque, hwaddr addr,
                                 unsigned width)
 {
@@ -519,13 +525,23 @@ void bmdma_init(IDEBus *bus, BMDMAState *bm, PCIIDEState *d)
     bus->dma = &bm->dma;
     bm->irq = bus->irq;
     bus->irq = qemu_allocate_irq(bmdma_irq, bm, 0);
+    bm->bus = bus;
     bm->pci_dev = d;
 }
 
+static void pci_ide_init(Object *obj)
+{
+    PCIIDEState *d = PCI_IDE(obj);
+
+    qdev_init_gpio_out_named(DEVICE(d), d->isa_irq, "isa-irq",
+                             ARRAY_SIZE(d->isa_irq));
+}
+
 static const TypeInfo pci_ide_type_info = {
     .name = TYPE_PCI_IDE,
     .parent = TYPE_PCI_DEVICE,
     .instance_size = sizeof(PCIIDEState),
+    .instance_init = pci_ide_init,
     .abstract = true,
     .interfaces = (InterfaceInfo[]) {
         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index 41d60921e3..151f206046 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -28,7 +28,6 @@
  */
 
 #include "qemu/osdep.h"
-#include "migration/vmstate.h"
 #include "qapi/error.h"
 #include "hw/pci/pci.h"
 #include "hw/ide/piix.h"
@@ -76,7 +75,7 @@ static void bmdma_write(void *opaque, hwaddr addr,
         bmdma_cmd_writeb(bm, val);
         break;
     case 2:
-        bm->status = (val & 0x60) | (bm->status & 1) | (bm->status & ~val & 0x06);
+        bmdma_status_writeb(bm, val);
         break;
     }
 }
@@ -144,7 +143,6 @@ static bool pci_piix_init_bus(PCIIDEState *d, unsigned i, Error **errp)
     ide_bus_init_output_irq(&d->bus[i], isa_get_irq(NULL, port_info[i].isairq));
 
     bmdma_init(&d->bus[i], &d->bmdma[i], d);
-    d->bmdma[i].bus = &d->bus[i];
     ide_bus_register_restart_cb(&d->bus[i]);
 
     return true;
@@ -160,8 +158,6 @@ static void pci_piix_ide_realize(PCIDevice *dev, Error **errp)
     bmdma_setup_bar(d);
     pci_register_bar(dev, 4, PCI_BASE_ADDRESS_SPACE_IO, &d->bmdma_bar);
 
-    vmstate_register(VMSTATE_IF(dev), 0, &vmstate_ide_pci, d);
-
     for (unsigned i = 0; i < 2; i++) {
         if (!pci_piix_init_bus(d, i, errp)) {
             return;
@@ -187,6 +183,7 @@ static void piix3_ide_class_init(ObjectClass *klass, void *data)
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
     dc->reset = piix_ide_reset;
+    dc->vmsd = &vmstate_ide_pci;
     k->realize = pci_piix_ide_realize;
     k->exit = pci_piix_ide_exitfn;
     k->vendor_id = PCI_VENDOR_ID_INTEL;
@@ -209,6 +206,7 @@ static void piix4_ide_class_init(ObjectClass *klass, void *data)
     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
 
     dc->reset = piix_ide_reset;
+    dc->vmsd = &vmstate_ide_pci;
     k->realize = pci_piix_ide_realize;
     k->exit = pci_piix_ide_exitfn;
     k->vendor_id = PCI_VENDOR_ID_INTEL;
diff --git a/hw/ide/sii3112.c b/hw/ide/sii3112.c
index f9becdff8e..63dc4a0494 100644
--- a/hw/ide/sii3112.c
+++ b/hw/ide/sii3112.c
@@ -149,8 +149,7 @@ static void sii3112_reg_write(void *opaque, hwaddr addr,
         break;
     case 0x02:
     case 0x12:
-        d->i.bmdma[0].status = (val & 0x60) | (d->i.bmdma[0].status & 1) |
-                               (d->i.bmdma[0].status & ~val & 6);
+        bmdma_status_writeb(&d->i.bmdma[0], val);
         break;
     case 0x04 ... 0x07:
         bmdma_addr_ioport_ops.write(&d->i.bmdma[0], addr - 4, val, size);
@@ -165,8 +164,7 @@ static void sii3112_reg_write(void *opaque, hwaddr addr,
         break;
     case 0x0a:
     case 0x1a:
-        d->i.bmdma[1].status = (val & 0x60) | (d->i.bmdma[1].status & 1) |
-                               (d->i.bmdma[1].status & ~val & 6);
+        bmdma_status_writeb(&d->i.bmdma[1], val);
         break;
     case 0x0c ... 0x0f:
         bmdma_addr_ioport_ops.write(&d->i.bmdma[1], addr - 12, val, size);
@@ -287,7 +285,6 @@ static void sii3112_pci_realize(PCIDevice *dev, Error **errp)
         ide_bus_init_output_irq(&s->bus[i], qdev_get_gpio_in(ds, i));
 
         bmdma_init(&s->bus[i], &s->bmdma[i], s);
-        s->bmdma[i].bus = &s->bus[i];
         ide_bus_register_restart_cb(&s->bus[i]);
     }
 }
diff --git a/hw/ide/via.c b/hw/ide/via.c
index 177baea9a7..fff23803a6 100644
--- a/hw/ide/via.c
+++ b/hw/ide/via.c
@@ -31,6 +31,7 @@
 #include "sysemu/dma.h"
 #include "hw/isa/vt82c686.h"
 #include "hw/ide/pci.h"
+#include "hw/irq.h"
 #include "trace.h"
 
 static uint64_t bmdma_read(void *opaque, hwaddr addr,
@@ -74,7 +75,7 @@ static void bmdma_write(void *opaque, hwaddr addr,
         bmdma_cmd_writeb(bm, val);
         break;
     case 2:
-        bm->status = (val & 0x60) | (bm->status & 1) | (bm->status & ~val & 0x06);
+        bmdma_status_writeb(bm, val);
         break;
     default:;
     }
@@ -104,7 +105,8 @@ static void bmdma_setup_bar(PCIIDEState *d)
 
 static void via_ide_set_irq(void *opaque, int n, int level)
 {
-    PCIDevice *d = PCI_DEVICE(opaque);
+    PCIIDEState *s = opaque;
+    PCIDevice *d = PCI_DEVICE(s);
 
     if (level) {
         d->config[0x70 + n * 8] |= 0x80;
@@ -112,7 +114,7 @@ static void via_ide_set_irq(void *opaque, int n, int level)
         d->config[0x70 + n * 8] &= ~0x80;
     }
 
-    via_isa_set_irq(pci_get_function_0(d), 14 + n, level);
+    qemu_set_irq(s->isa_irq[n], level);
 }
 
 static void via_ide_reset(DeviceState *dev)
@@ -194,7 +196,6 @@ static void via_ide_realize(PCIDevice *dev, Error **errp)
         ide_bus_init_output_irq(&d->bus[i], qdev_get_gpio_in(ds, i));
 
         bmdma_init(&d->bus[i], &d->bmdma[i], d);
-        d->bmdma[i].bus = &d->bus[i];
         ide_bus_register_restart_cb(&d->bus[i]);
     }
 }
diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
index ca89119ce0..57bdfb4e78 100644
--- a/hw/isa/vt82c686.c
+++ b/hw/isa/vt82c686.c
@@ -592,12 +592,6 @@ static const TypeInfo via_isa_info = {
     },
 };
 
-void via_isa_set_irq(PCIDevice *d, int n, int level)
-{
-    ViaISAState *s = VIA_ISA(d);
-    qemu_set_irq(s->isa_irqs_in[n], level);
-}
-
 static void via_isa_request_i8259_irq(void *opaque, int irq, int level)
 {
     ViaISAState *s = opaque;
@@ -692,6 +686,10 @@ static void via_isa_realize(PCIDevice *d, Error **errp)
     if (!qdev_realize(DEVICE(&s->ide), BUS(pci_bus), errp)) {
         return;
     }
+    for (i = 0; i < 2; i++) {
+        qdev_connect_gpio_out_named(DEVICE(&s->ide), "isa-irq", i,
+                                    s->isa_irqs_in[14 + i]);
+    }
 
     /* Functions 2-3: USB Ports */
     for (i = 0; i < ARRAY_SIZE(s->uhci); i++) {
@@ -814,6 +812,7 @@ static void vt8231_isa_reset(DeviceState *dev)
                  PCI_COMMAND_MASTER | PCI_COMMAND_SPECIAL);
     pci_set_word(pci_conf + PCI_STATUS, PCI_STATUS_DEVSEL_MEDIUM);
 
+    pci_conf[0x4c] = 0x04; /* IDE interrupt Routing */
     pci_conf[0x58] = 0x40; /* Miscellaneous Control 0 */
     pci_conf[0x67] = 0x08; /* Fast IR Config */
     pci_conf[0x6b] = 0x01; /* Fast IR I/O Base */
diff --git a/hw/mips/loongson3_virt.c b/hw/mips/loongson3_virt.c
index 3dd91da7a6..4018b8c1d3 100644
--- a/hw/mips/loongson3_virt.c
+++ b/hw/mips/loongson3_virt.c
@@ -481,8 +481,8 @@ static void mips_loongson3_virt_init(MachineState *machine)
         if (!machine->cpu_type) {
             machine->cpu_type = MIPS_CPU_TYPE_NAME("Loongson-3A1000");
         }
-        if (!strstr(machine->cpu_type, "Loongson-3A1000")) {
-            error_report("Loongson-3/TCG needs cpu type Loongson-3A1000");
+        if (!cpu_type_supports_isa(machine->cpu_type, INSN_LOONGSON3A)) {
+            error_report("Loongson-3/TCG needs a Loongson-3 series cpu");
             exit(1);
         }
     } else {
diff --git a/include/hw/ide/pci.h b/include/hw/ide/pci.h
index 74c127e32f..1ff469de87 100644
--- a/include/hw/ide/pci.h
+++ b/include/hw/ide/pci.h
@@ -58,6 +58,7 @@ struct PCIIDEState {
 
 void bmdma_init(IDEBus *bus, BMDMAState *bm, PCIIDEState *d);
 void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val);
+void bmdma_status_writeb(BMDMAState *bm, uint32_t val);
 extern MemoryRegionOps bmdma_addr_ioport_ops;
 void pci_ide_create_devs(PCIDevice *dev);
 
diff --git a/include/hw/isa/vt82c686.h b/include/hw/isa/vt82c686.h
index da1722daf2..b6e95b2851 100644
--- a/include/hw/isa/vt82c686.h
+++ b/include/hw/isa/vt82c686.h
@@ -34,6 +34,4 @@ struct ViaAC97State {
     uint32_t ac97_cmd;
 };
 
-void via_isa_set_irq(PCIDevice *d, int n, int level);
-
 #endif
diff --git a/target/mips/cpu-defs.c.inc b/target/mips/cpu-defs.c.inc
index d45f245a67..03185d9aa0 100644
--- a/target/mips/cpu-defs.c.inc
+++ b/target/mips/cpu-defs.c.inc
@@ -118,6 +118,26 @@ const mips_def_t mips_defs[] =
         .mmu_type = MMU_TYPE_R4000,
     },
     {
+        .name = "XBurstR1",
+        .CP0_PRid = 0x1ed0024f,
+        .CP0_Config0 = MIPS_CONFIG0 | (MMU_TYPE_R4000 << CP0C0_MT),
+        .CP0_Config1 = MIPS_CONFIG1 | (15 << CP0C1_MMU) |
+                       (0 << CP0C1_IS) | (3 << CP0C1_IL) | (1 << CP0C1_IA) |
+                       (0 << CP0C1_DS) | (3 << CP0C1_DL) | (1 << CP0C1_DA) |
+                       (0 << CP0C1_CA),
+        .CP0_Config2 = MIPS_CONFIG2,
+        .CP0_Config3 = MIPS_CONFIG3,
+        .CP0_LLAddr_rw_bitmask = 0,
+        .CP0_LLAddr_shift = 4,
+        .SYNCI_Step = 32,
+        .CCRes = 2,
+        .CP0_Status_rw_bitmask = 0x1278FF17,
+        .SEGBITS = 32,
+        .PABITS = 32,
+        .insn_flags = CPU_MIPS32R1 | ASE_MXU,
+        .mmu_type = MMU_TYPE_R4000,
+    },
+    {
         .name = "4KEmR1",
         .CP0_PRid = 0x00018500,
         .CP0_Config0 = MIPS_CONFIG0 | (MMU_TYPE_FMT << CP0C0_MT),
@@ -324,6 +344,32 @@ const mips_def_t mips_defs[] =
         .mmu_type = MMU_TYPE_R4000,
     },
     {
+        .name = "XBurstR2",
+        .CP0_PRid = 0x2ed1024f,
+        .CP0_Config0 = MIPS_CONFIG0 | (0x1 << CP0C0_AR) |
+                    (MMU_TYPE_R4000 << CP0C0_MT),
+        .CP0_Config1 = MIPS_CONFIG1 | (1 << CP0C1_FP) | (15 << CP0C1_MMU) |
+                       (0 << CP0C1_IS) | (3 << CP0C1_IL) | (1 << CP0C1_IA) |
+                       (0 << CP0C1_DS) | (3 << CP0C1_DL) | (1 << CP0C1_DA) |
+                       (1 << CP0C1_CA),
+        .CP0_Config2 = MIPS_CONFIG2,
+        .CP0_Config3 = MIPS_CONFIG3 | (1 << CP0C3_DSP2P) | (1 << CP0C3_DSPP) |
+                       (1 << CP0C3_VInt),
+        .CP0_LLAddr_rw_bitmask = 0,
+        .CP0_LLAddr_shift = 4,
+        .SYNCI_Step = 32,
+        .CCRes = 2,
+        .CP0_Status_rw_bitmask = 0x3778FF1F,
+        .CP1_fcr0 = (1 << FCR0_F64) | (1 << FCR0_L) | (1 << FCR0_W) |
+                    (1 << FCR0_D) | (1 << FCR0_S) | (0x93 << FCR0_PRID),
+        .CP1_fcr31 = 0,
+        .CP1_fcr31_rw_bitmask = 0xFF83FFFF,
+        .SEGBITS = 32,
+        .PABITS = 32,
+        .insn_flags = CPU_MIPS32R2 | ASE_MXU,
+        .mmu_type = MMU_TYPE_R4000,
+    },
+    {
         .name = "M14K",
         .CP0_PRid = 0x00019b00,
         /* Config1 implemented, fixed mapping MMU,
@@ -709,7 +755,7 @@ const mips_def_t mips_defs[] =
         .CP0_Config4 = MIPS_CONFIG4 | (1U << CP0C4_M) | (3 << CP0C4_IE) |
                        (1 << CP0C4_AE) | (0xfc << CP0C4_KScrExist),
         .CP0_Config5 = MIPS_CONFIG5 | (1 << CP0C5_XNP) | (1 << CP0C5_VP) |
-                       (1 << CP0C5_LLB) | (1 << CP0C5_MRP),
+                       (1 << CP0C5_LLB) | (1 << CP0C5_MRP) | (3 << CP0C5_GI),
         .CP0_Config5_rw_bitmask = (1 << CP0C5_MSAEn) | (1 << CP0C5_SBRI) |
                                   (1 << CP0C5_FRE) | (1 << CP0C5_UFE),
         .CP0_LLAddr_rw_bitmask = 0,
@@ -749,7 +795,7 @@ const mips_def_t mips_defs[] =
         .CP0_Config4 = MIPS_CONFIG4 | (1U << CP0C4_M) | (3 << CP0C4_IE) |
                        (1 << CP0C4_AE) | (0xfc << CP0C4_KScrExist),
         .CP0_Config5 = MIPS_CONFIG5 | (1 << CP0C5_XNP) | (1 << CP0C5_VP) |
-                       (1 << CP0C5_LLB) | (1 << CP0C5_MRP),
+                       (1 << CP0C5_LLB) | (1 << CP0C5_MRP) | (3 << CP0C5_GI),
         .CP0_Config5_rw_bitmask = (1 << CP0C5_MSAEn) | (1 << CP0C5_SBRI) |
                                   (1 << CP0C5_FRE) | (1 << CP0C5_UFE),
         .CP0_LLAddr_rw_bitmask = 0,
@@ -895,6 +941,15 @@ const mips_def_t mips_defs[] =
         .CP1_fcr31 = 0,
         .CP1_fcr31_rw_bitmask = 0xFF83FFFF,
         .MSAIR = (0x01 << MSAIR_ProcID) | (0x40 << MSAIR_Rev),
+        .lcsr_cpucfg1 = (1 << CPUCFG1_FP) | (2 << CPUCFG1_FPREV) |
+                    (1 << CPUCFG1_MSA1) | (1 << CPUCFG1_LSLDR0) |
+                    (1 << CPUCFG1_LSPERF) | (1 << CPUCFG1_LSPERFX) |
+                    (1 << CPUCFG1_LSSYNCI) | (1 << CPUCFG1_LLEXC) |
+                    (1 << CPUCFG1_SCRAND) | (1 << CPUCFG1_MUALP) |
+                    (1 << CPUCFG1_KMUALEN) | (1 << CPUCFG1_ITLBT) |
+                    (1 << CPUCFG1_SFBP) | (1 << CPUCFG1_CDMAP),
+        .lcsr_cpucfg2 = (1 << CPUCFG2_LEXT1) | (1 << CPUCFG2_LCSRP) |
+                    (1 << CPUCFG2_LDISBLIKELY),
         .SEGBITS = 48,
         .PABITS = 48,
         .insn_flags = CPU_MIPS64R2 | INSN_LOONGSON3A |
diff --git a/target/mips/cpu.c b/target/mips/cpu.c
index 01e0fbe10d..63da1948fd 100644
--- a/target/mips/cpu.c
+++ b/target/mips/cpu.c
@@ -244,6 +244,8 @@ static void mips_cpu_reset_hold(Object *obj)
     env->CP0_PageGrain_rw_bitmask = env->cpu_model->CP0_PageGrain_rw_bitmask;
     env->CP0_PageGrain = env->cpu_model->CP0_PageGrain;
     env->CP0_EBaseWG_rw_bitmask = env->cpu_model->CP0_EBaseWG_rw_bitmask;
+    env->lcsr_cpucfg1 = env->cpu_model->lcsr_cpucfg1;
+    env->lcsr_cpucfg2 = env->cpu_model->lcsr_cpucfg2;
     env->active_fpu.fcr0 = env->cpu_model->CP1_fcr0;
     env->active_fpu.fcr31_rw_bitmask = env->cpu_model->CP1_fcr31_rw_bitmask;
     env->active_fpu.fcr31 = env->cpu_model->CP1_fcr31;
@@ -449,9 +451,9 @@ static void mips_cp0_period_set(MIPSCPU *cpu)
 {
     CPUMIPSState *env = &cpu->env;
 
-    env->cp0_count_ns = clock_ticks_to_ns(MIPS_CPU(cpu)->clock,
-                                          env->cpu_model->CCRes);
-    assert(env->cp0_count_ns);
+    clock_set_mul_div(cpu->count_div, env->cpu_model->CCRes, 1);
+    clock_set_source(cpu->count_div, cpu->clock);
+    clock_set_source(env->count_clock, cpu->count_div);
 }
 
 static void mips_cpu_realizefn(DeviceState *dev, Error **errp)
@@ -504,7 +506,17 @@ static void mips_cpu_initfn(Object *obj)
 
     cpu_set_cpustate_pointers(cpu);
     cpu->clock = qdev_init_clock_in(DEVICE(obj), "clk-in", NULL, cpu, 0);
+    cpu->count_div = clock_new(OBJECT(obj), "clk-div-count");
+    env->count_clock = clock_new(OBJECT(obj), "clk-count");
     env->cpu_model = mcc->cpu_def;
+#ifndef CONFIG_USER_ONLY
+    if (mcc->cpu_def->lcsr_cpucfg2 & (1 << CPUCFG2_LCSRP)) {
+        memory_region_init_io(&env->iocsr.mr, OBJECT(cpu), NULL,
+                                env, "iocsr", UINT64_MAX);
+        address_space_init(&env->iocsr.as,
+                            &env->iocsr.mr, "IOCSR");
+    }
+#endif
 }
 
 static char *mips_cpu_type_name(const char *cpu_model)
diff --git a/target/mips/cpu.h b/target/mips/cpu.h
index a3bc646976..f81bd06f5e 100644
--- a/target/mips/cpu.h
+++ b/target/mips/cpu.h
@@ -3,6 +3,9 @@
 
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
+#ifndef CONFIG_USER_ONLY
+#include "exec/memory.h"
+#endif
 #include "fpu/softfloat-types.h"
 #include "hw/clock.h"
 #include "mips-defs.h"
@@ -1068,6 +1071,33 @@ typedef struct CPUArchState {
  */
     int32_t CP0_DESAVE;
     target_ulong CP0_KScratch[MIPS_KSCRATCH_NUM];
+/*
+ * Loongson CSR CPUCFG registers
+ */
+    uint32_t lcsr_cpucfg1;
+#define CPUCFG1_FP     0
+#define CPUCFG1_FPREV  1
+#define CPUCFG1_MMI    4
+#define CPUCFG1_MSA1   5
+#define CPUCFG1_MSA2   6
+#define CPUCFG1_LSLDR0 16
+#define CPUCFG1_LSPERF 17
+#define CPUCFG1_LSPERFX 18
+#define CPUCFG1_LSSYNCI 19
+#define CPUCFG1_LLEXC   20
+#define CPUCFG1_SCRAND  21
+#define CPUCFG1_MUALP   25
+#define CPUCFG1_KMUALEN 26
+#define CPUCFG1_ITLBT   27
+#define CPUCFG1_SFBP    29
+#define CPUCFG1_CDMAP   30
+    uint32_t lcsr_cpucfg2;
+#define CPUCFG2_LEXT1   0
+#define CPUCFG2_LEXT2   1
+#define CPUCFG2_LEXT3   2
+#define CPUCFG2_LSPW    3
+#define CPUCFG2_LCSRP   27
+#define CPUCFG2_LDISBLIKELY 28
 
     /* We waste some space so we can handle shadow registers like TCs. */
     TCState tcs[MIPS_SHADOW_SET_MAX];
@@ -1156,12 +1186,18 @@ typedef struct CPUArchState {
     void *irq[8];
     struct MIPSITUState *itu;
     MemoryRegion *itc_tag; /* ITC Configuration Tags */
+
+    /* Loongson IOCSR memory */
+    struct {
+        AddressSpace as;
+        MemoryRegion mr;
+    } iocsr;
 #endif
 
     const mips_def_t *cpu_model;
     QEMUTimer *timer; /* Internal timer */
+    Clock *count_clock; /* CP0_Count clock */
     target_ulong exception_base; /* ExceptionBase input to the core */
-    uint64_t cp0_count_ns; /* CP0_Count clock period (in nanoseconds) */
 } CPUMIPSState;
 
 /**
@@ -1178,6 +1214,7 @@ struct ArchCPU {
     /*< public >*/
 
     Clock *clock;
+    Clock *count_div; /* Divider for CP0_Count clock */
     CPUNegativeOffsetState neg;
     CPUMIPSState env;
 };
@@ -1280,6 +1317,12 @@ static inline bool ase_msa_available(CPUMIPSState *env)
     return env->CP0_Config3 & (1 << CP0C3_MSAP);
 }
 
+/* Check presence of Loongson CSR instructions */
+static inline bool ase_lcsr_available(CPUMIPSState *env)
+{
+    return env->lcsr_cpucfg2 & (1 << CPUCFG2_LCSRP);
+}
+
 /* Check presence of multi-threading ASE implementation */
 static inline bool ase_mt_available(CPUMIPSState *env)
 {
diff --git a/target/mips/helper.h b/target/mips/helper.h
index de32d82e98..0f8462febb 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -196,6 +196,10 @@ DEF_HELPER_1(rdhwr_xnp, tl, env)
 DEF_HELPER_2(pmon, void, env, int)
 DEF_HELPER_1(wait, void, env)
 
+#ifdef TARGET_MIPS64
+DEF_HELPER_FLAGS_2(lcsr_cpucfg, TCG_CALL_NO_RWG_SE, tl, env, tl)
+#endif
+
 /* Loongson multimedia functions.  */
 DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_NO_RWG_SE, i64, i64, i64)
 DEF_HELPER_FLAGS_2(paddush, TCG_CALL_NO_RWG_SE, i64, i64, i64)
diff --git a/target/mips/internal.h b/target/mips/internal.h
index 4b0031d10d..1d0c026c7d 100644
--- a/target/mips/internal.h
+++ b/target/mips/internal.h
@@ -79,6 +79,8 @@ struct mips_def_t {
     int32_t CP0_PageGrain_rw_bitmask;
     int32_t CP0_PageGrain;
     target_ulong CP0_EBaseWG_rw_bitmask;
+    uint32_t lcsr_cpucfg1;
+    uint32_t lcsr_cpucfg2;
     uint64_t insn_flags;
     enum mips_mmu_types mmu_type;
     int32_t SAARP;
diff --git a/target/mips/sysemu/cp0_timer.c b/target/mips/sysemu/cp0_timer.c
index 70de95d338..9d2bcb0dea 100644
--- a/target/mips/sysemu/cp0_timer.c
+++ b/target/mips/sysemu/cp0_timer.c
@@ -28,15 +28,26 @@
 #include "internal.h"
 
 /* MIPS R4K timer */
+static uint32_t cpu_mips_get_count_val(CPUMIPSState *env)
+{
+    int64_t now_ns;
+    now_ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+    return env->CP0_Count +
+            (uint32_t)clock_ns_to_ticks(env->count_clock, now_ns);
+}
+
 static void cpu_mips_timer_update(CPUMIPSState *env)
 {
     uint64_t now_ns, next_ns;
     uint32_t wait;
 
     now_ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
-    wait = env->CP0_Compare - env->CP0_Count -
-           (uint32_t)(now_ns / env->cp0_count_ns);
-    next_ns = now_ns + (uint64_t)wait * env->cp0_count_ns;
+    wait = env->CP0_Compare - cpu_mips_get_count_val(env);
+    /* Clamp interval to overflow if virtual time had not progressed */
+    if (!wait) {
+        wait = UINT32_MAX;
+    }
+    next_ns = now_ns + clock_ticks_to_ns(env->count_clock, wait);
     timer_mod(env->timer, next_ns);
 }
 
@@ -64,7 +75,7 @@ uint32_t cpu_mips_get_count(CPUMIPSState *env)
             cpu_mips_timer_expire(env);
         }
 
-        return env->CP0_Count + (uint32_t)(now_ns / env->cp0_count_ns);
+        return cpu_mips_get_count_val(env);
     }
 }
 
@@ -79,9 +90,8 @@ void cpu_mips_store_count(CPUMIPSState *env, uint32_t count)
         env->CP0_Count = count;
     } else {
         /* Store new count register */
-        env->CP0_Count = count -
-               (uint32_t)(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
-                          env->cp0_count_ns);
+        env->CP0_Count = count - (uint32_t)clock_ns_to_ticks(env->count_clock,
+                        qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
         /* Update timer timer */
         cpu_mips_timer_update(env);
     }
@@ -107,8 +117,8 @@ void cpu_mips_start_count(CPUMIPSState *env)
 void cpu_mips_stop_count(CPUMIPSState *env)
 {
     /* Store the current value */
-    env->CP0_Count += (uint32_t)(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) /
-                                 env->cp0_count_ns);
+    env->CP0_Count += (uint32_t)clock_ns_to_ticks(env->count_clock,
+                        qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
 }
 
 static void mips_timer_cb(void *opaque)
@@ -121,14 +131,7 @@ static void mips_timer_cb(void *opaque)
         return;
     }
 
-    /*
-     * ??? This callback should occur when the counter is exactly equal to
-     * the comparator value.  Offset the count by one to avoid immediately
-     * retriggering the callback before any virtual time has passed.
-     */
-    env->CP0_Count++;
     cpu_mips_timer_expire(env);
-    env->CP0_Count--;
 }
 
 void cpu_mips_clock_init(MIPSCPU *cpu)
diff --git a/target/mips/tcg/lcsr.decode b/target/mips/tcg/lcsr.decode
new file mode 100644
index 0000000000..960ef8b6f9
--- /dev/null
+++ b/target/mips/tcg/lcsr.decode
@@ -0,0 +1,17 @@
+# Loongson CSR instructions
+#
+# Copyright (C) 2023 Jiaxun Yang <jiaxun.yang@flygoat.com>
+#
+# SPDX-License-Identifier: LGPL-2.1-or-later
+#
+
+&r           rs rt rd sa
+
+@rs_rd       ...... rs:5 ..... rd:5 ..... ...... &r rt=0 sa=0
+
+CPUCFG       110010 ..... 01000 ..... 00100 011000 @rs_rd
+
+RDCSR        110010 ..... 00000 ..... 00100 011000 @rs_rd
+WRCSR        110010 ..... 00001 ..... 00100 011000 @rs_rd
+DRDCSR       110010 ..... 00010 ..... 00100 011000 @rs_rd
+DWRCSR       110010 ..... 00011 ..... 00100 011000 @rs_rd
diff --git a/target/mips/tcg/lcsr_translate.c b/target/mips/tcg/lcsr_translate.c
new file mode 100644
index 0000000000..9f2a5f4a37
--- /dev/null
+++ b/target/mips/tcg/lcsr_translate.c
@@ -0,0 +1,75 @@
+/*
+ * Loongson CSR instructions translation routines
+ *
+ *  Copyright (c) 2023 Jiaxun Yang <jiaxun.yang@flygoat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/helper-gen.h"
+#include "translate.h"
+
+/* Include the auto-generated decoder.  */
+#include "decode-lcsr.c.inc"
+
+static bool trans_CPUCFG(DisasContext *ctx, arg_CPUCFG *a)
+{
+    TCGv dest = tcg_temp_new();
+    TCGv src1 = tcg_temp_new();
+
+    gen_load_gpr(src1, a->rs);
+    gen_helper_lcsr_cpucfg(dest, cpu_env, src1);
+    gen_store_gpr(dest, a->rd);
+
+    return true;
+}
+
+#ifndef CONFIG_USER_ONLY
+static bool gen_rdcsr(DisasContext *ctx, arg_r *a,
+                        void (*func)(TCGv, TCGv_ptr, TCGv))
+{
+    TCGv dest = tcg_temp_new();
+    TCGv src1 = tcg_temp_new();
+
+    check_cp0_enabled(ctx);
+    gen_load_gpr(src1, a->rs);
+    func(dest, cpu_env, src1);
+    gen_store_gpr(dest, a->rd);
+
+    return true;
+}
+
+static bool gen_wrcsr(DisasContext *ctx, arg_r *a,
+                        void (*func)(TCGv_ptr, TCGv, TCGv))
+{
+    TCGv val = tcg_temp_new();
+    TCGv addr = tcg_temp_new();
+
+    check_cp0_enabled(ctx);
+    gen_load_gpr(addr, a->rs);
+    gen_load_gpr(val, a->rd);
+    func(cpu_env, addr, val);
+
+    return true;
+}
+
+TRANS(RDCSR, gen_rdcsr, gen_helper_lcsr_rdcsr)
+TRANS(DRDCSR, gen_rdcsr, gen_helper_lcsr_drdcsr)
+TRANS(WRCSR, gen_wrcsr, gen_helper_lcsr_wrcsr)
+TRANS(DWRCSR, gen_wrcsr, gen_helper_lcsr_dwrcsr)
+#else
+#define GEN_FALSE_TRANS(name)   \
+static bool trans_##name(DisasContext *ctx, arg_##name * a)  \
+{   \
+    return false;   \
+}
+
+GEN_FALSE_TRANS(RDCSR)
+GEN_FALSE_TRANS(DRDCSR)
+GEN_FALSE_TRANS(WRCSR)
+GEN_FALSE_TRANS(DWRCSR)
+#endif
diff --git a/target/mips/tcg/meson.build b/target/mips/tcg/meson.build
index 7ee969ec8f..ea7fb582f2 100644
--- a/target/mips/tcg/meson.build
+++ b/target/mips/tcg/meson.build
@@ -4,6 +4,7 @@ gen = [
   decodetree.process('tx79.decode', extra_args: '--static-decode=decode_tx79'),
   decodetree.process('vr54xx.decode', extra_args: '--decode=decode_ext_vr54xx'),
   decodetree.process('octeon.decode', extra_args: '--decode=decode_ext_octeon'),
+  decodetree.process('lcsr.decode', extra_args: '--decode=decode_ase_lcsr'),
 ]
 
 mips_ss.add(gen)
@@ -26,6 +27,7 @@ mips_ss.add(files(
 mips_ss.add(when: 'TARGET_MIPS64', if_true: files(
   'tx79_translate.c',
   'octeon_translate.c',
+  'lcsr_translate.c',
 ), if_false: files(
   'mxu_translate.c',
 ))
diff --git a/target/mips/tcg/mxu_translate.c b/target/mips/tcg/mxu_translate.c
index 39348b3a91..deb8060a17 100644
--- a/target/mips/tcg/mxu_translate.c
+++ b/target/mips/tcg/mxu_translate.c
@@ -237,11 +237,11 @@
  *          ├─ 001100 ─ OPC_MXU_D16MADL
  *          ├─ 001101 ─ OPC_MXU_S16MAD
  *          ├─ 001110 ─ OPC_MXU_Q16ADD
- *          ├─ 001111 ─ OPC_MXU_D16MACE     23
+ *          ├─ 001111 ─ OPC_MXU_D16MACE     20 (13..10 don't care)
  *          │                            ┌─ 0 ─ OPC_MXU_S32LDD
  *          ├─ 010000 ─ OPC_MXU__POOL04 ─┴─ 1 ─ OPC_MXU_S32LDDR
  *          │
- *          │                               23
+ *          │                               20 (13..10 don't care)
  *          ├─ 010001 ─ OPC_MXU__POOL05 ─┬─ 0 ─ OPC_MXU_S32STD
  *          │                            └─ 1 ─ OPC_MXU_S32STDR
  *          │
@@ -253,11 +253,11 @@
  *          ├─ 010011 ─ OPC_MXU__POOL07 ─┬─ 0000 ─ OPC_MXU_S32STDV
  *          │                            └─ 0001 ─ OPC_MXU_S32STDVR
  *          │
- *          │                               23
+ *          │                               20 (13..10 don't care)
  *          ├─ 010100 ─ OPC_MXU__POOL08 ─┬─ 0 ─ OPC_MXU_S32LDI
  *          │                            └─ 1 ─ OPC_MXU_S32LDIR
  *          │
- *          │                               23
+ *          │                               20 (13..10 don't care)
  *          ├─ 010101 ─ OPC_MXU__POOL09 ─┬─ 0 ─ OPC_MXU_S32SDI
  *          │                            └─ 1 ─ OPC_MXU_S32SDIR
  *          │
@@ -268,7 +268,7 @@
  *          │                               13..10
  *          ├─ 010111 ─ OPC_MXU__POOL11 ─┬─ 0000 ─ OPC_MXU_S32SDIV
  *          │                            └─ 0001 ─ OPC_MXU_S32SDIVR
- *          ├─ 011000 ─ OPC_MXU_D32ADD
+ *          ├─ 011000 ─ OPC_MXU_D32ADD  (catches D32ADDC too)
  *          │                               23..22
  *   MXU    ├─ 011001 ─ OPC_MXU__POOL12 ─┬─ 00 ─ OPC_MXU_D32ACC
  * opcodes ─┤                            ├─ 01 ─ OPC_MXU_D32ACCM
@@ -277,7 +277,7 @@
  *          │                               23..22
  *          ├─ 011011 ─ OPC_MXU__POOL13 ─┬─ 00 ─ OPC_MXU_Q16ACC
  *          │                            ├─ 01 ─ OPC_MXU_Q16ACCM
- *          │                            └─ 10 ─ OPC_MXU_Q16ASUM
+ *          │                            └─ 10 ─ OPC_MXU_D16ASUM
  *          │
  *          │                               23..22
  *          ├─ 011100 ─ OPC_MXU__POOL14 ─┬─ 00 ─ OPC_MXU_Q8ADDE
@@ -290,9 +290,9 @@
  *          ├─ 100010 ─ OPC_MXU_S8LDD
  *          ├─ 100011 ─ OPC_MXU_S8STD       15..14
  *          ├─ 100100 ─ OPC_MXU_S8LDI    ┌─ 00 ─ OPC_MXU_S32MUL
- *          ├─ 100101 ─ OPC_MXU_S8SDI    ├─ 00 ─ OPC_MXU_S32MULU
- *          │                            ├─ 00 ─ OPC_MXU_S32EXTR
- *          ├─ 100110 ─ OPC_MXU__POOL15 ─┴─ 00 ─ OPC_MXU_S32EXTRV
+ *          ├─ 100101 ─ OPC_MXU_S8SDI    ├─ 01 ─ OPC_MXU_S32MULU
+ *          │                            ├─ 10 ─ OPC_MXU_S32EXTR
+ *          ├─ 100110 ─ OPC_MXU__POOL15 ─┴─ 11 ─ OPC_MXU_S32EXTRV
  *          │
  *          │                               20..18
  *          ├─ 100111 ─ OPC_MXU__POOL16 ─┬─ 000 ─ OPC_MXU_D32SARW
@@ -304,7 +304,7 @@
  *          │                            ├─ 110 ─ OPC_MXU_S32OR
  *          │                            └─ 111 ─ OPC_MXU_S32XOR
  *          │
- *          │                               7..5
+ *          │                               8..6
  *          ├─ 101000 ─ OPC_MXU__POOL17 ─┬─ 000 ─ OPC_MXU_LXB
  *          │                            ├─ 001 ─ OPC_MXU_LXH
  *          ├─ 101001 ─ <not assigned>   ├─ 011 ─ OPC_MXU_LXW
@@ -318,15 +318,15 @@
  *          ├─ 110001 ─ OPC_MXU_D32SLR      20..18
  *          ├─ 110010 ─ OPC_MXU_D32SARL  ┌─ 000 ─ OPC_MXU_D32SLLV
  *          ├─ 110011 ─ OPC_MXU_D32SAR   ├─ 001 ─ OPC_MXU_D32SLRV
- *          ├─ 110100 ─ OPC_MXU_Q16SLL   ├─ 010 ─ OPC_MXU_D32SARV
- *          ├─ 110101 ─ OPC_MXU_Q16SLR   ├─ 011 ─ OPC_MXU_Q16SLLV
- *          │                            ├─ 100 ─ OPC_MXU_Q16SLRV
- *          ├─ 110110 ─ OPC_MXU__POOL18 ─┴─ 101 ─ OPC_MXU_Q16SARV
+ *          ├─ 110100 ─ OPC_MXU_Q16SLL   ├─ 011 ─ OPC_MXU_D32SARV
+ *          ├─ 110101 ─ OPC_MXU_Q16SLR   ├─ 100 ─ OPC_MXU_Q16SLLV
+ *          │                            ├─ 101 ─ OPC_MXU_Q16SLRV
+ *          ├─ 110110 ─ OPC_MXU__POOL18 ─┴─ 111 ─ OPC_MXU_Q16SARV
  *          │
  *          ├─ 110111 ─ OPC_MXU_Q16SAR
  *          │                               23..22
  *          ├─ 111000 ─ OPC_MXU__POOL19 ─┬─ 00 ─ OPC_MXU_Q8MUL
- *          │                            └─ 01 ─ OPC_MXU_Q8MULSU
+ *          │                            └─ 10 ─ OPC_MXU_Q8MULSU
  *          │
  *          │                               20..18
  *          ├─ 111001 ─ OPC_MXU__POOL20 ─┬─ 000 ─ OPC_MXU_Q8MOVZ
@@ -353,15 +353,62 @@
  */
 
 enum {
+    OPC_MXU_S32MADD  = 0x00,
+    OPC_MXU_S32MADDU = 0x01,
     OPC_MXU__POOL00  = 0x03,
+    OPC_MXU_S32MSUB  = 0x04,
+    OPC_MXU_S32MSUBU = 0x05,
+    OPC_MXU__POOL01  = 0x06,
+    OPC_MXU__POOL02  = 0x07,
     OPC_MXU_D16MUL   = 0x08,
+    OPC_MXU__POOL03  = 0x09,
     OPC_MXU_D16MAC   = 0x0A,
+    OPC_MXU_D16MACF  = 0x0B,
+    OPC_MXU_D16MADL  = 0x0C,
+    OPC_MXU_S16MAD   = 0x0D,
+    OPC_MXU_Q16ADD   = 0x0E,
+    OPC_MXU_D16MACE  = 0x0F,
     OPC_MXU__POOL04  = 0x10,
+    OPC_MXU__POOL05  = 0x11,
+    OPC_MXU__POOL06  = 0x12,
+    OPC_MXU__POOL07  = 0x13,
+    OPC_MXU__POOL08  = 0x14,
+    OPC_MXU__POOL09  = 0x15,
+    OPC_MXU__POOL10  = 0x16,
+    OPC_MXU__POOL11  = 0x17,
+    OPC_MXU_D32ADD   = 0x18,
+    OPC_MXU__POOL12  = 0x19,
+    OPC_MXU__POOL13  = 0x1B,
+    OPC_MXU__POOL14  = 0x1C,
+    OPC_MXU_Q8ACCE   = 0x1D,
     OPC_MXU_S8LDD    = 0x22,
+    OPC_MXU_S8STD    = 0x23,
+    OPC_MXU_S8LDI    = 0x24,
+    OPC_MXU_S8SDI    = 0x25,
+    OPC_MXU__POOL15  = 0x26,
     OPC_MXU__POOL16  = 0x27,
+    OPC_MXU__POOL17  = 0x28,
+    OPC_MXU_S16LDD   = 0x2A,
+    OPC_MXU_S16STD   = 0x2B,
+    OPC_MXU_S16LDI   = 0x2C,
+    OPC_MXU_S16SDI   = 0x2D,
     OPC_MXU_S32M2I   = 0x2E,
     OPC_MXU_S32I2M   = 0x2F,
+    OPC_MXU_D32SLL   = 0x30,
+    OPC_MXU_D32SLR   = 0x31,
+    OPC_MXU_D32SARL  = 0x32,
+    OPC_MXU_D32SAR   = 0x33,
+    OPC_MXU_Q16SLL   = 0x34,
+    OPC_MXU_Q16SLR   = 0x35,
+    OPC_MXU__POOL18  = 0x36,
+    OPC_MXU_Q16SAR   = 0x37,
     OPC_MXU__POOL19  = 0x38,
+    OPC_MXU__POOL20  = 0x39,
+    OPC_MXU__POOL21  = 0x3A,
+    OPC_MXU_Q16SCOP  = 0x3B,
+    OPC_MXU_Q8MADL   = 0x3C,
+    OPC_MXU_S32SFL   = 0x3D,
+    OPC_MXU_Q8SAD    = 0x3E,
 };
 
 
@@ -375,21 +422,94 @@ enum {
     OPC_MXU_D16MIN   = 0x03,
     OPC_MXU_Q8MAX    = 0x04,
     OPC_MXU_Q8MIN    = 0x05,
+    OPC_MXU_Q8SLT    = 0x06,
+    OPC_MXU_Q8SLTU   = 0x07,
 };
 
 /*
- * MXU pool 04
+ * MXU pool 01
  */
 enum {
-    OPC_MXU_S32LDD   = 0x00,
-    OPC_MXU_S32LDDR  = 0x01,
+    OPC_MXU_S32SLT   = 0x00,
+    OPC_MXU_D16SLT   = 0x01,
+    OPC_MXU_D16AVG   = 0x02,
+    OPC_MXU_D16AVGR  = 0x03,
+    OPC_MXU_Q8AVG    = 0x04,
+    OPC_MXU_Q8AVGR   = 0x05,
+    OPC_MXU_Q8ADD    = 0x07,
+};
+
+/*
+ * MXU pool 02
+ */
+enum {
+    OPC_MXU_S32CPS   = 0x00,
+    OPC_MXU_D16CPS   = 0x02,
+    OPC_MXU_Q8ABD    = 0x04,
+    OPC_MXU_Q16SAT   = 0x06,
+};
+
+/*
+ * MXU pool 03
+ */
+enum {
+    OPC_MXU_D16MULF  = 0x00,
+    OPC_MXU_D16MULE  = 0x01,
+};
+
+/*
+ * MXU pool 04 05 06 07 08 09 10 11
+ */
+enum {
+    OPC_MXU_S32LDST  = 0x00,
+    OPC_MXU_S32LDSTR = 0x01,
+};
+
+/*
+ * MXU pool 12
+ */
+enum {
+    OPC_MXU_D32ACC    = 0x00,
+    OPC_MXU_D32ACCM   = 0x01,
+    OPC_MXU_D32ASUM   = 0x02,
+};
+
+/*
+ * MXU pool 13
+ */
+enum {
+    OPC_MXU_Q16ACC    = 0x00,
+    OPC_MXU_Q16ACCM   = 0x01,
+    OPC_MXU_D16ASUM   = 0x02,
+};
+
+/*
+ * MXU pool 14
+ */
+enum {
+    OPC_MXU_Q8ADDE    = 0x00,
+    OPC_MXU_D8SUM     = 0x01,
+    OPC_MXU_D8SUMC    = 0x02,
+};
+
+/*
+ * MXU pool 15
+ */
+enum {
+    OPC_MXU_S32MUL    = 0x00,
+    OPC_MXU_S32MULU   = 0x01,
+    OPC_MXU_S32EXTR   = 0x02,
+    OPC_MXU_S32EXTRV  = 0x03,
 };
 
 /*
  * MXU pool 16
  */
 enum {
+    OPC_MXU_D32SARW  = 0x00,
+    OPC_MXU_S32ALN   = 0x01,
     OPC_MXU_S32ALNI  = 0x02,
+    OPC_MXU_S32LUI   = 0x03,
     OPC_MXU_S32NOR   = 0x04,
     OPC_MXU_S32AND   = 0x05,
     OPC_MXU_S32OR    = 0x06,
@@ -397,13 +517,57 @@ enum {
 };
 
 /*
+ * MXU pool 17
+ */
+enum {
+    OPC_MXU_LXB      = 0x00,
+    OPC_MXU_LXH      = 0x01,
+    OPC_MXU_LXW      = 0x03,
+    OPC_MXU_LXBU     = 0x04,
+    OPC_MXU_LXHU     = 0x05,
+};
+
+/*
+ * MXU pool 18
+ */
+enum {
+    OPC_MXU_D32SLLV  = 0x00,
+    OPC_MXU_D32SLRV  = 0x01,
+    OPC_MXU_D32SARV  = 0x03,
+    OPC_MXU_Q16SLLV  = 0x04,
+    OPC_MXU_Q16SLRV  = 0x05,
+    OPC_MXU_Q16SARV  = 0x07,
+};
+
+/*
  * MXU pool 19
  */
 enum {
     OPC_MXU_Q8MUL    = 0x00,
-    OPC_MXU_Q8MULSU  = 0x01,
+    OPC_MXU_Q8MULSU  = 0x02,
 };
 
+/*
+ * MXU pool 20
+ */
+enum {
+    OPC_MXU_Q8MOVZ   = 0x00,
+    OPC_MXU_Q8MOVN   = 0x01,
+    OPC_MXU_D16MOVZ  = 0x02,
+    OPC_MXU_D16MOVN  = 0x03,
+    OPC_MXU_S32MOVZ  = 0x04,
+    OPC_MXU_S32MOVN  = 0x05,
+};
+
+/*
+ * MXU pool 21
+ */
+enum {
+    OPC_MXU_Q8MAC    = 0x00,
+    OPC_MXU_Q8MACSU  = 0x02,
+};
+
+
 /* MXU accumulate add/subtract 1-bit pattern 'aptn1' */
 #define MXU_APTN1_A    0
 #define MXU_APTN1_S    1
@@ -537,8 +701,11 @@ static void gen_mxu_s32m2i(DisasContext *ctx)
 
 /*
  * S8LDD XRa, Rb, s8, optn3 - Load a byte from memory to XRF
+ *
+ * S8LDI XRa, Rb, s8, optn3 - Load a byte from memory to XRF,
+ * post modify address register
  */
-static void gen_mxu_s8ldd(DisasContext *ctx)
+static void gen_mxu_s8ldd(DisasContext *ctx, bool postmodify)
 {
     TCGv t0, t1;
     uint32_t XRa, Rb, s8, optn3;
@@ -553,6 +720,9 @@ static void gen_mxu_s8ldd(DisasContext *ctx)
 
     gen_load_gpr(t0, Rb);
     tcg_gen_addi_tl(t0, t0, (int8_t)s8);
+    if (postmodify) {
+        gen_store_gpr(t0, Rb);
+    }
 
     switch (optn3) {
     /* XRa[7:0] = tmp8 */
@@ -610,9 +780,208 @@ static void gen_mxu_s8ldd(DisasContext *ctx)
 }
 
 /*
- * D16MUL XRa, XRb, XRc, XRd, optn2 - Signed 16 bit pattern multiplication
+ * S8STD XRa, Rb, s8, optn3 - Store a byte from XRF to memory
+ *
+ * S8SDI XRa, Rb, s8, optn3 - Store a byte from XRF to memory,
+ * post modify address register
  */
-static void gen_mxu_d16mul(DisasContext *ctx)
+static void gen_mxu_s8std(DisasContext *ctx, bool postmodify)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, s8, optn3;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    s8 = extract32(ctx->opcode, 10, 8);
+    optn3 = extract32(ctx->opcode, 18, 3);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    if (optn3 > 3) {
+        /* reserved, do nothing */
+        return;
+    }
+
+    gen_load_gpr(t0, Rb);
+    tcg_gen_addi_tl(t0, t0, (int8_t)s8);
+    if (postmodify) {
+        gen_store_gpr(t0, Rb);
+    }
+    gen_load_mxu_gpr(t1, XRa);
+
+    switch (optn3) {
+    /* XRa[7:0] => tmp8 */
+    case MXU_OPTN3_PTN0:
+        tcg_gen_extract_tl(t1, t1, 0, 8);
+        break;
+    /* XRa[15:8] => tmp8 */
+    case MXU_OPTN3_PTN1:
+        tcg_gen_extract_tl(t1, t1, 8, 8);
+        break;
+    /* XRa[23:16] => tmp8 */
+    case MXU_OPTN3_PTN2:
+        tcg_gen_extract_tl(t1, t1, 16, 8);
+        break;
+    /* XRa[31:24] => tmp8 */
+    case MXU_OPTN3_PTN3:
+        tcg_gen_extract_tl(t1, t1, 24, 8);
+        break;
+    }
+
+    tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_UB);
+}
+
+/*
+ * S16LDD XRa, Rb, s10, optn2 - Load a halfword from memory to XRF
+ *
+ * S16LDI XRa, Rb, s10, optn2 - Load a halfword from memory to XRF,
+ * post modify address register
+ */
+static void gen_mxu_s16ldd(DisasContext *ctx, bool postmodify)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, optn2;
+    int32_t s10;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa   = extract32(ctx->opcode,   6, 4);
+    s10   = sextract32(ctx->opcode, 10, 9) * 2;
+    optn2 = extract32(ctx->opcode,  19, 2);
+    Rb    = extract32(ctx->opcode,  21, 5);
+
+    gen_load_gpr(t0, Rb);
+    tcg_gen_addi_tl(t0, t0, s10);
+    if (postmodify) {
+        gen_store_gpr(t0, Rb);
+    }
+
+    switch (optn2) {
+    /* XRa[15:0] = tmp16 */
+    case MXU_OPTN2_PTN0:
+        tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx, MO_UW);
+        gen_load_mxu_gpr(t0, XRa);
+        tcg_gen_deposit_tl(t0, t0, t1, 0, 16);
+        break;
+    /* XRa[31:16] = tmp16 */
+    case MXU_OPTN2_PTN1:
+        tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx, MO_UW);
+        gen_load_mxu_gpr(t0, XRa);
+        tcg_gen_deposit_tl(t0, t0, t1, 16, 16);
+        break;
+    /* XRa = sign_extend(tmp16) */
+    case MXU_OPTN2_PTN2:
+        tcg_gen_qemu_ld_tl(t0, t0, ctx->mem_idx, MO_SW);
+        break;
+    /* XRa = {tmp16, tmp16} */
+    case MXU_OPTN2_PTN3:
+        tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx, MO_UW);
+        tcg_gen_deposit_tl(t0, t1, t1,  0, 16);
+        tcg_gen_deposit_tl(t0, t1, t1, 16, 16);
+        break;
+    }
+
+    gen_store_mxu_gpr(t0, XRa);
+}
+
+/*
+ * S16STD XRa, Rb, s8, optn2 - Store a byte from XRF to memory
+ *
+ * S16SDI XRa, Rb, s8, optn2 - Store a byte from XRF to memory,
+ * post modify address register
+ */
+static void gen_mxu_s16std(DisasContext *ctx, bool postmodify)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, optn2;
+    int32_t s10;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    s10 = sextract32(ctx->opcode, 10, 9) * 2;
+    optn2 = extract32(ctx->opcode, 19, 2);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    if (optn2 > 1) {
+        /* reserved, do nothing */
+        return;
+    }
+
+    gen_load_gpr(t0, Rb);
+    tcg_gen_addi_tl(t0, t0, s10);
+    if (postmodify) {
+        gen_store_gpr(t0, Rb);
+    }
+    gen_load_mxu_gpr(t1, XRa);
+
+    switch (optn2) {
+    /* XRa[15:0] => tmp16 */
+    case MXU_OPTN2_PTN0:
+        tcg_gen_extract_tl(t1, t1, 0, 16);
+        break;
+    /* XRa[31:16] => tmp16 */
+    case MXU_OPTN2_PTN1:
+        tcg_gen_extract_tl(t1, t1, 16, 16);
+        break;
+    }
+
+    tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx, MO_UW);
+}
+
+/*
+ * S32MUL  XRa, XRd, rs, rt - Signed 32x32=>64 bit multiplication
+ * of GPR's and stores result into pair of MXU registers.
+ * It strains HI and LO registers.
+ *
+ * S32MULU XRa, XRd, rs, rt - Unsigned 32x32=>64 bit multiplication
+ * of GPR's and stores result into pair of MXU registers.
+ * It strains HI and LO registers.
+ */
+static void gen_mxu_s32mul(DisasContext *ctx, bool mulu)
+{
+    TCGv t0, t1;
+    uint32_t XRa, XRd, rs, rt;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRd = extract32(ctx->opcode, 10, 4);
+    rs  = extract32(ctx->opcode, 16, 5);
+    rt  = extract32(ctx->opcode, 21, 5);
+
+    if (unlikely(rs == 0 || rt == 0)) {
+        tcg_gen_movi_tl(t0, 0);
+        tcg_gen_movi_tl(t1, 0);
+    } else {
+        gen_load_gpr(t0, rs);
+        gen_load_gpr(t1, rt);
+
+        if (mulu) {
+            tcg_gen_mulu2_tl(t0, t1, t0, t1);
+        } else {
+            tcg_gen_muls2_tl(t0, t1, t0, t1);
+        }
+    }
+    tcg_gen_mov_tl(cpu_HI[0], t1);
+    tcg_gen_mov_tl(cpu_LO[0], t0);
+    gen_store_mxu_gpr(t1, XRa);
+    gen_store_mxu_gpr(t0, XRd);
+}
+
+/*
+ * D16MUL  XRa, XRb, XRc, XRd, optn2 - Signed 16 bit pattern multiplication
+ * D16MULF XRa, XRb, XRc, optn2 - Signed Q15 fraction pattern multiplication
+ *   with rounding and packing result
+ * D16MULE XRa, XRb, XRc, XRd, optn2 - Signed Q15 fraction pattern
+ *   multiplication with rounding
+ */
+static void gen_mxu_d16mul(DisasContext *ctx, bool fractional,
+                           bool packed_result)
 {
     TCGv t0, t1, t2, t3;
     uint32_t XRa, XRb, XRc, XRd, optn2;
@@ -628,6 +997,12 @@ static void gen_mxu_d16mul(DisasContext *ctx)
     XRd = extract32(ctx->opcode, 18, 4);
     optn2 = extract32(ctx->opcode, 22, 2);
 
+    /*
+     * TODO: XRd field isn't used for D16MULF
+     * There's no knowledge how this field affect
+     * instruction decoding/behavior
+     */
+
     gen_load_mxu_gpr(t1, XRb);
     tcg_gen_sextract_tl(t0, t1, 0, 16);
     tcg_gen_sextract_tl(t1, t1, 16, 16);
@@ -653,15 +1028,64 @@ static void gen_mxu_d16mul(DisasContext *ctx)
         tcg_gen_mul_tl(t2, t1, t2);
         break;
     }
-    gen_store_mxu_gpr(t3, XRa);
-    gen_store_mxu_gpr(t2, XRd);
+    if (fractional) {
+        TCGLabel *l_done = gen_new_label();
+        TCGv rounding = tcg_temp_new();
+
+        tcg_gen_shli_tl(t3, t3, 1);
+        tcg_gen_shli_tl(t2, t2, 1);
+        tcg_gen_andi_tl(rounding, mxu_CR, 0x2);
+        tcg_gen_brcondi_tl(TCG_COND_EQ, rounding, 0, l_done);
+        if (packed_result) {
+            TCGLabel *l_apply_bias_l = gen_new_label();
+            TCGLabel *l_apply_bias_r = gen_new_label();
+            TCGLabel *l_half_done = gen_new_label();
+            TCGv bias = tcg_temp_new();
+
+            /*
+             * D16MULF supports unbiased rounding aka "bankers rounding",
+             * "round to even", "convergent rounding"
+             */
+            tcg_gen_andi_tl(bias, mxu_CR, 0x4);
+            tcg_gen_brcondi_tl(TCG_COND_NE, bias, 0, l_apply_bias_l);
+            tcg_gen_andi_tl(t0, t3, 0x1ffff);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0x8000, l_half_done);
+            gen_set_label(l_apply_bias_l);
+            tcg_gen_addi_tl(t3, t3, 0x8000);
+            gen_set_label(l_half_done);
+            tcg_gen_brcondi_tl(TCG_COND_NE, bias, 0, l_apply_bias_r);
+            tcg_gen_andi_tl(t0, t2, 0x1ffff);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0x8000, l_done);
+            gen_set_label(l_apply_bias_r);
+            tcg_gen_addi_tl(t2, t2, 0x8000);
+        } else {
+            /* D16MULE doesn't support unbiased rounding */
+            tcg_gen_addi_tl(t3, t3, 0x8000);
+            tcg_gen_addi_tl(t2, t2, 0x8000);
+        }
+        gen_set_label(l_done);
+    }
+    if (!packed_result) {
+        gen_store_mxu_gpr(t3, XRa);
+        gen_store_mxu_gpr(t2, XRd);
+    } else {
+        tcg_gen_andi_tl(t3, t3, 0xffff0000);
+        tcg_gen_shri_tl(t2, t2, 16);
+        tcg_gen_or_tl(t3, t3, t2);
+        gen_store_mxu_gpr(t3, XRa);
+    }
 }
 
 /*
- * D16MAC XRa, XRb, XRc, XRd, aptn2, optn2 - Signed 16 bit pattern multiply
- *                                           and accumulate
+ * D16MAC XRa, XRb, XRc, XRd, aptn2, optn2
+ *   Signed 16 bit pattern multiply and accumulate
+ * D16MACF XRa, XRb, XRc, aptn2, optn2
+ *   Signed Q15 fraction pattern multiply accumulate and pack
+ * D16MACE XRa, XRb, XRc, XRd, aptn2, optn2
+ *   Signed Q15 fraction pattern multiply and accumulate
  */
-static void gen_mxu_d16mac(DisasContext *ctx)
+static void gen_mxu_d16mac(DisasContext *ctx, bool fractional,
+                           bool packed_result)
 {
     TCGv t0, t1, t2, t3;
     uint32_t XRa, XRb, XRc, XRd, optn2, aptn2;
@@ -704,6 +1128,11 @@ static void gen_mxu_d16mac(DisasContext *ctx)
         tcg_gen_mul_tl(t2, t1, t2);
         break;
     }
+
+    if (fractional) {
+        tcg_gen_shli_tl(t3, t3, 1);
+        tcg_gen_shli_tl(t2, t2, 1);
+    }
     gen_load_mxu_gpr(t0, XRa);
     gen_load_mxu_gpr(t1, XRd);
 
@@ -725,18 +1154,205 @@ static void gen_mxu_d16mac(DisasContext *ctx)
         tcg_gen_sub_tl(t2, t1, t2);
         break;
     }
-    gen_store_mxu_gpr(t3, XRa);
-    gen_store_mxu_gpr(t2, XRd);
+
+    if (fractional) {
+        TCGLabel *l_done = gen_new_label();
+        TCGv rounding = tcg_temp_new();
+
+        tcg_gen_andi_tl(rounding, mxu_CR, 0x2);
+        tcg_gen_brcondi_tl(TCG_COND_EQ, rounding, 0, l_done);
+        if (packed_result) {
+            TCGLabel *l_apply_bias_l = gen_new_label();
+            TCGLabel *l_apply_bias_r = gen_new_label();
+            TCGLabel *l_half_done = gen_new_label();
+            TCGv bias = tcg_temp_new();
+
+            /*
+             * D16MACF supports unbiased rounding aka "bankers rounding",
+             * "round to even", "convergent rounding"
+             */
+            tcg_gen_andi_tl(bias, mxu_CR, 0x4);
+            tcg_gen_brcondi_tl(TCG_COND_NE, bias, 0, l_apply_bias_l);
+            tcg_gen_andi_tl(t0, t3, 0x1ffff);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0x8000, l_half_done);
+            gen_set_label(l_apply_bias_l);
+            tcg_gen_addi_tl(t3, t3, 0x8000);
+            gen_set_label(l_half_done);
+            tcg_gen_brcondi_tl(TCG_COND_NE, bias, 0, l_apply_bias_r);
+            tcg_gen_andi_tl(t0, t2, 0x1ffff);
+            tcg_gen_brcondi_tl(TCG_COND_EQ, t0, 0x8000, l_done);
+            gen_set_label(l_apply_bias_r);
+            tcg_gen_addi_tl(t2, t2, 0x8000);
+        } else {
+            /* D16MACE doesn't support unbiased rounding */
+            tcg_gen_addi_tl(t3, t3, 0x8000);
+            tcg_gen_addi_tl(t2, t2, 0x8000);
+        }
+        gen_set_label(l_done);
+    }
+
+    if (!packed_result) {
+        gen_store_mxu_gpr(t3, XRa);
+        gen_store_mxu_gpr(t2, XRd);
+    } else {
+        tcg_gen_andi_tl(t3, t3, 0xffff0000);
+        tcg_gen_shri_tl(t2, t2, 16);
+        tcg_gen_or_tl(t3, t3, t2);
+        gen_store_mxu_gpr(t3, XRa);
+    }
 }
 
 /*
- * Q8MUL   XRa, XRb, XRc, XRd - Parallel unsigned 8 bit pattern multiply
- * Q8MULSU XRa, XRb, XRc, XRd - Parallel signed 8 bit pattern multiply
+ * D16MADL XRa, XRb, XRc, XRd, aptn2, optn2 - Double packed
+ * unsigned 16 bit pattern multiply and add/subtract.
  */
-static void gen_mxu_q8mul_q8mulsu(DisasContext *ctx)
+static void gen_mxu_d16madl(DisasContext *ctx)
+{
+    TCGv t0, t1, t2, t3;
+    uint32_t XRa, XRb, XRc, XRd, optn2, aptn2;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+    t2 = tcg_temp_new();
+    t3 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRd = extract32(ctx->opcode, 18, 4);
+    optn2 = extract32(ctx->opcode, 22, 2);
+    aptn2 = extract32(ctx->opcode, 24, 2);
+
+    gen_load_mxu_gpr(t1, XRb);
+    tcg_gen_sextract_tl(t0, t1,  0, 16);
+    tcg_gen_sextract_tl(t1, t1, 16, 16);
+
+    gen_load_mxu_gpr(t3, XRc);
+    tcg_gen_sextract_tl(t2, t3,  0, 16);
+    tcg_gen_sextract_tl(t3, t3, 16, 16);
+
+    switch (optn2) {
+    case MXU_OPTN2_WW: /* XRB.H*XRC.H == lop, XRB.L*XRC.L == rop */
+        tcg_gen_mul_tl(t3, t1, t3);
+        tcg_gen_mul_tl(t2, t0, t2);
+        break;
+    case MXU_OPTN2_LW: /* XRB.L*XRC.H == lop, XRB.L*XRC.L == rop */
+        tcg_gen_mul_tl(t3, t0, t3);
+        tcg_gen_mul_tl(t2, t0, t2);
+        break;
+    case MXU_OPTN2_HW: /* XRB.H*XRC.H == lop, XRB.H*XRC.L == rop */
+        tcg_gen_mul_tl(t3, t1, t3);
+        tcg_gen_mul_tl(t2, t1, t2);
+        break;
+    case MXU_OPTN2_XW: /* XRB.L*XRC.H == lop, XRB.H*XRC.L == rop */
+        tcg_gen_mul_tl(t3, t0, t3);
+        tcg_gen_mul_tl(t2, t1, t2);
+        break;
+    }
+    tcg_gen_extract_tl(t2, t2, 0, 16);
+    tcg_gen_extract_tl(t3, t3, 0, 16);
+
+    gen_load_mxu_gpr(t1, XRa);
+    tcg_gen_extract_tl(t0, t1,  0, 16);
+    tcg_gen_extract_tl(t1, t1, 16, 16);
+
+    switch (aptn2) {
+    case MXU_APTN2_AA:
+        tcg_gen_add_tl(t3, t1, t3);
+        tcg_gen_add_tl(t2, t0, t2);
+        break;
+    case MXU_APTN2_AS:
+        tcg_gen_add_tl(t3, t1, t3);
+        tcg_gen_sub_tl(t2, t0, t2);
+        break;
+    case MXU_APTN2_SA:
+        tcg_gen_sub_tl(t3, t1, t3);
+        tcg_gen_add_tl(t2, t0, t2);
+        break;
+    case MXU_APTN2_SS:
+        tcg_gen_sub_tl(t3, t1, t3);
+        tcg_gen_sub_tl(t2, t0, t2);
+        break;
+    }
+
+    tcg_gen_andi_tl(t2, t2, 0xffff);
+    tcg_gen_shli_tl(t3, t3, 16);
+    tcg_gen_or_tl(mxu_gpr[XRd - 1], t3, t2);
+}
+
+/*
+ * S16MAD XRa, XRb, XRc, XRd, aptn2, optn2 - Single packed
+ * signed 16 bit pattern multiply and 32-bit add/subtract.
+ */
+static void gen_mxu_s16mad(DisasContext *ctx)
+{
+    TCGv t0, t1;
+    uint32_t XRa, XRb, XRc, XRd, optn2, aptn1, pad;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRd = extract32(ctx->opcode, 18, 4);
+    optn2 = extract32(ctx->opcode, 22, 2);
+    aptn1 = extract32(ctx->opcode, 24, 1);
+    pad = extract32(ctx->opcode, 25, 1);
+
+    if (pad) {
+        /* FIXME check if it influence the result */
+    }
+
+    gen_load_mxu_gpr(t0, XRb);
+    gen_load_mxu_gpr(t1, XRc);
+
+    switch (optn2) {
+    case MXU_OPTN2_WW: /* XRB.H*XRC.H */
+        tcg_gen_sextract_tl(t0, t0, 16, 16);
+        tcg_gen_sextract_tl(t1, t1, 16, 16);
+        break;
+    case MXU_OPTN2_LW: /* XRB.L*XRC.L */
+        tcg_gen_sextract_tl(t0, t0,  0, 16);
+        tcg_gen_sextract_tl(t1, t1,  0, 16);
+        break;
+    case MXU_OPTN2_HW: /* XRB.H*XRC.L */
+        tcg_gen_sextract_tl(t0, t0, 16, 16);
+        tcg_gen_sextract_tl(t1, t1,  0, 16);
+        break;
+    case MXU_OPTN2_XW: /* XRB.L*XRC.H */
+        tcg_gen_sextract_tl(t0, t0,  0, 16);
+        tcg_gen_sextract_tl(t1, t1, 16, 16);
+        break;
+    }
+    tcg_gen_mul_tl(t0, t0, t1);
+
+    gen_load_mxu_gpr(t1, XRa);
+
+    switch (aptn1) {
+    case MXU_APTN1_A:
+        tcg_gen_add_tl(t1, t1, t0);
+        break;
+    case MXU_APTN1_S:
+        tcg_gen_sub_tl(t1, t1, t0);
+        break;
+    }
+
+    gen_store_mxu_gpr(t1, XRd);
+}
+
+/*
+ * Q8MUL   XRa, XRb, XRc, XRd - Parallel quad unsigned 8 bit multiply
+ * Q8MULSU XRa, XRb, XRc, XRd - Parallel quad signed 8 bit multiply
+ * Q8MAC   XRa, XRb, XRc, XRd - Parallel quad unsigned 8 bit multiply
+ *   and accumulate
+ * Q8MACSU XRa, XRb, XRc, XRd - Parallel quad signed 8 bit multiply
+ *   and accumulate
+ */
+static void gen_mxu_q8mul_mac(DisasContext *ctx, bool su, bool mac)
 {
     TCGv t0, t1, t2, t3, t4, t5, t6, t7;
-    uint32_t XRa, XRb, XRc, XRd, sel;
+    uint32_t XRa, XRb, XRc, XRd, aptn2;
 
     t0 = tcg_temp_new();
     t1 = tcg_temp_new();
@@ -751,90 +1367,311 @@ static void gen_mxu_q8mul_q8mulsu(DisasContext *ctx)
     XRb = extract32(ctx->opcode, 10, 4);
     XRc = extract32(ctx->opcode, 14, 4);
     XRd = extract32(ctx->opcode, 18, 4);
-    sel = extract32(ctx->opcode, 22, 2);
+    aptn2 = extract32(ctx->opcode, 24, 2);
 
     gen_load_mxu_gpr(t3, XRb);
     gen_load_mxu_gpr(t7, XRc);
 
-    if (sel == 0x2) {
-        /* Q8MULSU */
-        tcg_gen_ext8s_tl(t0, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t1, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t2, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t3, t3);
+    if (su) {
+        /* Q8MULSU / Q8MACSU */
+        tcg_gen_sextract_tl(t0, t3,  0, 8);
+        tcg_gen_sextract_tl(t1, t3,  8, 8);
+        tcg_gen_sextract_tl(t2, t3, 16, 8);
+        tcg_gen_sextract_tl(t3, t3, 24, 8);
     } else {
-        /* Q8MUL */
-        tcg_gen_ext8u_tl(t0, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t1, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t2, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t3, t3);
-    }
-
-    tcg_gen_ext8u_tl(t4, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t5, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t6, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t7, t7);
+        /* Q8MUL / Q8MAC */
+        tcg_gen_extract_tl(t0, t3,  0, 8);
+        tcg_gen_extract_tl(t1, t3,  8, 8);
+        tcg_gen_extract_tl(t2, t3, 16, 8);
+        tcg_gen_extract_tl(t3, t3, 24, 8);
+    }
+
+    tcg_gen_extract_tl(t4, t7,  0, 8);
+    tcg_gen_extract_tl(t5, t7,  8, 8);
+    tcg_gen_extract_tl(t6, t7, 16, 8);
+    tcg_gen_extract_tl(t7, t7, 24, 8);
 
     tcg_gen_mul_tl(t0, t0, t4);
     tcg_gen_mul_tl(t1, t1, t5);
     tcg_gen_mul_tl(t2, t2, t6);
     tcg_gen_mul_tl(t3, t3, t7);
 
-    tcg_gen_andi_tl(t0, t0, 0xFFFF);
-    tcg_gen_andi_tl(t1, t1, 0xFFFF);
-    tcg_gen_andi_tl(t2, t2, 0xFFFF);
-    tcg_gen_andi_tl(t3, t3, 0xFFFF);
-
-    tcg_gen_shli_tl(t1, t1, 16);
-    tcg_gen_shli_tl(t3, t3, 16);
+    if (mac) {
+        gen_load_mxu_gpr(t4, XRd);
+        gen_load_mxu_gpr(t5, XRa);
+        tcg_gen_extract_tl(t6, t4,  0, 16);
+        tcg_gen_extract_tl(t7, t4, 16, 16);
+        if (aptn2 & 1) {
+            tcg_gen_sub_tl(t0, t6, t0);
+            tcg_gen_sub_tl(t1, t7, t1);
+        } else {
+            tcg_gen_add_tl(t0, t6, t0);
+            tcg_gen_add_tl(t1, t7, t1);
+        }
+        tcg_gen_extract_tl(t6, t5,  0, 16);
+        tcg_gen_extract_tl(t7, t5, 16, 16);
+        if (aptn2 & 2) {
+            tcg_gen_sub_tl(t2, t6, t2);
+            tcg_gen_sub_tl(t3, t7, t3);
+        } else {
+            tcg_gen_add_tl(t2, t6, t2);
+            tcg_gen_add_tl(t3, t7, t3);
+        }
+    }
 
-    tcg_gen_or_tl(t0, t0, t1);
-    tcg_gen_or_tl(t1, t2, t3);
+    tcg_gen_deposit_tl(t0, t0, t1, 16, 16);
+    tcg_gen_deposit_tl(t1, t2, t3, 16, 16);
 
     gen_store_mxu_gpr(t0, XRd);
     gen_store_mxu_gpr(t1, XRa);
 }
 
 /*
+ * Q8MADL  XRd, XRa, XRb, XRc
+ *   Parallel quad unsigned 8 bit multiply and accumulate.
+ *   e.g. XRd[0..3] = XRa[0..3] + XRb[0..3] * XRc[0..3]
+ */
+static void gen_mxu_q8madl(DisasContext *ctx)
+{
+    TCGv t0, t1, t2, t3, t4, t5, t6, t7;
+    uint32_t XRa, XRb, XRc, XRd, aptn2;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+    t2 = tcg_temp_new();
+    t3 = tcg_temp_new();
+    t4 = tcg_temp_new();
+    t5 = tcg_temp_new();
+    t6 = tcg_temp_new();
+    t7 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRd = extract32(ctx->opcode, 18, 4);
+    aptn2 = extract32(ctx->opcode, 24, 2);
+
+    gen_load_mxu_gpr(t3, XRb);
+    gen_load_mxu_gpr(t7, XRc);
+
+    tcg_gen_extract_tl(t0, t3,  0, 8);
+    tcg_gen_extract_tl(t1, t3,  8, 8);
+    tcg_gen_extract_tl(t2, t3, 16, 8);
+    tcg_gen_extract_tl(t3, t3, 24, 8);
+
+    tcg_gen_extract_tl(t4, t7,  0, 8);
+    tcg_gen_extract_tl(t5, t7,  8, 8);
+    tcg_gen_extract_tl(t6, t7, 16, 8);
+    tcg_gen_extract_tl(t7, t7, 24, 8);
+
+    tcg_gen_mul_tl(t0, t0, t4);
+    tcg_gen_mul_tl(t1, t1, t5);
+    tcg_gen_mul_tl(t2, t2, t6);
+    tcg_gen_mul_tl(t3, t3, t7);
+
+    gen_load_mxu_gpr(t4, XRa);
+    tcg_gen_extract_tl(t6, t4, 0, 8);
+    tcg_gen_extract_tl(t7, t4, 8, 8);
+    if (aptn2 & 1) {
+        tcg_gen_sub_tl(t0, t6, t0);
+        tcg_gen_sub_tl(t1, t7, t1);
+    } else {
+        tcg_gen_add_tl(t0, t6, t0);
+        tcg_gen_add_tl(t1, t7, t1);
+    }
+    tcg_gen_extract_tl(t6, t4, 16, 8);
+    tcg_gen_extract_tl(t7, t4, 24, 8);
+    if (aptn2 & 2) {
+        tcg_gen_sub_tl(t2, t6, t2);
+        tcg_gen_sub_tl(t3, t7, t3);
+    } else {
+        tcg_gen_add_tl(t2, t6, t2);
+        tcg_gen_add_tl(t3, t7, t3);
+    }
+
+    tcg_gen_andi_tl(t5, t0, 0xff);
+    tcg_gen_deposit_tl(t5, t5, t1,  8, 8);
+    tcg_gen_deposit_tl(t5, t5, t2, 16, 8);
+    tcg_gen_deposit_tl(t5, t5, t3, 24, 8);
+
+    gen_store_mxu_gpr(t5, XRd);
+}
+
+/*
  * S32LDD  XRa, Rb, S12 - Load a word from memory to XRF
- * S32LDDR XRa, Rb, S12 - Load a word from memory to XRF, reversed byte seq.
+ * S32LDDR XRa, Rb, S12 - Load a word from memory to XRF
+ *   in reversed byte seq.
+ * S32LDI  XRa, Rb, S12 - Load a word from memory to XRF,
+ *   post modify base address GPR.
+ * S32LDIR XRa, Rb, S12 - Load a word from memory to XRF,
+ *   post modify base address GPR and load in reversed byte seq.
  */
-static void gen_mxu_s32ldd_s32lddr(DisasContext *ctx)
+static void gen_mxu_s32ldxx(DisasContext *ctx, bool reversed, bool postinc)
 {
     TCGv t0, t1;
-    uint32_t XRa, Rb, s12, sel;
+    uint32_t XRa, Rb, s12;
 
     t0 = tcg_temp_new();
     t1 = tcg_temp_new();
 
     XRa = extract32(ctx->opcode, 6, 4);
-    s12 = extract32(ctx->opcode, 10, 10);
-    sel = extract32(ctx->opcode, 20, 1);
+    s12 = sextract32(ctx->opcode, 10, 10);
     Rb = extract32(ctx->opcode, 21, 5);
 
     gen_load_gpr(t0, Rb);
+    tcg_gen_movi_tl(t1, s12 * 4);
+    tcg_gen_add_tl(t0, t0, t1);
+
+    tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx,
+                       (MO_TESL ^ (reversed ? MO_BSWAP : 0)) |
+                        ctx->default_tcg_memop_mask);
+    gen_store_mxu_gpr(t1, XRa);
 
-    tcg_gen_movi_tl(t1, s12);
-    tcg_gen_shli_tl(t1, t1, 2);
-    if (s12 & 0x200) {
-        tcg_gen_ori_tl(t1, t1, 0xFFFFF000);
+    if (postinc) {
+        gen_store_gpr(t0, Rb);
     }
-    tcg_gen_add_tl(t1, t0, t1);
-    tcg_gen_qemu_ld_tl(t1, t1, ctx->mem_idx, (MO_TESL ^ (sel * MO_BSWAP)) |
-                       ctx->default_tcg_memop_mask);
+}
+
+/*
+ * S32STD  XRa, Rb, S12 - Store a word from XRF to memory
+ * S32STDR XRa, Rb, S12 - Store a word from XRF to memory
+ *   in reversed byte seq.
+ * S32SDI  XRa, Rb, S12 - Store a word from XRF to memory,
+ *   post modify base address GPR.
+ * S32SDIR XRa, Rb, S12 - Store a word from XRF to memory,
+ *   post modify base address GPR and store in reversed byte seq.
+ */
+static void gen_mxu_s32stxx(DisasContext *ctx, bool reversed, bool postinc)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, s12;
 
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    s12 = sextract32(ctx->opcode, 10, 10);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    gen_load_gpr(t0, Rb);
+    tcg_gen_movi_tl(t1, s12 * 4);
+    tcg_gen_add_tl(t0, t0, t1);
+
+    gen_load_mxu_gpr(t1, XRa);
+    tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx,
+                       (MO_TESL ^ (reversed ? MO_BSWAP : 0)) |
+                        ctx->default_tcg_memop_mask);
+
+    if (postinc) {
+        gen_store_gpr(t0, Rb);
+    }
+}
+
+/*
+ * S32LDDV  XRa, Rb, Rc, STRD2 - Load a word from memory to XRF
+ * S32LDDVR XRa, Rb, Rc, STRD2 - Load a word from memory to XRF
+ *   in reversed byte seq.
+ * S32LDIV  XRa, Rb, Rc, STRD2 - Load a word from memory to XRF,
+ *   post modify base address GPR.
+ * S32LDIVR XRa, Rb, Rc, STRD2 - Load a word from memory to XRF,
+ *   post modify base address GPR and load in reversed byte seq.
+ */
+static void gen_mxu_s32ldxvx(DisasContext *ctx, bool reversed,
+                             bool postinc, uint32_t strd2)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, Rc;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    Rc = extract32(ctx->opcode, 16, 5);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    gen_load_gpr(t0, Rb);
+    gen_load_gpr(t1, Rc);
+    tcg_gen_shli_tl(t1, t1, strd2);
+    tcg_gen_add_tl(t0, t0, t1);
+
+    tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx,
+                       (MO_TESL ^ (reversed ? MO_BSWAP : 0)) |
+                        ctx->default_tcg_memop_mask);
     gen_store_mxu_gpr(t1, XRa);
+
+    if (postinc) {
+        gen_store_gpr(t0, Rb);
+    }
 }
 
+/*
+ * LXW  Ra, Rb, Rc, STRD2 - Load a word from memory to GPR
+ * LXB  Ra, Rb, Rc, STRD2 - Load a byte from memory to GPR,
+ *   sign extending to GPR size.
+ * LXH  Ra, Rb, Rc, STRD2 - Load a byte from memory to GPR,
+ *   sign extending to GPR size.
+ * LXBU Ra, Rb, Rc, STRD2 - Load a halfword from memory to GPR,
+ *   zero extending to GPR size.
+ * LXHU Ra, Rb, Rc, STRD2 - Load a halfword from memory to GPR,
+ *   zero extending to GPR size.
+ */
+static void gen_mxu_lxx(DisasContext *ctx, uint32_t strd2, MemOp mop)
+{
+    TCGv t0, t1;
+    uint32_t Ra, Rb, Rc;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    Ra = extract32(ctx->opcode, 11, 5);
+    Rc = extract32(ctx->opcode, 16, 5);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    gen_load_gpr(t0, Rb);
+    gen_load_gpr(t1, Rc);
+    tcg_gen_shli_tl(t1, t1, strd2);
+    tcg_gen_add_tl(t0, t0, t1);
+
+    tcg_gen_qemu_ld_tl(t1, t0, ctx->mem_idx, mop | ctx->default_tcg_memop_mask);
+    gen_store_gpr(t1, Ra);
+}
+
+/*
+ * S32STDV  XRa, Rb, Rc, STRD2 - Load a word from memory to XRF
+ * S32STDVR XRa, Rb, Rc, STRD2 - Load a word from memory to XRF
+ *   in reversed byte seq.
+ * S32SDIV  XRa, Rb, Rc, STRD2 - Load a word from memory to XRF,
+ *   post modify base address GPR.
+ * S32SDIVR XRa, Rb, Rc, STRD2 - Load a word from memory to XRF,
+ *   post modify base address GPR and store in reversed byte seq.
+ */
+static void gen_mxu_s32stxvx(DisasContext *ctx, bool reversed,
+                             bool postinc, uint32_t strd2)
+{
+    TCGv t0, t1;
+    uint32_t XRa, Rb, Rc;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+
+    XRa = extract32(ctx->opcode, 6, 4);
+    Rc = extract32(ctx->opcode, 16, 5);
+    Rb = extract32(ctx->opcode, 21, 5);
+
+    gen_load_gpr(t0, Rb);
+    gen_load_gpr(t1, Rc);
+    tcg_gen_shli_tl(t1, t1, strd2);
+    tcg_gen_add_tl(t0, t0, t1);
+
+    gen_load_mxu_gpr(t1, XRa);
+    tcg_gen_qemu_st_tl(t1, t0, ctx->mem_idx,
+                       (MO_TESL ^ (reversed ? MO_BSWAP : 0)) |
+                        ctx->default_tcg_memop_mask);
+
+    if (postinc) {
+        gen_store_gpr(t0, Rb);
+    }
+}
 
 /*
  *                 MXU instruction category: logic
@@ -981,13 +1818,291 @@ static void gen_mxu_S32XOR(DisasContext *ctx)
     }
 }
 
+/*
+ *                 MXU instruction category: shift
+ *                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *               D32SLL    D32SLR    D32SAR    D32SARL
+ *               D32SLLV   D32SLRV   D32SARV   D32SARW
+ *               Q16SLL    Q16SLR    Q16SAR
+ *               Q16SLLV   Q16SLRV   Q16SARV
+ */
+
+/*
+ *  D32SLL XRa, XRd, XRb, XRc, SFT4
+ *    Dual 32-bit shift left from XRb and XRc to SFT4
+ *    bits (0..15). Store to XRa and XRd respectively.
+ *  D32SLR XRa, XRd, XRb, XRc, SFT4
+ *    Dual 32-bit shift logic right from XRb and XRc
+ *    to SFT4 bits (0..15). Store to XRa and XRd respectively.
+ *  D32SAR XRa, XRd, XRb, XRc, SFT4
+ *    Dual 32-bit shift arithmetic right from XRb and XRc
+ *    to SFT4 bits (0..15). Store to XRa and XRd respectively.
+ */
+static void gen_mxu_d32sxx(DisasContext *ctx, bool right, bool arithmetic)
+{
+    uint32_t XRa, XRb, XRc, XRd, sft4;
+
+    XRa  = extract32(ctx->opcode,  6, 4);
+    XRb  = extract32(ctx->opcode, 10, 4);
+    XRc  = extract32(ctx->opcode, 14, 4);
+    XRd  = extract32(ctx->opcode, 18, 4);
+    sft4 = extract32(ctx->opcode, 22, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t0, XRb);
+    gen_load_mxu_gpr(t1, XRc);
+
+    if (right) {
+        if (arithmetic) {
+            tcg_gen_sari_tl(t0, t0, sft4);
+            tcg_gen_sari_tl(t1, t1, sft4);
+        } else {
+            tcg_gen_shri_tl(t0, t0, sft4);
+            tcg_gen_shri_tl(t1, t1, sft4);
+        }
+    } else {
+        tcg_gen_shli_tl(t0, t0, sft4);
+        tcg_gen_shli_tl(t1, t1, sft4);
+    }
+    gen_store_mxu_gpr(t0, XRa);
+    gen_store_mxu_gpr(t1, XRd);
+}
 
 /*
- *                   MXU instruction category max/min
+ *  D32SLLV XRa, XRd, rs
+ *    Dual 32-bit shift left from XRa and XRd to rs[3:0]
+ *    bits. Store back to XRa and XRd respectively.
+ *  D32SLRV XRa, XRd, rs
+ *    Dual 32-bit shift logic right from XRa and XRd to rs[3:0]
+ *    bits. Store back to XRa and XRd respectively.
+ *  D32SARV XRa, XRd, rs
+ *    Dual 32-bit shift arithmetic right from XRa and XRd to rs[3:0]
+ *    bits. Store back to XRa and XRd respectively.
+ */
+static void gen_mxu_d32sxxv(DisasContext *ctx, bool right, bool arithmetic)
+{
+    uint32_t XRa, XRd, rs;
+
+    XRa = extract32(ctx->opcode, 10, 4);
+    XRd = extract32(ctx->opcode, 14, 4);
+    rs  = extract32(ctx->opcode, 21, 5);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t0, XRa);
+    gen_load_mxu_gpr(t1, XRd);
+    gen_load_gpr(t2, rs);
+    tcg_gen_andi_tl(t2, t2, 0x0f);
+
+    if (right) {
+        if (arithmetic) {
+            tcg_gen_sar_tl(t0, t0, t2);
+            tcg_gen_sar_tl(t1, t1, t2);
+        } else {
+            tcg_gen_shr_tl(t0, t0, t2);
+            tcg_gen_shr_tl(t1, t1, t2);
+        }
+    } else {
+        tcg_gen_shl_tl(t0, t0, t2);
+        tcg_gen_shl_tl(t1, t1, t2);
+    }
+    gen_store_mxu_gpr(t0, XRa);
+    gen_store_mxu_gpr(t1, XRd);
+}
+
+/*
+ *  D32SARL XRa, XRb, XRc, SFT4
+ *    Dual shift arithmetic right 32-bit integers in XRb and XRc
+ *    to SFT4 bits (0..15). Pack 16 LSBs of each into XRa.
+ *
+ *  D32SARW XRa, XRb, XRc, rb
+ *    Dual shift arithmetic right 32-bit integers in XRb and XRc
+ *    to rb[3:0] bits. Pack 16 LSBs of each into XRa.
+ */
+static void gen_mxu_d32sarl(DisasContext *ctx, bool sarw)
+{
+    uint32_t XRa, XRb, XRc, rb;
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+    rb  = extract32(ctx->opcode, 21, 5);
+
+    if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else {
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+
+        if (!sarw) {
+            /* Make SFT4 from rb field */
+            tcg_gen_movi_tl(t2, rb >> 1);
+        } else {
+            gen_load_gpr(t2, rb);
+            tcg_gen_andi_tl(t2, t2, 0x0f);
+        }
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        tcg_gen_sar_tl(t0, t0, t2);
+        tcg_gen_sar_tl(t1, t1, t2);
+        tcg_gen_extract_tl(t2, t1, 0, 16);
+        tcg_gen_deposit_tl(t2, t2, t0, 16, 16);
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q16SLL XRa, XRd, XRb, XRc, SFT4
+ *    Quad 16-bit shift left from XRb and XRc to SFT4
+ *    bits (0..15). Store to XRa and XRd respectively.
+ *  Q16SLR XRa, XRd, XRb, XRc, SFT4
+ *    Quad 16-bit shift logic right from XRb and XRc
+ *    to SFT4 bits (0..15). Store to XRa and XRd respectively.
+ *  Q16SAR XRa, XRd, XRb, XRc, SFT4
+ *    Quad 16-bit shift arithmetic right from XRb and XRc
+ *    to SFT4 bits (0..15). Store to XRa and XRd respectively.
+ */
+static void gen_mxu_q16sxx(DisasContext *ctx, bool right, bool arithmetic)
+{
+    uint32_t XRa, XRb, XRc, XRd, sft4;
+
+    XRa  = extract32(ctx->opcode,  6, 4);
+    XRb  = extract32(ctx->opcode, 10, 4);
+    XRc  = extract32(ctx->opcode, 14, 4);
+    XRd  = extract32(ctx->opcode, 18, 4);
+    sft4 = extract32(ctx->opcode, 22, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t0, XRb);
+    gen_load_mxu_gpr(t2, XRc);
+
+    if (arithmetic) {
+        tcg_gen_sextract_tl(t1, t0, 16, 16);
+        tcg_gen_sextract_tl(t0, t0,  0, 16);
+        tcg_gen_sextract_tl(t3, t2, 16, 16);
+        tcg_gen_sextract_tl(t2, t2,  0, 16);
+    } else {
+        tcg_gen_extract_tl(t1, t0, 16, 16);
+        tcg_gen_extract_tl(t0, t0,  0, 16);
+        tcg_gen_extract_tl(t3, t2, 16, 16);
+        tcg_gen_extract_tl(t2, t2,  0, 16);
+    }
+
+    if (right) {
+        if (arithmetic) {
+            tcg_gen_sari_tl(t0, t0, sft4);
+            tcg_gen_sari_tl(t1, t1, sft4);
+            tcg_gen_sari_tl(t2, t2, sft4);
+            tcg_gen_sari_tl(t3, t3, sft4);
+        } else {
+            tcg_gen_shri_tl(t0, t0, sft4);
+            tcg_gen_shri_tl(t1, t1, sft4);
+            tcg_gen_shri_tl(t2, t2, sft4);
+            tcg_gen_shri_tl(t3, t3, sft4);
+        }
+    } else {
+        tcg_gen_shli_tl(t0, t0, sft4);
+        tcg_gen_shli_tl(t1, t1, sft4);
+        tcg_gen_shli_tl(t2, t2, sft4);
+        tcg_gen_shli_tl(t3, t3, sft4);
+    }
+    tcg_gen_deposit_tl(t0, t0, t1, 16, 16);
+    tcg_gen_deposit_tl(t2, t2, t3, 16, 16);
+
+    gen_store_mxu_gpr(t0, XRa);
+    gen_store_mxu_gpr(t2, XRd);
+}
+
+/*
+ *  Q16SLLV XRa, XRd, rs
+ *    Quad 16-bit shift left from XRa and XRd to rs[3:0]
+ *    bits. Store to XRa and XRd respectively.
+ *  Q16SLRV XRa, XRd, rs
+ *    Quad 16-bit shift logic right from XRa and XRd to rs[3:0]
+ *    bits. Store to XRa and XRd respectively.
+ *  Q16SARV XRa, XRd, rs
+ *    Quad 16-bit shift arithmetic right from XRa and XRd to rs[3:0]
+ *    bits. Store to XRa and XRd respectively.
+ */
+static void gen_mxu_q16sxxv(DisasContext *ctx, bool right, bool arithmetic)
+{
+    uint32_t XRa, XRd, rs;
+
+    XRa = extract32(ctx->opcode, 10, 4);
+    XRd = extract32(ctx->opcode, 14, 4);
+    rs  = extract32(ctx->opcode, 21, 5);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGv t5 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t0, XRa);
+    gen_load_mxu_gpr(t2, XRd);
+    gen_load_gpr(t5, rs);
+    tcg_gen_andi_tl(t5, t5, 0x0f);
+
+
+    if (arithmetic) {
+        tcg_gen_sextract_tl(t1, t0, 16, 16);
+        tcg_gen_sextract_tl(t0, t0,  0, 16);
+        tcg_gen_sextract_tl(t3, t2, 16, 16);
+        tcg_gen_sextract_tl(t2, t2,  0, 16);
+    } else {
+        tcg_gen_extract_tl(t1, t0, 16, 16);
+        tcg_gen_extract_tl(t0, t0,  0, 16);
+        tcg_gen_extract_tl(t3, t2, 16, 16);
+        tcg_gen_extract_tl(t2, t2,  0, 16);
+    }
+
+    if (right) {
+        if (arithmetic) {
+            tcg_gen_sar_tl(t0, t0, t5);
+            tcg_gen_sar_tl(t1, t1, t5);
+            tcg_gen_sar_tl(t2, t2, t5);
+            tcg_gen_sar_tl(t3, t3, t5);
+        } else {
+            tcg_gen_shr_tl(t0, t0, t5);
+            tcg_gen_shr_tl(t1, t1, t5);
+            tcg_gen_shr_tl(t2, t2, t5);
+            tcg_gen_shr_tl(t3, t3, t5);
+        }
+    } else {
+        tcg_gen_shl_tl(t0, t0, t5);
+        tcg_gen_shl_tl(t1, t1, t5);
+        tcg_gen_shl_tl(t2, t2, t5);
+        tcg_gen_shl_tl(t3, t3, t5);
+    }
+    tcg_gen_deposit_tl(t0, t0, t1, 16, 16);
+    tcg_gen_deposit_tl(t2, t2, t3, 16, 16);
+
+    gen_store_mxu_gpr(t0, XRa);
+    gen_store_mxu_gpr(t2, XRd);
+}
+
+/*
+ *                   MXU instruction category max/min/avg
  *                   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  *                     S32MAX     D16MAX     Q8MAX
  *                     S32MIN     D16MIN     Q8MIN
+ *                     S32SLT     D16SLT     Q8SLT
+ *                                           Q8SLTU
+ *                                D16AVG     Q8AVG
+ *                                D16AVGR    Q8AVGR
+ *                     S32MOVZ    D16MOVZ    Q8MOVZ
+ *                     S32MOVN    D16MOVN    Q8MOVN
  */
 
 /*
@@ -1072,13 +2187,14 @@ static void gen_mxu_D16MAX_D16MIN(DisasContext *ctx)
         /* ...and do half-word-wise max/min with one operand 0 */
         TCGv_i32 t0 = tcg_temp_new();
         TCGv_i32 t1 = tcg_constant_i32(0);
+        TCGv_i32 t2 = tcg_temp_new();
 
         /* the left half-word first */
         tcg_gen_andi_i32(t0, mxu_gpr[XRx - 1], 0xFFFF0000);
         if (opc == OPC_MXU_D16MAX) {
-            tcg_gen_smax_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smax_i32(t2, t0, t1);
         } else {
-            tcg_gen_smin_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smin_i32(t2, t0, t1);
         }
 
         /* the right half-word */
@@ -1094,7 +2210,7 @@ static void gen_mxu_D16MAX_D16MIN(DisasContext *ctx)
         /* return resulting half-words to its original position */
         tcg_gen_shri_i32(t0, t0, 16);
         /* finally update the destination */
-        tcg_gen_or_i32(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+        tcg_gen_or_i32(mxu_gpr[XRa - 1], t2, t0);
     } else if (unlikely(XRb == XRc)) {
         /* both operands same -> just set destination to one of them */
         tcg_gen_mov_i32(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
@@ -1102,14 +2218,15 @@ static void gen_mxu_D16MAX_D16MIN(DisasContext *ctx)
         /* the most general case */
         TCGv_i32 t0 = tcg_temp_new();
         TCGv_i32 t1 = tcg_temp_new();
+        TCGv_i32 t2 = tcg_temp_new();
 
         /* the left half-word first */
         tcg_gen_andi_i32(t0, mxu_gpr[XRb - 1], 0xFFFF0000);
         tcg_gen_andi_i32(t1, mxu_gpr[XRc - 1], 0xFFFF0000);
         if (opc == OPC_MXU_D16MAX) {
-            tcg_gen_smax_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smax_i32(t2, t0, t1);
         } else {
-            tcg_gen_smin_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smin_i32(t2, t0, t1);
         }
 
         /* the right half-word */
@@ -1127,7 +2244,7 @@ static void gen_mxu_D16MAX_D16MIN(DisasContext *ctx)
         /* return resulting half-words to its original position */
         tcg_gen_shri_i32(t0, t0, 16);
         /* finally update the destination */
-        tcg_gen_or_i32(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+        tcg_gen_or_i32(mxu_gpr[XRa - 1], t2, t0);
     }
 }
 
@@ -1163,14 +2280,15 @@ static void gen_mxu_Q8MAX_Q8MIN(DisasContext *ctx)
         /* ...and do byte-wise max/min with one operand 0 */
         TCGv_i32 t0 = tcg_temp_new();
         TCGv_i32 t1 = tcg_constant_i32(0);
+        TCGv_i32 t2 = tcg_temp_new();
         int32_t i;
 
         /* the leftmost byte (byte 3) first */
         tcg_gen_andi_i32(t0, mxu_gpr[XRx - 1], 0xFF000000);
         if (opc == OPC_MXU_Q8MAX) {
-            tcg_gen_smax_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smax_i32(t2, t0, t1);
         } else {
-            tcg_gen_smin_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smin_i32(t2, t0, t1);
         }
 
         /* bytes 2, 1, 0 */
@@ -1188,8 +2306,9 @@ static void gen_mxu_Q8MAX_Q8MIN(DisasContext *ctx)
             /* return resulting byte to its original position */
             tcg_gen_shri_i32(t0, t0, 8 * (3 - i));
             /* finally update the destination */
-            tcg_gen_or_i32(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+            tcg_gen_or_i32(t2, t2, t0);
         }
+        gen_store_mxu_gpr(t2, XRa);
     } else if (unlikely(XRb == XRc)) {
         /* both operands same -> just set destination to one of them */
         tcg_gen_mov_i32(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
@@ -1197,15 +2316,16 @@ static void gen_mxu_Q8MAX_Q8MIN(DisasContext *ctx)
         /* the most general case */
         TCGv_i32 t0 = tcg_temp_new();
         TCGv_i32 t1 = tcg_temp_new();
+        TCGv_i32 t2 = tcg_temp_new();
         int32_t i;
 
         /* the leftmost bytes (bytes 3) first */
         tcg_gen_andi_i32(t0, mxu_gpr[XRb - 1], 0xFF000000);
         tcg_gen_andi_i32(t1, mxu_gpr[XRc - 1], 0xFF000000);
         if (opc == OPC_MXU_Q8MAX) {
-            tcg_gen_smax_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smax_i32(t2, t0, t1);
         } else {
-            tcg_gen_smin_i32(mxu_gpr[XRa - 1], t0, t1);
+            tcg_gen_smin_i32(t2, t0, t1);
         }
 
         /* bytes 2, 1, 0 */
@@ -1225,11 +2345,1741 @@ static void gen_mxu_Q8MAX_Q8MIN(DisasContext *ctx)
             /* return resulting byte to its original position */
             tcg_gen_shri_i32(t0, t0, 8 * (3 - i));
             /* finally update the destination */
-            tcg_gen_or_i32(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+            tcg_gen_or_i32(t2, t2, t0);
         }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q8SLT
+ *    Update XRa with the signed "set less than" comparison of XRb and XRc
+ *    on per-byte basis.
+ *    a.k.a. XRa[0..3] = XRb[0..3] < XRc[0..3] ? 1 : 0;
+ *
+ *  Q8SLTU
+ *    Update XRa with the unsigned "set less than" comparison of XRb and XRc
+ *    on per-byte basis.
+ *    a.k.a. XRa[0..3] = XRb[0..3] < XRc[0..3] ? 1 : 0;
+ */
+static void gen_mxu_q8slt(DisasContext *ctx, bool sltu)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRb == XRc)) {
+        /* both operands same registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+        tcg_gen_movi_tl(t2, 0);
+
+        for (int i = 0; i < 4; i++) {
+            if (sltu) {
+                tcg_gen_extract_tl(t0, t3, 8 * i, 8);
+                tcg_gen_extract_tl(t1, t4, 8 * i, 8);
+            } else {
+                tcg_gen_sextract_tl(t0, t3, 8 * i, 8);
+                tcg_gen_sextract_tl(t1, t4, 8 * i, 8);
+            }
+            tcg_gen_setcond_tl(TCG_COND_LT, t0, t0, t1);
+            tcg_gen_deposit_tl(t2, t2, t0, 8 * i, 8);
+        }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  S32SLT
+ *    Update XRa with the signed "set less than" comparison of XRb and XRc.
+ *    a.k.a. XRa = XRb < XRc ? 1 : 0;
+ */
+static void gen_mxu_S32SLT(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRb == XRc)) {
+        /* both operands same registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        tcg_gen_setcond_tl(TCG_COND_LT, mxu_gpr[XRa - 1],
+                           mxu_gpr[XRb - 1], mxu_gpr[XRc - 1]);
     }
 }
 
+/*
+ *  D16SLT
+ *    Update XRa with the signed "set less than" comparison of XRb and XRc
+ *    on per-word basis.
+ *    a.k.a. XRa[0..1] = XRb[0..1] < XRc[0..1] ? 1 : 0;
+ */
+static void gen_mxu_D16SLT(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRb == XRc)) {
+        /* both operands same registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+        tcg_gen_sextract_tl(t0, t3, 16, 16);
+        tcg_gen_sextract_tl(t1, t4, 16, 16);
+        tcg_gen_setcond_tl(TCG_COND_LT, t0, t0, t1);
+        tcg_gen_shli_tl(t2, t0, 16);
+        tcg_gen_sextract_tl(t0, t3,  0, 16);
+        tcg_gen_sextract_tl(t1, t4,  0, 16);
+        tcg_gen_setcond_tl(TCG_COND_LT, t0, t0, t1);
+        tcg_gen_or_tl(mxu_gpr[XRa - 1], t2, t0);
+    }
+}
+
+/*
+ *  D16AVG
+ *    Update XRa with the signed average of XRb and XRc
+ *    on per-word basis, rounding down.
+ *    a.k.a. XRa[0..1] = (XRb[0..1] + XRc[0..1]) >> 1;
+ *
+ *  D16AVGR
+ *    Update XRa with the signed average of XRb and XRc
+ *    on per-word basis, math rounding 4/5.
+ *    a.k.a. XRa[0..1] = (XRb[0..1] + XRc[0..1] + 1) >> 1;
+ */
+static void gen_mxu_d16avg(DisasContext *ctx, bool round45)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRb == XRc)) {
+        /* both operands same registers -> just set destination to same */
+        tcg_gen_mov_tl(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+        tcg_gen_sextract_tl(t0, t3, 16, 16);
+        tcg_gen_sextract_tl(t1, t4, 16, 16);
+        tcg_gen_add_tl(t0, t0, t1);
+        if (round45) {
+            tcg_gen_addi_tl(t0, t0, 1);
+        }
+        tcg_gen_shli_tl(t2, t0, 15);
+        tcg_gen_andi_tl(t2, t2, 0xffff0000);
+        tcg_gen_sextract_tl(t0, t3,  0, 16);
+        tcg_gen_sextract_tl(t1, t4,  0, 16);
+        tcg_gen_add_tl(t0, t0, t1);
+        if (round45) {
+            tcg_gen_addi_tl(t0, t0, 1);
+        }
+        tcg_gen_shri_tl(t0, t0, 1);
+        tcg_gen_deposit_tl(t2, t2, t0, 0, 16);
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q8AVG
+ *    Update XRa with the signed average of XRb and XRc
+ *    on per-byte basis, rounding down.
+ *    a.k.a. XRa[0..3] = (XRb[0..3] + XRc[0..3]) >> 1;
+ *
+ *  Q8AVGR
+ *    Update XRa with the signed average of XRb and XRc
+ *    on per-word basis, math rounding 4/5.
+ *    a.k.a. XRa[0..3] = (XRb[0..3] + XRc[0..3] + 1) >> 1;
+ */
+static void gen_mxu_q8avg(DisasContext *ctx, bool round45)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRb == XRc)) {
+        /* both operands same registers -> just set destination to same */
+        tcg_gen_mov_tl(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+        tcg_gen_movi_tl(t2, 0);
+
+        for (int i = 0; i < 4; i++) {
+            tcg_gen_extract_tl(t0, t3, 8 * i, 8);
+            tcg_gen_extract_tl(t1, t4, 8 * i, 8);
+            tcg_gen_add_tl(t0, t0, t1);
+            if (round45) {
+                tcg_gen_addi_tl(t0, t0, 1);
+            }
+            tcg_gen_shri_tl(t0, t0, 1);
+            tcg_gen_deposit_tl(t2, t2, t0, 8 * i, 8);
+        }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q8MOVZ
+ *    Quadruple 8-bit packed conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb[0..3] == 0) { XRa[0..3] = XRc[0..3] }
+ *
+ *  Q8MOVN
+ *    Quadruple 8-bit packed conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb[0..3] != 0) { XRa[0..3] = XRc[0..3] }
+ */
+static void gen_mxu_q8movzn(DisasContext *ctx, TCGCond cond)
+{
+    uint32_t XRc, XRb, XRa;
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGLabel *l_quarterdone = gen_new_label();
+    TCGLabel *l_halfdone = gen_new_label();
+    TCGLabel *l_quarterrest = gen_new_label();
+    TCGLabel *l_done = gen_new_label();
+
+    gen_load_mxu_gpr(t0, XRc);
+    gen_load_mxu_gpr(t1, XRb);
+    gen_load_mxu_gpr(t2, XRa);
+
+    tcg_gen_extract_tl(t3, t1, 24, 8);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_quarterdone);
+    tcg_gen_extract_tl(t3, t0, 24, 8);
+    tcg_gen_deposit_tl(t2, t2, t3, 24, 8);
+
+    gen_set_label(l_quarterdone);
+    tcg_gen_extract_tl(t3, t1, 16, 8);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_halfdone);
+    tcg_gen_extract_tl(t3, t0, 16, 8);
+    tcg_gen_deposit_tl(t2, t2, t3, 16, 8);
+
+    gen_set_label(l_halfdone);
+    tcg_gen_extract_tl(t3, t1, 8, 8);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_quarterrest);
+    tcg_gen_extract_tl(t3, t0, 8, 8);
+    tcg_gen_deposit_tl(t2, t2, t3, 8, 8);
+
+    gen_set_label(l_quarterrest);
+    tcg_gen_extract_tl(t3, t1, 0, 8);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_done);
+    tcg_gen_extract_tl(t3, t0, 0, 8);
+    tcg_gen_deposit_tl(t2, t2, t3, 0, 8);
+
+    gen_set_label(l_done);
+    gen_store_mxu_gpr(t2, XRa);
+}
+
+/*
+ *  D16MOVZ
+ *    Double 16-bit packed conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb[0..1] == 0) { XRa[0..1] = XRc[0..1] }
+ *
+ *  D16MOVN
+ *    Double 16-bit packed conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb[0..3] != 0) { XRa[0..1] = XRc[0..1] }
+ */
+static void gen_mxu_d16movzn(DisasContext *ctx, TCGCond cond)
+{
+    uint32_t XRc, XRb, XRa;
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGLabel *l_halfdone = gen_new_label();
+    TCGLabel *l_done = gen_new_label();
+
+    gen_load_mxu_gpr(t0, XRc);
+    gen_load_mxu_gpr(t1, XRb);
+    gen_load_mxu_gpr(t2, XRa);
+
+    tcg_gen_extract_tl(t3, t1, 16, 16);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_halfdone);
+    tcg_gen_extract_tl(t3, t0, 16, 16);
+    tcg_gen_deposit_tl(t2, t2, t3, 16, 16);
+
+    gen_set_label(l_halfdone);
+    tcg_gen_extract_tl(t3, t1, 0, 16);
+    tcg_gen_brcondi_tl(cond, t3, 0, l_done);
+    tcg_gen_extract_tl(t3, t0, 0, 16);
+    tcg_gen_deposit_tl(t2, t2, t3, 0, 16);
+
+    gen_set_label(l_done);
+    gen_store_mxu_gpr(t2, XRa);
+}
+
+/*
+ *  S32MOVZ
+ *    Quadruple 32-bit conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb == 0) { XRa = XRc }
+ *
+ *  S32MOVN
+ *    Single 32-bit conditional move where
+ *    XRb contains conditions, XRc what to move and
+ *    XRa is the destination.
+ *    a.k.a. if (XRb != 0) { XRa = XRc }
+ */
+static void gen_mxu_s32movzn(DisasContext *ctx, TCGCond cond)
+{
+    uint32_t XRc, XRb, XRa;
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGLabel *l_done = gen_new_label();
+
+    gen_load_mxu_gpr(t0, XRc);
+    gen_load_mxu_gpr(t1, XRb);
+
+    tcg_gen_brcondi_tl(cond, t1, 0, l_done);
+    gen_store_mxu_gpr(t0, XRa);
+    gen_set_label(l_done);
+}
+
+/*
+ *      MXU instruction category: Addition and subtraction
+ *      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *              S32CPS      D16CPS
+ *                                       Q8ADD
+ */
+
+/*
+ *  S32CPS
+ *    Update XRa if XRc < 0 by value of 0 - XRb
+ *    else XRa = XRb
+ */
+static void gen_mxu_S32CPS(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely(XRb == 0)) {
+        /* XRc make no sense 0 - 0 = 0 -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRc == 0)) {
+        /* condition always false -> just move XRb to XRa */
+        tcg_gen_mov_tl(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGLabel *l_not_less = gen_new_label();
+        TCGLabel *l_done = gen_new_label();
+
+        tcg_gen_brcondi_tl(TCG_COND_GE, mxu_gpr[XRc - 1], 0, l_not_less);
+        tcg_gen_neg_tl(t0, mxu_gpr[XRb - 1]);
+        tcg_gen_br(l_done);
+        gen_set_label(l_not_less);
+        gen_load_mxu_gpr(t0, XRb);
+        gen_set_label(l_done);
+        gen_store_mxu_gpr(t0, XRa);
+    }
+}
+
+/*
+ *  D16CPS
+ *    Update XRa[0..1] if XRc[0..1] < 0 by value of 0 - XRb[0..1]
+ *    else XRa[0..1] = XRb[0..1]
+ */
+static void gen_mxu_D16CPS(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely(XRb == 0)) {
+        /* XRc make no sense 0 - 0 = 0 -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else if (unlikely(XRc == 0)) {
+        /* condition always false -> just move XRb to XRa */
+        tcg_gen_mov_tl(mxu_gpr[XRa - 1], mxu_gpr[XRb - 1]);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGLabel *l_done_hi = gen_new_label();
+        TCGLabel *l_not_less_lo = gen_new_label();
+        TCGLabel *l_done_lo = gen_new_label();
+
+        tcg_gen_sextract_tl(t0, mxu_gpr[XRc - 1], 16, 16);
+        tcg_gen_sextract_tl(t1, mxu_gpr[XRb - 1], 16, 16);
+        tcg_gen_brcondi_tl(TCG_COND_GE, t0, 0, l_done_hi);
+        tcg_gen_subfi_tl(t1, 0, t1);
+
+        gen_set_label(l_done_hi);
+        tcg_gen_shli_i32(t1, t1, 16);
+
+        tcg_gen_sextract_tl(t0, mxu_gpr[XRc - 1],  0, 16);
+        tcg_gen_brcondi_tl(TCG_COND_GE, t0, 0, l_not_less_lo);
+        tcg_gen_sextract_tl(t0, mxu_gpr[XRb - 1],  0, 16);
+        tcg_gen_subfi_tl(t0, 0, t0);
+        tcg_gen_br(l_done_lo);
+
+        gen_set_label(l_not_less_lo);
+        tcg_gen_extract_tl(t0, mxu_gpr[XRb - 1],  0, 16);
+
+        gen_set_label(l_done_lo);
+        tcg_gen_deposit_tl(mxu_gpr[XRa - 1], t1, t0, 0, 16);
+    }
+}
+
+/*
+ *  Q8ABD XRa, XRb, XRc
+ *  Gets absolute difference for quadruple of 8-bit
+ *  packed in XRb to another one in XRc,
+ *  put the result in XRa.
+ *  a.k.a. XRa[0..3] = abs(XRb[0..3] - XRc[0..3]);
+ */
+static void gen_mxu_Q8ABD(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 3);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+        tcg_gen_movi_tl(t2, 0);
+
+        for (int i = 0; i < 4; i++) {
+            tcg_gen_extract_tl(t0, t3, 8 * i, 8);
+            tcg_gen_extract_tl(t1, t4, 8 * i, 8);
+
+            tcg_gen_sub_tl(t0, t0, t1);
+            tcg_gen_abs_tl(t0, t0);
+
+            tcg_gen_deposit_tl(t2, t2, t0, 8 * i, 8);
+        }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q8ADD XRa, XRb, XRc, ptn2
+ *  Add/subtract quadruple of 8-bit packed in XRb
+ *  to another one in XRc, put the result in XRa.
+ */
+static void gen_mxu_Q8ADD(DisasContext *ctx)
+{
+    uint32_t aptn2, pad, XRc, XRb, XRa;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    pad   = extract32(ctx->opcode, 21, 3);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_i32(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+
+        gen_load_mxu_gpr(t3, XRb);
+        gen_load_mxu_gpr(t4, XRc);
+
+        for (int i = 0; i < 4; i++) {
+            tcg_gen_andi_tl(t0, t3, 0xff);
+            tcg_gen_andi_tl(t1, t4, 0xff);
+
+            if (i < 2) {
+                if (aptn2 & 0x01) {
+                    tcg_gen_sub_tl(t0, t0, t1);
+                } else {
+                    tcg_gen_add_tl(t0, t0, t1);
+                }
+            } else {
+                if (aptn2 & 0x02) {
+                    tcg_gen_sub_tl(t0, t0, t1);
+                } else {
+                    tcg_gen_add_tl(t0, t0, t1);
+                }
+            }
+            if (i < 3) {
+                tcg_gen_shri_tl(t3, t3, 8);
+                tcg_gen_shri_tl(t4, t4, 8);
+            }
+            if (i > 0) {
+                tcg_gen_deposit_tl(t2, t2, t0, 8 * i, 8);
+            } else {
+                tcg_gen_andi_tl(t0, t0, 0xff);
+                tcg_gen_mov_tl(t2, t0);
+            }
+        }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q8ADDE XRa, XRb, XRc, XRd, aptn2
+ *    Add/subtract quadruple of 8-bit packed in XRb
+ *    to another one in XRc, with zero extending
+ *    to 16-bit and put results as packed 16-bit data
+ *    into XRa and XRd.
+ *    aptn2 manages action add or subract of pairs of data.
+ *
+ *  Q8ACCE XRa, XRb, XRc, XRd, aptn2
+ *    Add/subtract quadruple of 8-bit packed in XRb
+ *    to another one in XRc, with zero extending
+ *    to 16-bit and accumulate results as packed 16-bit data
+ *    into XRa and XRd.
+ *    aptn2 manages action add or subract of pairs of data.
+ */
+static void gen_mxu_q8adde(DisasContext *ctx, bool accumulate)
+{
+    uint32_t aptn2, XRd, XRc, XRb, XRa;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        if (XRa != 0) {
+            tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+        }
+        if (XRd != 0) {
+            tcg_gen_movi_tl(mxu_gpr[XRd - 1], 0);
+        }
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+        TCGv t5 = tcg_temp_new();
+
+        if (XRa != 0) {
+            tcg_gen_extract_tl(t0, mxu_gpr[XRb - 1], 16, 8);
+            tcg_gen_extract_tl(t1, mxu_gpr[XRc - 1], 16, 8);
+            tcg_gen_extract_tl(t2, mxu_gpr[XRb - 1], 24, 8);
+            tcg_gen_extract_tl(t3, mxu_gpr[XRc - 1], 24, 8);
+            if (aptn2 & 2) {
+                tcg_gen_sub_tl(t0, t0, t1);
+                tcg_gen_sub_tl(t2, t2, t3);
+            } else {
+                tcg_gen_add_tl(t0, t0, t1);
+                tcg_gen_add_tl(t2, t2, t3);
+            }
+            if (accumulate) {
+                gen_load_mxu_gpr(t5, XRa);
+                tcg_gen_extract_tl(t1, t5,  0, 16);
+                tcg_gen_extract_tl(t3, t5, 16, 16);
+                tcg_gen_add_tl(t0, t0, t1);
+                tcg_gen_add_tl(t2, t2, t3);
+            }
+            tcg_gen_shli_tl(t2, t2, 16);
+            tcg_gen_extract_tl(t0, t0, 0, 16);
+            tcg_gen_or_tl(t4, t2, t0);
+        }
+        if (XRd != 0) {
+            tcg_gen_extract_tl(t0, mxu_gpr[XRb - 1], 0, 8);
+            tcg_gen_extract_tl(t1, mxu_gpr[XRc - 1], 0, 8);
+            tcg_gen_extract_tl(t2, mxu_gpr[XRb - 1], 8, 8);
+            tcg_gen_extract_tl(t3, mxu_gpr[XRc - 1], 8, 8);
+            if (aptn2 & 1) {
+                tcg_gen_sub_tl(t0, t0, t1);
+                tcg_gen_sub_tl(t2, t2, t3);
+            } else {
+                tcg_gen_add_tl(t0, t0, t1);
+                tcg_gen_add_tl(t2, t2, t3);
+            }
+            if (accumulate) {
+                gen_load_mxu_gpr(t5, XRd);
+                tcg_gen_extract_tl(t1, t5,  0, 16);
+                tcg_gen_extract_tl(t3, t5, 16, 16);
+                tcg_gen_add_tl(t0, t0, t1);
+                tcg_gen_add_tl(t2, t2, t3);
+            }
+            tcg_gen_shli_tl(t2, t2, 16);
+            tcg_gen_extract_tl(t0, t0, 0, 16);
+            tcg_gen_or_tl(t5, t2, t0);
+        }
+
+        gen_store_mxu_gpr(t4, XRa);
+        gen_store_mxu_gpr(t5, XRd);
+    }
+}
+
+/*
+ *  D8SUM XRa, XRb, XRc
+ *    Double parallel add of quadruple unsigned 8-bit together
+ *    with zero extending to 16-bit data.
+ *  D8SUMC XRa, XRb, XRc
+ *    Double parallel add of quadruple unsigned 8-bit together
+ *    with zero extending to 16-bit data and adding 2 to each
+ *    parallel result.
+ */
+static void gen_mxu_d8sum(DisasContext *ctx, bool sumc)
+{
+    uint32_t pad, pad2, XRc, XRb, XRa;
+
+    pad  = extract32(ctx->opcode, 24, 2);
+    pad2 = extract32(ctx->opcode, 18, 4);
+    XRc  = extract32(ctx->opcode, 14, 4);
+    XRb  = extract32(ctx->opcode, 10, 4);
+    XRa  = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0 || pad2 != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to zero */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGv t4 = tcg_temp_new();
+        TCGv t5 = tcg_temp_new();
+
+        if (XRb != 0) {
+            tcg_gen_extract_tl(t0, mxu_gpr[XRb - 1],  0, 8);
+            tcg_gen_extract_tl(t1, mxu_gpr[XRb - 1],  8, 8);
+            tcg_gen_extract_tl(t2, mxu_gpr[XRb - 1], 16, 8);
+            tcg_gen_extract_tl(t3, mxu_gpr[XRb - 1], 24, 8);
+            tcg_gen_add_tl(t4, t0, t1);
+            tcg_gen_add_tl(t4, t4, t2);
+            tcg_gen_add_tl(t4, t4, t3);
+        } else {
+            tcg_gen_mov_tl(t4, 0);
+        }
+        if (XRc != 0) {
+            tcg_gen_extract_tl(t0, mxu_gpr[XRc - 1],  0, 8);
+            tcg_gen_extract_tl(t1, mxu_gpr[XRc - 1],  8, 8);
+            tcg_gen_extract_tl(t2, mxu_gpr[XRc - 1], 16, 8);
+            tcg_gen_extract_tl(t3, mxu_gpr[XRc - 1], 24, 8);
+            tcg_gen_add_tl(t5, t0, t1);
+            tcg_gen_add_tl(t5, t5, t2);
+            tcg_gen_add_tl(t5, t5, t3);
+        } else {
+            tcg_gen_mov_tl(t5, 0);
+        }
+
+        if (sumc) {
+            tcg_gen_addi_tl(t4, t4, 2);
+            tcg_gen_addi_tl(t5, t5, 2);
+        }
+        tcg_gen_shli_tl(t4, t4, 16);
+
+        tcg_gen_or_tl(mxu_gpr[XRa - 1], t4, t5);
+    }
+}
+
+/*
+ * Q16ADD XRa, XRb, XRc, XRd, aptn2, optn2 - Quad packed
+ * 16-bit pattern addition.
+ */
+static void gen_mxu_q16add(DisasContext *ctx)
+{
+    uint32_t aptn2, optn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    optn2 = extract32(ctx->opcode, 22, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGv t4 = tcg_temp_new();
+    TCGv t5 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t1, XRb);
+    tcg_gen_extract_tl(t0, t1,  0, 16);
+    tcg_gen_extract_tl(t1, t1, 16, 16);
+
+    gen_load_mxu_gpr(t3, XRc);
+    tcg_gen_extract_tl(t2, t3,  0, 16);
+    tcg_gen_extract_tl(t3, t3, 16, 16);
+
+    switch (optn2) {
+    case MXU_OPTN2_WW: /* XRB.H+XRC.H == lop, XRB.L+XRC.L == rop */
+        tcg_gen_mov_tl(t4, t1);
+        tcg_gen_mov_tl(t5, t0);
+        break;
+    case MXU_OPTN2_LW: /* XRB.L+XRC.H == lop, XRB.L+XRC.L == rop */
+        tcg_gen_mov_tl(t4, t0);
+        tcg_gen_mov_tl(t5, t0);
+        break;
+    case MXU_OPTN2_HW: /* XRB.H+XRC.H == lop, XRB.H+XRC.L == rop */
+        tcg_gen_mov_tl(t4, t1);
+        tcg_gen_mov_tl(t5, t1);
+        break;
+    case MXU_OPTN2_XW: /* XRB.L+XRC.H == lop, XRB.H+XRC.L == rop */
+        tcg_gen_mov_tl(t4, t0);
+        tcg_gen_mov_tl(t5, t1);
+        break;
+    }
+
+    switch (aptn2) {
+    case MXU_APTN2_AA: /* lop +, rop + */
+        tcg_gen_add_tl(t0, t4, t3);
+        tcg_gen_add_tl(t1, t5, t2);
+        tcg_gen_add_tl(t4, t4, t3);
+        tcg_gen_add_tl(t5, t5, t2);
+        break;
+    case MXU_APTN2_AS: /* lop +, rop + */
+        tcg_gen_sub_tl(t0, t4, t3);
+        tcg_gen_sub_tl(t1, t5, t2);
+        tcg_gen_add_tl(t4, t4, t3);
+        tcg_gen_add_tl(t5, t5, t2);
+        break;
+    case MXU_APTN2_SA: /* lop +, rop + */
+        tcg_gen_add_tl(t0, t4, t3);
+        tcg_gen_add_tl(t1, t5, t2);
+        tcg_gen_sub_tl(t4, t4, t3);
+        tcg_gen_sub_tl(t5, t5, t2);
+        break;
+    case MXU_APTN2_SS: /* lop +, rop + */
+        tcg_gen_sub_tl(t0, t4, t3);
+        tcg_gen_sub_tl(t1, t5, t2);
+        tcg_gen_sub_tl(t4, t4, t3);
+        tcg_gen_sub_tl(t5, t5, t2);
+        break;
+    }
+
+    tcg_gen_shli_tl(t0, t0, 16);
+    tcg_gen_extract_tl(t1, t1, 0, 16);
+    tcg_gen_shli_tl(t4, t4, 16);
+    tcg_gen_extract_tl(t5, t5, 0, 16);
+
+    tcg_gen_or_tl(mxu_gpr[XRa - 1], t4, t5);
+    tcg_gen_or_tl(mxu_gpr[XRd - 1], t0, t1);
+}
+
+/*
+ * Q16ACC XRa, XRb, XRc, XRd, aptn2 - Quad packed
+ * 16-bit addition/subtraction with accumulate.
+ */
+static void gen_mxu_q16acc(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGv s3 = tcg_temp_new();
+    TCGv s2 = tcg_temp_new();
+    TCGv s1 = tcg_temp_new();
+    TCGv s0 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t1, XRb);
+    tcg_gen_extract_tl(t0, t1,  0, 16);
+    tcg_gen_extract_tl(t1, t1, 16, 16);
+
+    gen_load_mxu_gpr(t3, XRc);
+    tcg_gen_extract_tl(t2, t3,  0, 16);
+    tcg_gen_extract_tl(t3, t3, 16, 16);
+
+    switch (aptn2) {
+    case MXU_APTN2_AA: /* lop +, rop + */
+        tcg_gen_add_tl(s3, t1, t3);
+        tcg_gen_add_tl(s2, t0, t2);
+        tcg_gen_add_tl(s1, t1, t3);
+        tcg_gen_add_tl(s0, t0, t2);
+        break;
+    case MXU_APTN2_AS: /* lop +, rop - */
+        tcg_gen_sub_tl(s3, t1, t3);
+        tcg_gen_sub_tl(s2, t0, t2);
+        tcg_gen_add_tl(s1, t1, t3);
+        tcg_gen_add_tl(s0, t0, t2);
+        break;
+    case MXU_APTN2_SA: /* lop -, rop + */
+        tcg_gen_add_tl(s3, t1, t3);
+        tcg_gen_add_tl(s2, t0, t2);
+        tcg_gen_sub_tl(s1, t1, t3);
+        tcg_gen_sub_tl(s0, t0, t2);
+        break;
+    case MXU_APTN2_SS: /* lop -, rop - */
+        tcg_gen_sub_tl(s3, t1, t3);
+        tcg_gen_sub_tl(s2, t0, t2);
+        tcg_gen_sub_tl(s1, t1, t3);
+        tcg_gen_sub_tl(s0, t0, t2);
+        break;
+    }
+
+    if (XRa != 0) {
+        tcg_gen_add_tl(t0, mxu_gpr[XRa - 1], s0);
+        tcg_gen_extract_tl(t0, t0, 0, 16);
+        tcg_gen_extract_tl(t1, mxu_gpr[XRa - 1], 16, 16);
+        tcg_gen_add_tl(t1, t1, s1);
+        tcg_gen_shli_tl(t1, t1, 16);
+        tcg_gen_or_tl(mxu_gpr[XRa - 1], t1, t0);
+    }
+
+    if (XRd != 0) {
+        tcg_gen_add_tl(t0, mxu_gpr[XRd - 1], s2);
+        tcg_gen_extract_tl(t0, t0, 0, 16);
+        tcg_gen_extract_tl(t1, mxu_gpr[XRd - 1], 16, 16);
+        tcg_gen_add_tl(t1, t1, s3);
+        tcg_gen_shli_tl(t1, t1, 16);
+        tcg_gen_or_tl(mxu_gpr[XRd - 1], t1, t0);
+    }
+}
+
+/*
+ * Q16ACCM XRa, XRb, XRc, XRd, aptn2 - Quad packed
+ * 16-bit accumulate.
+ */
+static void gen_mxu_q16accm(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t2, XRb);
+    gen_load_mxu_gpr(t3, XRc);
+
+    if (XRa != 0) {
+        TCGv a0 = tcg_temp_new();
+        TCGv a1 = tcg_temp_new();
+
+        tcg_gen_extract_tl(t0, t2,  0, 16);
+        tcg_gen_extract_tl(t1, t2, 16, 16);
+
+        gen_load_mxu_gpr(a1, XRa);
+        tcg_gen_extract_tl(a0, a1,  0, 16);
+        tcg_gen_extract_tl(a1, a1, 16, 16);
+
+        if (aptn2 & 2) {
+            tcg_gen_sub_tl(a0, a0, t0);
+            tcg_gen_sub_tl(a1, a1, t1);
+        } else {
+            tcg_gen_add_tl(a0, a0, t0);
+            tcg_gen_add_tl(a1, a1, t1);
+        }
+        tcg_gen_extract_tl(a0, a0, 0, 16);
+        tcg_gen_shli_tl(a1, a1, 16);
+        tcg_gen_or_tl(mxu_gpr[XRa - 1], a1, a0);
+    }
+
+    if (XRd != 0) {
+        TCGv a0 = tcg_temp_new();
+        TCGv a1 = tcg_temp_new();
+
+        tcg_gen_extract_tl(t0, t3,  0, 16);
+        tcg_gen_extract_tl(t1, t3, 16, 16);
+
+        gen_load_mxu_gpr(a1, XRd);
+        tcg_gen_extract_tl(a0, a1,  0, 16);
+        tcg_gen_extract_tl(a1, a1, 16, 16);
+
+        if (aptn2 & 1) {
+            tcg_gen_sub_tl(a0, a0, t0);
+            tcg_gen_sub_tl(a1, a1, t1);
+        } else {
+            tcg_gen_add_tl(a0, a0, t0);
+            tcg_gen_add_tl(a1, a1, t1);
+        }
+        tcg_gen_extract_tl(a0, a0, 0, 16);
+        tcg_gen_shli_tl(a1, a1, 16);
+        tcg_gen_or_tl(mxu_gpr[XRd - 1], a1, a0);
+    }
+}
+
+
+/*
+ * D16ASUM XRa, XRb, XRc, XRd, aptn2 - Double packed
+ * 16-bit sign extended addition and accumulate.
+ */
+static void gen_mxu_d16asum(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t2, XRb);
+    gen_load_mxu_gpr(t3, XRc);
+
+    if (XRa != 0) {
+        tcg_gen_sextract_tl(t0, t2,  0, 16);
+        tcg_gen_sextract_tl(t1, t2, 16, 16);
+        tcg_gen_add_tl(t0, t0, t1);
+        if (aptn2 & 2) {
+            tcg_gen_sub_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+        } else {
+            tcg_gen_add_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+        }
+    }
+
+    if (XRd != 0) {
+        tcg_gen_sextract_tl(t0, t3,  0, 16);
+        tcg_gen_sextract_tl(t1, t3, 16, 16);
+        tcg_gen_add_tl(t0, t0, t1);
+        if (aptn2 & 1) {
+            tcg_gen_sub_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t0);
+        } else {
+            tcg_gen_add_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t0);
+        }
+    }
+}
+
+/*
+ * D32ADD XRa, XRb, XRc, XRd, aptn2 - Double
+ * 32 bit pattern addition/subtraction, set carry.
+ *
+ * D32ADDC XRa, XRb, XRc, XRd, aptn2 - Double
+ * 32 bit pattern addition/subtraction with carry.
+ */
+static void gen_mxu_d32add(DisasContext *ctx)
+{
+    uint32_t aptn2, addc, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    addc  = extract32(ctx->opcode, 22, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv cr = tcg_temp_new();
+
+    if (unlikely(addc > 1)) {
+        /* opcode incorrect -> do nothing */
+    } else if (addc == 1) {
+        if (unlikely(XRa == 0 && XRd == 0)) {
+            /* destinations are zero register -> do nothing */
+        } else {
+            /* FIXME ??? What if XRa == XRd ??? */
+            /* aptn2 is unused here */
+            gen_load_mxu_gpr(t0, XRb);
+            gen_load_mxu_gpr(t1, XRc);
+            gen_load_mxu_cr(cr);
+            if (XRa != 0) {
+                tcg_gen_extract_tl(t2, cr, 31, 1);
+                tcg_gen_add_tl(t0, t0, t2);
+                tcg_gen_add_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+            }
+            if (XRd != 0) {
+                tcg_gen_extract_tl(t2, cr, 30, 1);
+                tcg_gen_add_tl(t1, t1, t2);
+                tcg_gen_add_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t1);
+            }
+        }
+    } else if (unlikely(XRa == 0 && XRd == 0)) {
+        /* destinations are zero register -> do nothing */
+    } else {
+        /* common case */
+        /* FIXME ??? What if XRa == XRd ??? */
+        TCGv carry = tcg_temp_new();
+
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        gen_load_mxu_cr(cr);
+        if (XRa != 0) {
+            if (aptn2 & 2) {
+                tcg_gen_sub_i32(t2, t0, t1);
+                tcg_gen_setcond_tl(TCG_COND_GTU, carry, t0, t1);
+            } else {
+                tcg_gen_add_i32(t2, t0, t1);
+                tcg_gen_setcond_tl(TCG_COND_GTU, carry, t0, t2);
+            }
+            tcg_gen_andi_tl(cr, cr, 0x7fffffff);
+            tcg_gen_shli_tl(carry, carry, 31);
+            tcg_gen_or_tl(cr, cr, carry);
+            gen_store_mxu_gpr(t2, XRa);
+        }
+        if (XRd != 0) {
+            if (aptn2 & 1) {
+                tcg_gen_sub_i32(t2, t0, t1);
+                tcg_gen_setcond_tl(TCG_COND_GTU, carry, t0, t1);
+            } else {
+                tcg_gen_add_i32(t2, t0, t1);
+                tcg_gen_setcond_tl(TCG_COND_GTU, carry, t0, t2);
+            }
+            tcg_gen_andi_tl(cr, cr, 0xbfffffff);
+            tcg_gen_shli_tl(carry, carry, 30);
+            tcg_gen_or_tl(cr, cr, carry);
+            gen_store_mxu_gpr(t2, XRd);
+        }
+        gen_store_mxu_cr(cr);
+    }
+}
+
+/*
+ * D32ACC XRa, XRb, XRc, XRd, aptn2 - Double
+ * 32 bit pattern addition/subtraction and accumulate.
+ */
+static void gen_mxu_d32acc(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+
+    if (unlikely(XRa == 0 && XRd == 0)) {
+        /* destinations are zero register -> do nothing */
+    } else {
+        /* common case */
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        if (XRa != 0) {
+            if (aptn2 & 2) {
+                tcg_gen_sub_tl(t2, t0, t1);
+            } else {
+                tcg_gen_add_tl(t2, t0, t1);
+            }
+            tcg_gen_add_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t2);
+        }
+        if (XRd != 0) {
+            if (aptn2 & 1) {
+                tcg_gen_sub_tl(t2, t0, t1);
+            } else {
+                tcg_gen_add_tl(t2, t0, t1);
+            }
+            tcg_gen_add_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t2);
+        }
+    }
+}
+
+/*
+ * D32ACCM XRa, XRb, XRc, XRd, aptn2 - Double
+ * 32 bit pattern addition/subtraction and accumulate.
+ */
+static void gen_mxu_d32accm(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+
+    if (unlikely(XRa == 0 && XRd == 0)) {
+        /* destinations are zero register -> do nothing */
+    } else {
+        /* common case */
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        if (XRa != 0) {
+            tcg_gen_add_tl(t2, t0, t1);
+            if (aptn2 & 2) {
+                tcg_gen_sub_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t2);
+            } else {
+                tcg_gen_add_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t2);
+            }
+        }
+        if (XRd != 0) {
+            tcg_gen_sub_tl(t2, t0, t1);
+            if (aptn2 & 1) {
+                tcg_gen_sub_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t2);
+            } else {
+                tcg_gen_add_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t2);
+            }
+        }
+    }
+}
+
+/*
+ * D32ASUM XRa, XRb, XRc, XRd, aptn2 - Double
+ * 32 bit pattern addition/subtraction.
+ */
+static void gen_mxu_d32asum(DisasContext *ctx)
+{
+    uint32_t aptn2, XRc, XRb, XRa, XRd;
+
+    aptn2 = extract32(ctx->opcode, 24, 2);
+    XRd   = extract32(ctx->opcode, 18, 4);
+    XRc   = extract32(ctx->opcode, 14, 4);
+    XRb   = extract32(ctx->opcode, 10, 4);
+    XRa   = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+
+    if (unlikely(XRa == 0 && XRd == 0)) {
+        /* destinations are zero register -> do nothing */
+    } else {
+        /* common case */
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        if (XRa != 0) {
+            if (aptn2 & 2) {
+                tcg_gen_sub_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+            } else {
+                tcg_gen_add_tl(mxu_gpr[XRa - 1], mxu_gpr[XRa - 1], t0);
+            }
+        }
+        if (XRd != 0) {
+            if (aptn2 & 1) {
+                tcg_gen_sub_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t1);
+            } else {
+                tcg_gen_add_tl(mxu_gpr[XRd - 1], mxu_gpr[XRd - 1], t1);
+            }
+        }
+    }
+}
+
+/*
+ *                 MXU instruction category: Miscellaneous
+ *                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *               S32EXTR      S32LUI
+ *               S32EXTRV
+ *                            Q16SAT
+ *                            Q16SCOP
+ */
+
+/*
+ *  S32EXTR XRa, XRd, rs, bits5
+ *    Extract bits5 bits from 64-bit pair {XRa:XRd}
+ *    starting from rs[4:0] offset and put to the XRa.
+ */
+static void gen_mxu_s32extr(DisasContext *ctx)
+{
+    TCGv t0, t1, t2, t3;
+    uint32_t XRa, XRd, rs, bits5;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+    t2 = tcg_temp_new();
+    t3 = tcg_temp_new();
+
+    XRa   = extract32(ctx->opcode,  6, 4);
+    XRd   = extract32(ctx->opcode, 10, 4);
+    bits5 = extract32(ctx->opcode, 16, 5);
+    rs    = extract32(ctx->opcode, 21, 5);
+
+    /* {tmp} = {XRa:XRd} >> (64 - rt - bits5); */
+    /* {XRa} = extract({tmp}, 0, bits5); */
+    if (bits5 > 0) {
+        TCGLabel *l_xra_only = gen_new_label();
+        TCGLabel *l_done = gen_new_label();
+
+        gen_load_mxu_gpr(t0, XRd);
+        gen_load_mxu_gpr(t1, XRa);
+        gen_load_gpr(t2, rs);
+        tcg_gen_andi_tl(t2, t2, 0x1f);
+        tcg_gen_subfi_tl(t2, 32, t2);
+        tcg_gen_brcondi_tl(TCG_COND_GE, t2, bits5, l_xra_only);
+        tcg_gen_subfi_tl(t2, bits5, t2);
+        tcg_gen_subfi_tl(t3, 32, t2);
+        tcg_gen_shr_tl(t0, t0, t3);
+        tcg_gen_shl_tl(t1, t1, t2);
+        tcg_gen_or_tl(t0, t0, t1);
+        tcg_gen_br(l_done);
+        gen_set_label(l_xra_only);
+        tcg_gen_subi_tl(t2, t2, bits5);
+        tcg_gen_shr_tl(t0, t1, t2);
+        gen_set_label(l_done);
+        tcg_gen_extract_tl(t0, t0, 0, bits5);
+    } else {
+        /* unspecified behavior but matches tests on real hardware*/
+        tcg_gen_movi_tl(t0, 0);
+    }
+    gen_store_mxu_gpr(t0, XRa);
+}
+
+/*
+ *  S32EXTRV XRa, XRd, rs, rt
+ *    Extract rt[4:0] bits from 64-bit pair {XRa:XRd}
+ *    starting from rs[4:0] offset and put to the XRa.
+ */
+static void gen_mxu_s32extrv(DisasContext *ctx)
+{
+    TCGv t0, t1, t2, t3, t4;
+    uint32_t XRa, XRd, rs, rt;
+
+    t0 = tcg_temp_new();
+    t1 = tcg_temp_new();
+    t2 = tcg_temp_new();
+    t3 = tcg_temp_new();
+    t4 = tcg_temp_new();
+    TCGLabel *l_xra_only = gen_new_label();
+    TCGLabel *l_done = gen_new_label();
+    TCGLabel *l_zero = gen_new_label();
+    TCGLabel *l_extract = gen_new_label();
+
+    XRa = extract32(ctx->opcode,  6, 4);
+    XRd = extract32(ctx->opcode, 10, 4);
+    rt  = extract32(ctx->opcode, 16, 5);
+    rs  = extract32(ctx->opcode, 21, 5);
+
+    /* {tmp} = {XRa:XRd} >> (64 - rs - rt) */
+    gen_load_mxu_gpr(t0, XRd);
+    gen_load_mxu_gpr(t1, XRa);
+    gen_load_gpr(t2, rs);
+    gen_load_gpr(t4, rt);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, t4, 0, l_zero);
+    tcg_gen_andi_tl(t2, t2, 0x1f);
+    tcg_gen_subfi_tl(t2, 32, t2);
+    tcg_gen_brcond_tl(TCG_COND_GE, t2, t4, l_xra_only);
+    tcg_gen_sub_tl(t2, t4, t2);
+    tcg_gen_subfi_tl(t3, 32, t2);
+    tcg_gen_shr_tl(t0, t0, t3);
+    tcg_gen_shl_tl(t1, t1, t2);
+    tcg_gen_or_tl(t0, t0, t1);
+    tcg_gen_br(l_extract);
+
+    gen_set_label(l_xra_only);
+    tcg_gen_sub_tl(t2, t2, t4);
+    tcg_gen_shr_tl(t0, t1, t2);
+    tcg_gen_br(l_extract);
+
+    /* unspecified behavior but matches tests on real hardware*/
+    gen_set_label(l_zero);
+    tcg_gen_movi_tl(t0, 0);
+    tcg_gen_br(l_done);
+
+    /* {XRa} = extract({tmp}, 0, rt) */
+    gen_set_label(l_extract);
+    tcg_gen_subfi_tl(t4, 32, t4);
+    tcg_gen_shl_tl(t0, t0, t4);
+    tcg_gen_shr_tl(t0, t0, t4);
+
+    gen_set_label(l_done);
+    gen_store_mxu_gpr(t0, XRa);
+}
+
+/*
+ *  S32LUI XRa, S8, optn3
+ *    Permutate the immediate S8 value to form a word
+ *    to update XRa.
+ */
+static void gen_mxu_s32lui(DisasContext *ctx)
+{
+    uint32_t XRa, s8, optn3, pad;
+
+    XRa   = extract32(ctx->opcode,  6, 4);
+    s8    = extract32(ctx->opcode, 10, 8);
+    pad   = extract32(ctx->opcode, 21, 2);
+    optn3 = extract32(ctx->opcode, 23, 3);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else {
+        uint32_t s16;
+        TCGv t0 = tcg_temp_new();
+
+        switch (optn3) {
+        case 0:
+            tcg_gen_movi_tl(t0, s8);
+            break;
+        case 1:
+            tcg_gen_movi_tl(t0, s8 << 8);
+            break;
+        case 2:
+            tcg_gen_movi_tl(t0, s8 << 16);
+            break;
+        case 3:
+            tcg_gen_movi_tl(t0, s8 << 24);
+            break;
+        case 4:
+            tcg_gen_movi_tl(t0, (s8 << 16) | s8);
+            break;
+        case 5:
+            tcg_gen_movi_tl(t0, (s8 << 24) | (s8 << 8));
+            break;
+        case 6:
+            s16 = (uint16_t)(int16_t)(int8_t)s8;
+            tcg_gen_movi_tl(t0, (s16 << 16) | s16);
+            break;
+        case 7:
+            tcg_gen_movi_tl(t0, (s8 << 24) | (s8 << 16) | (s8 << 8) | s8);
+            break;
+        }
+        gen_store_mxu_gpr(t0, XRa);
+    }
+}
+
+/*
+ *  Q16SAT XRa, XRb, XRc
+ *  Packs four 16-bit signed integers in XRb and XRc to
+ *  four saturated unsigned 8-bit into XRa.
+ *
+ */
+static void gen_mxu_Q16SAT(DisasContext *ctx)
+{
+    uint32_t pad, XRc, XRb, XRa;
+
+    pad = extract32(ctx->opcode, 21, 3);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(pad != 0)) {
+        /* opcode padding incorrect -> do nothing */
+    } else if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+
+        tcg_gen_movi_tl(t2, 0);
+        if (XRb != 0) {
+            TCGLabel *l_less_hi = gen_new_label();
+            TCGLabel *l_less_lo = gen_new_label();
+            TCGLabel *l_lo = gen_new_label();
+            TCGLabel *l_greater_hi = gen_new_label();
+            TCGLabel *l_greater_lo = gen_new_label();
+            TCGLabel *l_done = gen_new_label();
+
+            tcg_gen_sari_tl(t0, mxu_gpr[XRb - 1], 16);
+            tcg_gen_brcondi_tl(TCG_COND_LT, t0, 0, l_less_hi);
+            tcg_gen_brcondi_tl(TCG_COND_GT, t0, 255, l_greater_hi);
+            tcg_gen_br(l_lo);
+            gen_set_label(l_less_hi);
+            tcg_gen_movi_tl(t0, 0);
+            tcg_gen_br(l_lo);
+            gen_set_label(l_greater_hi);
+            tcg_gen_movi_tl(t0, 255);
+
+            gen_set_label(l_lo);
+            tcg_gen_shli_tl(t1, mxu_gpr[XRb - 1], 16);
+            tcg_gen_sari_tl(t1, t1, 16);
+            tcg_gen_brcondi_tl(TCG_COND_LT, t1, 0, l_less_lo);
+            tcg_gen_brcondi_tl(TCG_COND_GT, t1, 255, l_greater_lo);
+            tcg_gen_br(l_done);
+            gen_set_label(l_less_lo);
+            tcg_gen_movi_tl(t1, 0);
+            tcg_gen_br(l_done);
+            gen_set_label(l_greater_lo);
+            tcg_gen_movi_tl(t1, 255);
+
+            gen_set_label(l_done);
+            tcg_gen_shli_tl(t2, t0, 24);
+            tcg_gen_shli_tl(t1, t1, 16);
+            tcg_gen_or_tl(t2, t2, t1);
+        }
+
+        if (XRc != 0) {
+            TCGLabel *l_less_hi = gen_new_label();
+            TCGLabel *l_less_lo = gen_new_label();
+            TCGLabel *l_lo = gen_new_label();
+            TCGLabel *l_greater_hi = gen_new_label();
+            TCGLabel *l_greater_lo = gen_new_label();
+            TCGLabel *l_done = gen_new_label();
+
+            tcg_gen_sari_tl(t0, mxu_gpr[XRc - 1], 16);
+            tcg_gen_brcondi_tl(TCG_COND_LT, t0, 0, l_less_hi);
+            tcg_gen_brcondi_tl(TCG_COND_GT, t0, 255, l_greater_hi);
+            tcg_gen_br(l_lo);
+            gen_set_label(l_less_hi);
+            tcg_gen_movi_tl(t0, 0);
+            tcg_gen_br(l_lo);
+            gen_set_label(l_greater_hi);
+            tcg_gen_movi_tl(t0, 255);
+
+            gen_set_label(l_lo);
+            tcg_gen_shli_tl(t1, mxu_gpr[XRc - 1], 16);
+            tcg_gen_sari_tl(t1, t1, 16);
+            tcg_gen_brcondi_tl(TCG_COND_LT, t1, 0, l_less_lo);
+            tcg_gen_brcondi_tl(TCG_COND_GT, t1, 255, l_greater_lo);
+            tcg_gen_br(l_done);
+            gen_set_label(l_less_lo);
+            tcg_gen_movi_tl(t1, 0);
+            tcg_gen_br(l_done);
+            gen_set_label(l_greater_lo);
+            tcg_gen_movi_tl(t1, 255);
+
+            gen_set_label(l_done);
+            tcg_gen_shli_tl(t0, t0, 8);
+            tcg_gen_or_tl(t2, t2, t0);
+            tcg_gen_or_tl(t2, t2, t1);
+        }
+        gen_store_mxu_gpr(t2, XRa);
+    }
+}
+
+/*
+ *  Q16SCOP XRa, XRd, XRb, XRc
+ *    Determine sign of quad packed 16-bit signed values
+ *    in XRb and XRc put result in XRa and XRd respectively.
+ */
+static void gen_mxu_q16scop(DisasContext *ctx)
+{
+    uint32_t XRd, XRc, XRb, XRa;
+
+    XRd  = extract32(ctx->opcode, 18, 4);
+    XRc  = extract32(ctx->opcode, 14, 4);
+    XRb  = extract32(ctx->opcode, 10, 4);
+    XRa  = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGv t4 = tcg_temp_new();
+
+    TCGLabel *l_b_hi_lt = gen_new_label();
+    TCGLabel *l_b_hi_gt = gen_new_label();
+    TCGLabel *l_b_lo = gen_new_label();
+    TCGLabel *l_b_lo_lt = gen_new_label();
+    TCGLabel *l_c_hi = gen_new_label();
+    TCGLabel *l_c_hi_lt = gen_new_label();
+    TCGLabel *l_c_hi_gt = gen_new_label();
+    TCGLabel *l_c_lo = gen_new_label();
+    TCGLabel *l_c_lo_lt = gen_new_label();
+    TCGLabel *l_done = gen_new_label();
+
+    gen_load_mxu_gpr(t0, XRb);
+    gen_load_mxu_gpr(t1, XRc);
+
+    tcg_gen_sextract_tl(t2, t0, 16, 16);
+    tcg_gen_brcondi_tl(TCG_COND_LT, t2, 0, l_b_hi_lt);
+    tcg_gen_brcondi_tl(TCG_COND_GT, t2, 0, l_b_hi_gt);
+    tcg_gen_movi_tl(t3, 0);
+    tcg_gen_br(l_b_lo);
+    gen_set_label(l_b_hi_lt);
+    tcg_gen_movi_tl(t3, 0xffff0000);
+    tcg_gen_br(l_b_lo);
+    gen_set_label(l_b_hi_gt);
+    tcg_gen_movi_tl(t3, 0x00010000);
+
+    gen_set_label(l_b_lo);
+    tcg_gen_sextract_tl(t2, t0, 0, 16);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, l_c_hi);
+    tcg_gen_brcondi_tl(TCG_COND_LT, t2, 0, l_b_lo_lt);
+    tcg_gen_ori_tl(t3, t3, 0x00000001);
+    tcg_gen_br(l_c_hi);
+    gen_set_label(l_b_lo_lt);
+    tcg_gen_ori_tl(t3, t3, 0x0000ffff);
+    tcg_gen_br(l_c_hi);
+
+    gen_set_label(l_c_hi);
+    tcg_gen_sextract_tl(t2, t1, 16, 16);
+    tcg_gen_brcondi_tl(TCG_COND_LT, t2, 0, l_c_hi_lt);
+    tcg_gen_brcondi_tl(TCG_COND_GT, t2, 0, l_c_hi_gt);
+    tcg_gen_movi_tl(t4, 0);
+    tcg_gen_br(l_c_lo);
+    gen_set_label(l_c_hi_lt);
+    tcg_gen_movi_tl(t4, 0xffff0000);
+    tcg_gen_br(l_c_lo);
+    gen_set_label(l_c_hi_gt);
+    tcg_gen_movi_tl(t4, 0x00010000);
+
+    gen_set_label(l_c_lo);
+    tcg_gen_sextract_tl(t2, t1, 0, 16);
+    tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, l_done);
+    tcg_gen_brcondi_tl(TCG_COND_LT, t2, 0, l_c_lo_lt);
+    tcg_gen_ori_tl(t4, t4, 0x00000001);
+    tcg_gen_br(l_done);
+    gen_set_label(l_c_lo_lt);
+    tcg_gen_ori_tl(t4, t4, 0x0000ffff);
+
+    gen_set_label(l_done);
+    gen_store_mxu_gpr(t3, XRa);
+    gen_store_mxu_gpr(t4, XRd);
+}
+
+/*
+ *  S32SFL XRa, XRd, XRb, XRc
+ *    Shuffle bytes according to one of four patterns.
+ */
+static void gen_mxu_s32sfl(DisasContext *ctx)
+{
+    uint32_t XRd, XRc, XRb, XRa, ptn2;
+
+    XRd  = extract32(ctx->opcode, 18, 4);
+    XRc  = extract32(ctx->opcode, 14, 4);
+    XRb  = extract32(ctx->opcode, 10, 4);
+    XRa  = extract32(ctx->opcode,  6, 4);
+    ptn2 = extract32(ctx->opcode, 24, 2);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t0, XRb);
+    gen_load_mxu_gpr(t1, XRc);
+
+    switch (ptn2) {
+    case 0:
+        tcg_gen_andi_tl(t2, t0, 0xff000000);
+        tcg_gen_andi_tl(t3, t1, 0x000000ff);
+        tcg_gen_deposit_tl(t3, t3, t0,  8, 8);
+        tcg_gen_shri_tl(t0, t0,  8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t3, t3, t0, 24, 8);
+        tcg_gen_deposit_tl(t3, t3, t1, 16, 8);
+        tcg_gen_shri_tl(t0, t0,  8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t2, t2, t0,  8, 8);
+        tcg_gen_deposit_tl(t2, t2, t1,  0, 8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t2, t2, t1, 16, 8);
+        break;
+    case 1:
+        tcg_gen_andi_tl(t2, t0, 0xff000000);
+        tcg_gen_andi_tl(t3, t1, 0x000000ff);
+        tcg_gen_deposit_tl(t3, t3, t0, 16, 8);
+        tcg_gen_shri_tl(t0, t0,  8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t2, t2, t0, 16, 8);
+        tcg_gen_deposit_tl(t2, t2, t1,  0, 8);
+        tcg_gen_shri_tl(t0, t0,  8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t3, t3, t0, 24, 8);
+        tcg_gen_deposit_tl(t3, t3, t1,  8, 8);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t2, t2, t1,  8, 8);
+        break;
+    case 2:
+        tcg_gen_andi_tl(t2, t0, 0xff00ff00);
+        tcg_gen_andi_tl(t3, t1, 0x00ff00ff);
+        tcg_gen_deposit_tl(t3, t3, t0,  8, 8);
+        tcg_gen_shri_tl(t0, t0, 16);
+        tcg_gen_shri_tl(t1, t1,  8);
+        tcg_gen_deposit_tl(t2, t2, t1,  0, 8);
+        tcg_gen_deposit_tl(t3, t3, t0, 24, 8);
+        tcg_gen_shri_tl(t1, t1, 16);
+        tcg_gen_deposit_tl(t2, t2, t1, 16, 8);
+        break;
+    case 3:
+        tcg_gen_andi_tl(t2, t0, 0xffff0000);
+        tcg_gen_andi_tl(t3, t1, 0x0000ffff);
+        tcg_gen_shri_tl(t1, t1, 16);
+        tcg_gen_deposit_tl(t2, t2, t1,  0, 16);
+        tcg_gen_deposit_tl(t3, t3, t0, 16, 16);
+        break;
+    }
+
+    gen_store_mxu_gpr(t2, XRa);
+    gen_store_mxu_gpr(t3, XRd);
+}
+
+/*
+ *  Q8SAD XRa, XRd, XRb, XRc
+ *    Typical SAD opration for motion estimation.
+ */
+static void gen_mxu_q8sad(DisasContext *ctx)
+{
+    uint32_t XRd, XRc, XRb, XRa;
+
+    XRd = extract32(ctx->opcode, 18, 4);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    TCGv t0 = tcg_temp_new();
+    TCGv t1 = tcg_temp_new();
+    TCGv t2 = tcg_temp_new();
+    TCGv t3 = tcg_temp_new();
+    TCGv t4 = tcg_temp_new();
+    TCGv t5 = tcg_temp_new();
+
+    gen_load_mxu_gpr(t2, XRb);
+    gen_load_mxu_gpr(t3, XRc);
+    gen_load_mxu_gpr(t5, XRd);
+    tcg_gen_movi_tl(t4, 0);
+
+    for (int i = 0; i < 4; i++) {
+        tcg_gen_andi_tl(t0, t2, 0xff);
+        tcg_gen_andi_tl(t1, t3, 0xff);
+        tcg_gen_sub_tl(t0, t0, t1);
+        tcg_gen_abs_tl(t0, t0);
+        tcg_gen_add_tl(t4, t4, t0);
+        if (i < 3) {
+            tcg_gen_shri_tl(t2, t2, 8);
+            tcg_gen_shri_tl(t3, t3, 8);
+        }
+    }
+    tcg_gen_add_tl(t5, t5, t4);
+    gen_store_mxu_gpr(t4, XRa);
+    gen_store_mxu_gpr(t5, XRd);
+}
 
 /*
  *                 MXU instruction category: align
@@ -1408,6 +4258,129 @@ static void gen_mxu_S32ALNI(DisasContext *ctx)
     }
 }
 
+/*
+ *  S32ALN XRc, XRb, XRa, rs
+ *    Arrange bytes from XRb and XRc according to one of five sets of
+ *    rules determined by rs[2:0], and place the result in XRa.
+ */
+static void gen_mxu_S32ALN(DisasContext *ctx)
+{
+    uint32_t rs, XRc, XRb, XRa;
+
+    rs  = extract32(ctx->opcode, 21, 5);
+    XRc = extract32(ctx->opcode, 14, 4);
+    XRb = extract32(ctx->opcode, 10, 4);
+    XRa = extract32(ctx->opcode,  6, 4);
+
+    if (unlikely(XRa == 0)) {
+        /* destination is zero register -> do nothing */
+    } else if (unlikely((XRb == 0) && (XRc == 0))) {
+        /* both operands zero registers -> just set destination to all 0s */
+        tcg_gen_movi_tl(mxu_gpr[XRa - 1], 0);
+    } else {
+        /* the most general case */
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv t2 = tcg_temp_new();
+        TCGv t3 = tcg_temp_new();
+        TCGLabel *l_exit = gen_new_label();
+        TCGLabel *l_b_only = gen_new_label();
+        TCGLabel *l_c_only = gen_new_label();
+
+        gen_load_mxu_gpr(t0, XRb);
+        gen_load_mxu_gpr(t1, XRc);
+        gen_load_gpr(t2, rs);
+        tcg_gen_andi_tl(t2, t2, 0x07);
+
+        /* do nothing for undefined cases */
+        tcg_gen_brcondi_tl(TCG_COND_GE, t2, 5, l_exit);
+
+        tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 0, l_b_only);
+        tcg_gen_brcondi_tl(TCG_COND_EQ, t2, 4, l_c_only);
+
+        tcg_gen_shli_tl(t2, t2, 3);
+        tcg_gen_subfi_tl(t3, 32, t2);
+
+        tcg_gen_shl_tl(t0, t0, t2);
+        tcg_gen_shr_tl(t1, t1, t3);
+        tcg_gen_or_tl(mxu_gpr[XRa - 1], t0, t1);
+        tcg_gen_br(l_exit);
+
+        gen_set_label(l_b_only);
+        gen_store_mxu_gpr(t0, XRa);
+        tcg_gen_br(l_exit);
+
+        gen_set_label(l_c_only);
+        gen_store_mxu_gpr(t1, XRa);
+
+        gen_set_label(l_exit);
+    }
+}
+
+/*
+ *  S32MADD XRa, XRd, rb, rc
+ *    32 to 64 bit signed multiply with subsequent add
+ *    result stored in {XRa, XRd} pair, stain HI/LO.
+ *  S32MADDU XRa, XRd, rb, rc
+ *    32 to 64 bit unsigned multiply with subsequent add
+ *    result stored in {XRa, XRd} pair, stain HI/LO.
+ *  S32MSUB XRa, XRd, rb, rc
+ *    32 to 64 bit signed multiply with subsequent subtract
+ *    result stored in {XRa, XRd} pair, stain HI/LO.
+ *  S32MSUBU XRa, XRd, rb, rc
+ *    32 to 64 bit unsigned multiply with subsequent subtract
+ *    result stored in {XRa, XRd} pair, stain HI/LO.
+ */
+static void gen_mxu_s32madd_sub(DisasContext *ctx, bool sub, bool uns)
+{
+    uint32_t XRa, XRd, Rb, Rc;
+
+    XRa  = extract32(ctx->opcode,  6, 4);
+    XRd  = extract32(ctx->opcode, 10, 4);
+    Rb   = extract32(ctx->opcode, 16, 5);
+    Rc   = extract32(ctx->opcode, 21, 5);
+
+    if (unlikely(Rb == 0 || Rc == 0)) {
+        /* do nothing because x + 0 * y => x */
+    } else if (unlikely(XRa == 0 && XRd == 0)) {
+        /* do nothing because result just dropped */
+    } else {
+        TCGv t0 = tcg_temp_new();
+        TCGv t1 = tcg_temp_new();
+        TCGv_i64 t2 = tcg_temp_new_i64();
+        TCGv_i64 t3 = tcg_temp_new_i64();
+
+        gen_load_gpr(t0, Rb);
+        gen_load_gpr(t1, Rc);
+
+        if (uns) {
+            tcg_gen_extu_tl_i64(t2, t0);
+            tcg_gen_extu_tl_i64(t3, t1);
+        } else {
+            tcg_gen_ext_tl_i64(t2, t0);
+            tcg_gen_ext_tl_i64(t3, t1);
+        }
+        tcg_gen_mul_i64(t2, t2, t3);
+
+        gen_load_mxu_gpr(t0, XRa);
+        gen_load_mxu_gpr(t1, XRd);
+
+        tcg_gen_concat_tl_i64(t3, t1, t0);
+        if (sub) {
+            tcg_gen_sub_i64(t3, t3, t2);
+        } else {
+            tcg_gen_add_i64(t3, t3, t2);
+        }
+        gen_move_low32(t1, t3);
+        gen_move_high32(t0, t3);
+
+        tcg_gen_mov_tl(cpu_HI[0], t0);
+        tcg_gen_mov_tl(cpu_LO[0], t1);
+
+        gen_store_mxu_gpr(t1, XRd);
+        gen_store_mxu_gpr(t0, XRa);
+    }
+}
 
 /*
  * Decoding engine for MXU
@@ -1431,6 +4404,116 @@ static void decode_opc_mxu__pool00(DisasContext *ctx)
     case OPC_MXU_Q8MIN:
         gen_mxu_Q8MAX_Q8MIN(ctx);
         break;
+    case OPC_MXU_Q8SLT:
+        gen_mxu_q8slt(ctx, false);
+        break;
+    case OPC_MXU_Q8SLTU:
+        gen_mxu_q8slt(ctx, true);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static bool decode_opc_mxu_s32madd_sub(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 0, 6);
+    uint32_t pad  = extract32(ctx->opcode, 14, 2);
+
+    if (pad != 2) {
+        /* MIPS32R1 MADD/MADDU/MSUB/MSUBU are on pad == 0 */
+        return false;
+    }
+
+    switch (opcode) {
+    case OPC_MXU_S32MADD:
+        gen_mxu_s32madd_sub(ctx, false, false);
+        break;
+    case OPC_MXU_S32MADDU:
+        gen_mxu_s32madd_sub(ctx, false, true);
+        break;
+    case OPC_MXU_S32MSUB:
+        gen_mxu_s32madd_sub(ctx, true, false);
+        break;
+    case OPC_MXU_S32MSUBU:
+        gen_mxu_s32madd_sub(ctx, true, true);
+        break;
+    default:
+        return false;
+    }
+    return true;
+}
+
+static void decode_opc_mxu__pool01(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 18, 3);
+
+    switch (opcode) {
+    case OPC_MXU_S32SLT:
+        gen_mxu_S32SLT(ctx);
+        break;
+    case OPC_MXU_D16SLT:
+        gen_mxu_D16SLT(ctx);
+        break;
+    case OPC_MXU_D16AVG:
+        gen_mxu_d16avg(ctx, false);
+        break;
+    case OPC_MXU_D16AVGR:
+        gen_mxu_d16avg(ctx, true);
+        break;
+    case OPC_MXU_Q8AVG:
+        gen_mxu_q8avg(ctx, false);
+        break;
+    case OPC_MXU_Q8AVGR:
+        gen_mxu_q8avg(ctx, true);
+        break;
+    case OPC_MXU_Q8ADD:
+        gen_mxu_Q8ADD(ctx);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool02(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 18, 3);
+
+    switch (opcode) {
+    case OPC_MXU_S32CPS:
+        gen_mxu_S32CPS(ctx);
+        break;
+    case OPC_MXU_D16CPS:
+        gen_mxu_D16CPS(ctx);
+        break;
+    case OPC_MXU_Q8ABD:
+        gen_mxu_Q8ABD(ctx);
+        break;
+    case OPC_MXU_Q16SAT:
+        gen_mxu_Q16SAT(ctx);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool03(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 24, 2);
+
+    switch (opcode) {
+    case OPC_MXU_D16MULF:
+        gen_mxu_d16mul(ctx, true, true);
+        break;
+    case OPC_MXU_D16MULE:
+        gen_mxu_d16mul(ctx, true, false);
+        break;
     default:
         MIPS_INVAL("decode_opc_mxu");
         gen_reserved_instruction(ctx);
@@ -1440,12 +4523,215 @@ static void decode_opc_mxu__pool00(DisasContext *ctx)
 
 static void decode_opc_mxu__pool04(DisasContext *ctx)
 {
-    uint32_t opcode = extract32(ctx->opcode, 20, 1);
+    uint32_t reversed = extract32(ctx->opcode, 20, 1);
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+
+    /* Don't care about opcode bits as their meaning is unknown yet */
+    switch (opcode) {
+    default:
+        gen_mxu_s32ldxx(ctx, reversed, false);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool05(DisasContext *ctx)
+{
+    uint32_t reversed = extract32(ctx->opcode, 20, 1);
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+
+    /* Don't care about opcode bits as their meaning is unknown yet */
+    switch (opcode) {
+    default:
+        gen_mxu_s32stxx(ctx, reversed, false);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool06(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+    uint32_t strd2  = extract32(ctx->opcode, 14, 2);
 
     switch (opcode) {
-    case OPC_MXU_S32LDD:
-    case OPC_MXU_S32LDDR:
-        gen_mxu_s32ldd_s32lddr(ctx);
+    case OPC_MXU_S32LDST:
+    case OPC_MXU_S32LDSTR:
+        if (strd2 <= 2) {
+            gen_mxu_s32ldxvx(ctx, opcode, false, strd2);
+            break;
+        }
+        /* fallthrough */
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool07(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+    uint32_t strd2  = extract32(ctx->opcode, 14, 2);
+
+    switch (opcode) {
+    case OPC_MXU_S32LDST:
+    case OPC_MXU_S32LDSTR:
+        if (strd2 <= 2) {
+            gen_mxu_s32stxvx(ctx, opcode, false, strd2);
+            break;
+        }
+        /* fallthrough */
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool08(DisasContext *ctx)
+{
+    uint32_t reversed = extract32(ctx->opcode, 20, 1);
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+
+    /* Don't care about opcode bits as their meaning is unknown yet */
+    switch (opcode) {
+    default:
+        gen_mxu_s32ldxx(ctx, reversed, true);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool09(DisasContext *ctx)
+{
+    uint32_t reversed = extract32(ctx->opcode, 20, 1);
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+
+    /* Don't care about opcode bits as their meaning is unknown yet */
+    switch (opcode) {
+    default:
+        gen_mxu_s32stxx(ctx, reversed, true);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool10(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+    uint32_t strd2  = extract32(ctx->opcode, 14, 2);
+
+    switch (opcode) {
+    case OPC_MXU_S32LDST:
+    case OPC_MXU_S32LDSTR:
+        if (strd2 <= 2) {
+            gen_mxu_s32ldxvx(ctx, opcode, true, strd2);
+            break;
+        }
+        /* fallthrough */
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool11(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 10, 4);
+    uint32_t strd2  = extract32(ctx->opcode, 14, 2);
+
+    switch (opcode) {
+    case OPC_MXU_S32LDST:
+    case OPC_MXU_S32LDSTR:
+        if (strd2 <= 2) {
+            gen_mxu_s32stxvx(ctx, opcode, true, strd2);
+            break;
+        }
+        /* fallthrough */
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool12(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+
+    switch (opcode) {
+    case OPC_MXU_D32ACC:
+        gen_mxu_d32acc(ctx);
+        break;
+    case OPC_MXU_D32ACCM:
+        gen_mxu_d32accm(ctx);
+        break;
+    case OPC_MXU_D32ASUM:
+        gen_mxu_d32asum(ctx);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool13(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+
+    switch (opcode) {
+    case OPC_MXU_Q16ACC:
+        gen_mxu_q16acc(ctx);
+        break;
+    case OPC_MXU_Q16ACCM:
+        gen_mxu_q16accm(ctx);
+        break;
+    case OPC_MXU_D16ASUM:
+        gen_mxu_d16asum(ctx);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool14(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+
+    switch (opcode) {
+    case OPC_MXU_Q8ADDE:
+        gen_mxu_q8adde(ctx, false);
+        break;
+    case OPC_MXU_D8SUM:
+        gen_mxu_d8sum(ctx, false);
+        break;
+    case OPC_MXU_D8SUMC:
+        gen_mxu_d8sum(ctx, true);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool15(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 14, 2);
+
+    switch (opcode) {
+    case OPC_MXU_S32MUL:
+        gen_mxu_s32mul(ctx, false);
+        break;
+    case OPC_MXU_S32MULU:
+        gen_mxu_s32mul(ctx, true);
+        break;
+    case OPC_MXU_S32EXTR:
+        gen_mxu_s32extr(ctx);
+        break;
+    case OPC_MXU_S32EXTRV:
+        gen_mxu_s32extrv(ctx);
         break;
     default:
         MIPS_INVAL("decode_opc_mxu");
@@ -1459,9 +4745,18 @@ static void decode_opc_mxu__pool16(DisasContext *ctx)
     uint32_t opcode = extract32(ctx->opcode, 18, 3);
 
     switch (opcode) {
+    case OPC_MXU_D32SARW:
+        gen_mxu_d32sarl(ctx, true);
+        break;
+    case OPC_MXU_S32ALN:
+        gen_mxu_S32ALN(ctx);
+        break;
     case OPC_MXU_S32ALNI:
         gen_mxu_S32ALNI(ctx);
         break;
+    case OPC_MXU_S32LUI:
+        gen_mxu_s32lui(ctx);
+        break;
     case OPC_MXU_S32NOR:
         gen_mxu_S32NOR(ctx);
         break;
@@ -1481,14 +4776,128 @@ static void decode_opc_mxu__pool16(DisasContext *ctx)
     }
 }
 
+static void decode_opc_mxu__pool17(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 6, 3);
+    uint32_t strd2  = extract32(ctx->opcode, 9, 2);
+
+    if (strd2 > 2) {
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        return;
+    }
+
+    switch (opcode) {
+    case OPC_MXU_LXW:
+          gen_mxu_lxx(ctx, strd2, MO_TE | MO_UL);
+          break;
+    case OPC_MXU_LXB:
+          gen_mxu_lxx(ctx, strd2, MO_TE | MO_SB);
+          break;
+    case OPC_MXU_LXH:
+          gen_mxu_lxx(ctx, strd2, MO_TE | MO_SW);
+          break;
+    case OPC_MXU_LXBU:
+          gen_mxu_lxx(ctx, strd2, MO_TE | MO_UB);
+          break;
+    case OPC_MXU_LXHU:
+          gen_mxu_lxx(ctx, strd2, MO_TE | MO_UW);
+          break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool18(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 18, 3);
+
+    switch (opcode) {
+    case OPC_MXU_D32SLLV:
+        gen_mxu_d32sxxv(ctx, false, false);
+        break;
+    case OPC_MXU_D32SLRV:
+        gen_mxu_d32sxxv(ctx, true, false);
+        break;
+    case OPC_MXU_D32SARV:
+        gen_mxu_d32sxxv(ctx, true, true);
+        break;
+    case OPC_MXU_Q16SLLV:
+        gen_mxu_q16sxxv(ctx, false, false);
+        break;
+    case OPC_MXU_Q16SLRV:
+        gen_mxu_q16sxxv(ctx, true, false);
+        break;
+    case OPC_MXU_Q16SARV:
+        gen_mxu_q16sxxv(ctx, true, true);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
 static void decode_opc_mxu__pool19(DisasContext *ctx)
 {
-    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+    uint32_t opcode = extract32(ctx->opcode, 22, 4);
 
     switch (opcode) {
     case OPC_MXU_Q8MUL:
+        gen_mxu_q8mul_mac(ctx, false, false);
+        break;
     case OPC_MXU_Q8MULSU:
-        gen_mxu_q8mul_q8mulsu(ctx);
+        gen_mxu_q8mul_mac(ctx, true, false);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool20(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 18, 3);
+
+    switch (opcode) {
+    case OPC_MXU_Q8MOVZ:
+        gen_mxu_q8movzn(ctx, TCG_COND_NE);
+        break;
+    case OPC_MXU_Q8MOVN:
+        gen_mxu_q8movzn(ctx, TCG_COND_EQ);
+        break;
+    case OPC_MXU_D16MOVZ:
+        gen_mxu_d16movzn(ctx, TCG_COND_NE);
+        break;
+    case OPC_MXU_D16MOVN:
+        gen_mxu_d16movzn(ctx, TCG_COND_EQ);
+        break;
+    case OPC_MXU_S32MOVZ:
+        gen_mxu_s32movzn(ctx, TCG_COND_NE);
+        break;
+    case OPC_MXU_S32MOVN:
+        gen_mxu_s32movzn(ctx, TCG_COND_EQ);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+static void decode_opc_mxu__pool21(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+
+    switch (opcode) {
+    case OPC_MXU_Q8MAC:
+        gen_mxu_q8mul_mac(ctx, false, true);
+        break;
+    case OPC_MXU_Q8MACSU:
+        gen_mxu_q8mul_mac(ctx, true, true);
         break;
     default:
         MIPS_INVAL("decode_opc_mxu");
@@ -1497,6 +4906,7 @@ static void decode_opc_mxu__pool19(DisasContext *ctx)
     }
 }
 
+
 bool decode_ase_mxu(DisasContext *ctx, uint32_t insn)
 {
     uint32_t opcode = extract32(insn, 0, 6);
@@ -1520,30 +4930,163 @@ bool decode_ase_mxu(DisasContext *ctx, uint32_t insn)
         tcg_gen_brcondi_tl(TCG_COND_NE, t_mxu_cr, MXU_CR_MXU_EN, l_exit);
 
         switch (opcode) {
+        case OPC_MXU_S32MADD:
+        case OPC_MXU_S32MADDU:
+        case OPC_MXU_S32MSUB:
+        case OPC_MXU_S32MSUBU:
+            return decode_opc_mxu_s32madd_sub(ctx);
         case OPC_MXU__POOL00:
             decode_opc_mxu__pool00(ctx);
             break;
         case OPC_MXU_D16MUL:
-            gen_mxu_d16mul(ctx);
+            gen_mxu_d16mul(ctx, false, false);
             break;
         case OPC_MXU_D16MAC:
-            gen_mxu_d16mac(ctx);
+            gen_mxu_d16mac(ctx, false, false);
+            break;
+        case OPC_MXU_D16MACF:
+            gen_mxu_d16mac(ctx, true, true);
+            break;
+        case OPC_MXU_D16MADL:
+            gen_mxu_d16madl(ctx);
+            break;
+        case OPC_MXU_S16MAD:
+            gen_mxu_s16mad(ctx);
+            break;
+        case OPC_MXU_Q16ADD:
+            gen_mxu_q16add(ctx);
+            break;
+        case OPC_MXU_D16MACE:
+            gen_mxu_d16mac(ctx, true, false);
+            break;
+        case OPC_MXU__POOL01:
+            decode_opc_mxu__pool01(ctx);
+            break;
+        case OPC_MXU__POOL02:
+            decode_opc_mxu__pool02(ctx);
+            break;
+        case OPC_MXU__POOL03:
+            decode_opc_mxu__pool03(ctx);
             break;
         case OPC_MXU__POOL04:
             decode_opc_mxu__pool04(ctx);
             break;
+        case OPC_MXU__POOL05:
+            decode_opc_mxu__pool05(ctx);
+            break;
+        case OPC_MXU__POOL06:
+            decode_opc_mxu__pool06(ctx);
+            break;
+        case OPC_MXU__POOL07:
+            decode_opc_mxu__pool07(ctx);
+            break;
+        case OPC_MXU__POOL08:
+            decode_opc_mxu__pool08(ctx);
+            break;
+        case OPC_MXU__POOL09:
+            decode_opc_mxu__pool09(ctx);
+            break;
+        case OPC_MXU__POOL10:
+            decode_opc_mxu__pool10(ctx);
+            break;
+        case OPC_MXU__POOL11:
+            decode_opc_mxu__pool11(ctx);
+            break;
+        case OPC_MXU_D32ADD:
+            gen_mxu_d32add(ctx);
+            break;
+        case OPC_MXU__POOL12:
+            decode_opc_mxu__pool12(ctx);
+            break;
+        case OPC_MXU__POOL13:
+            decode_opc_mxu__pool13(ctx);
+            break;
+        case OPC_MXU__POOL14:
+            decode_opc_mxu__pool14(ctx);
+            break;
+        case OPC_MXU_Q8ACCE:
+            gen_mxu_q8adde(ctx, true);
+            break;
         case OPC_MXU_S8LDD:
-            gen_mxu_s8ldd(ctx);
+            gen_mxu_s8ldd(ctx, false);
+            break;
+        case OPC_MXU_S8STD:
+            gen_mxu_s8std(ctx, false);
+            break;
+        case OPC_MXU_S8LDI:
+            gen_mxu_s8ldd(ctx, true);
+            break;
+        case OPC_MXU_S8SDI:
+            gen_mxu_s8std(ctx, true);
+            break;
+        case OPC_MXU__POOL15:
+            decode_opc_mxu__pool15(ctx);
             break;
         case OPC_MXU__POOL16:
             decode_opc_mxu__pool16(ctx);
             break;
+        case OPC_MXU__POOL17:
+            decode_opc_mxu__pool17(ctx);
+            break;
+        case OPC_MXU_S16LDD:
+            gen_mxu_s16ldd(ctx, false);
+            break;
+        case OPC_MXU_S16STD:
+            gen_mxu_s16std(ctx, false);
+            break;
+        case OPC_MXU_S16LDI:
+            gen_mxu_s16ldd(ctx, true);
+            break;
+        case OPC_MXU_S16SDI:
+            gen_mxu_s16std(ctx, true);
+            break;
+        case OPC_MXU_D32SLL:
+            gen_mxu_d32sxx(ctx, false, false);
+            break;
+        case OPC_MXU_D32SLR:
+            gen_mxu_d32sxx(ctx, true, false);
+            break;
+        case OPC_MXU_D32SARL:
+            gen_mxu_d32sarl(ctx, false);
+            break;
+        case OPC_MXU_D32SAR:
+            gen_mxu_d32sxx(ctx, true, true);
+            break;
+        case OPC_MXU_Q16SLL:
+            gen_mxu_q16sxx(ctx, false, false);
+            break;
+        case OPC_MXU__POOL18:
+            decode_opc_mxu__pool18(ctx);
+            break;
+        case OPC_MXU_Q16SLR:
+            gen_mxu_q16sxx(ctx, true, false);
+            break;
+        case OPC_MXU_Q16SAR:
+            gen_mxu_q16sxx(ctx, true, true);
+            break;
         case OPC_MXU__POOL19:
             decode_opc_mxu__pool19(ctx);
             break;
+        case OPC_MXU__POOL20:
+            decode_opc_mxu__pool20(ctx);
+            break;
+        case OPC_MXU__POOL21:
+            decode_opc_mxu__pool21(ctx);
+            break;
+        case OPC_MXU_Q16SCOP:
+            gen_mxu_q16scop(ctx);
+            break;
+        case OPC_MXU_Q8MADL:
+            gen_mxu_q8madl(ctx);
+            break;
+        case OPC_MXU_S32SFL:
+            gen_mxu_s32sfl(ctx);
+            break;
+        case OPC_MXU_Q8SAD:
+            gen_mxu_q8sad(ctx);
+            break;
         default:
-            MIPS_INVAL("decode_opc_mxu");
-            gen_reserved_instruction(ctx);
+            return false;
         }
 
         gen_set_label(l_exit);
diff --git a/target/mips/tcg/op_helper.c b/target/mips/tcg/op_helper.c
index ef3dafcbb3..98935b5e64 100644
--- a/target/mips/tcg/op_helper.c
+++ b/target/mips/tcg/op_helper.c
@@ -257,6 +257,22 @@ void helper_pmon(CPUMIPSState *env, int function)
     }
 }
 
+#ifdef TARGET_MIPS64
+target_ulong helper_lcsr_cpucfg(CPUMIPSState *env, target_ulong rs)
+{
+    switch (rs) {
+    case 0:
+        return env->CP0_PRid;
+    case 1:
+        return env->lcsr_cpucfg1;
+    case 2:
+        return env->lcsr_cpucfg2;
+    default:
+        return 0;
+    }
+}
+#endif
+
 #if !defined(CONFIG_USER_ONLY)
 
 void mips_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
diff --git a/target/mips/tcg/sysemu/lcsr_helper.c b/target/mips/tcg/sysemu/lcsr_helper.c
new file mode 100644
index 0000000000..942143d209
--- /dev/null
+++ b/target/mips/tcg/sysemu/lcsr_helper.c
@@ -0,0 +1,45 @@
+/*
+ * Loongson CSR instructions translation routines
+ *
+ *  Copyright (c) 2023 Jiaxun Yang <jiaxun.yang@flygoat.com>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "cpu.h"
+#include "internal.h"
+#include "qemu/host-utils.h"
+#include "exec/helper-proto.h"
+#include "exec/exec-all.h"
+#include "exec/cpu_ldst.h"
+
+#define GET_MEMTXATTRS(cas) \
+        ((MemTxAttrs){.requester_id = env_cpu(cas)->cpu_index})
+
+uint64_t helper_lcsr_rdcsr(CPUMIPSState *env, target_ulong r_addr)
+{
+    return address_space_ldl(&env->iocsr.as, r_addr,
+                             GET_MEMTXATTRS(env), NULL);
+}
+
+uint64_t helper_lcsr_drdcsr(CPUMIPSState *env, target_ulong r_addr)
+{
+    return address_space_ldq(&env->iocsr.as, r_addr,
+                             GET_MEMTXATTRS(env), NULL);
+}
+
+void helper_lcsr_wrcsr(CPUMIPSState *env, target_ulong w_addr,
+                      target_ulong val)
+{
+    address_space_stl(&env->iocsr.as, w_addr,
+                      val, GET_MEMTXATTRS(env), NULL);
+}
+
+void helper_lcsr_dwrcsr(CPUMIPSState *env, target_ulong w_addr,
+                      target_ulong val)
+{
+    address_space_stq(&env->iocsr.as, w_addr,
+                      val, GET_MEMTXATTRS(env), NULL);
+}
diff --git a/target/mips/tcg/sysemu/meson.build b/target/mips/tcg/sysemu/meson.build
index 43b35b3803..ec665a4b1e 100644
--- a/target/mips/tcg/sysemu/meson.build
+++ b/target/mips/tcg/sysemu/meson.build
@@ -4,3 +4,7 @@ mips_system_ss.add(files(
   'special_helper.c',
   'tlb_helper.c',
 ))
+
+mips_system_ss.add(when: 'TARGET_MIPS64', if_true: files(
+  'lcsr_helper.c',
+))
diff --git a/target/mips/tcg/sysemu_helper.h.inc b/target/mips/tcg/sysemu_helper.h.inc
index af585b5d9c..f163af1eac 100644
--- a/target/mips/tcg/sysemu_helper.h.inc
+++ b/target/mips/tcg/sysemu_helper.h.inc
@@ -181,3 +181,11 @@ DEF_HELPER_1(eret, void, env)
 DEF_HELPER_1(eretnc, void, env)
 DEF_HELPER_1(deret, void, env)
 DEF_HELPER_3(cache, void, env, tl, i32)
+
+#ifdef TARGET_MIPS64
+/* Longson CSR */
+DEF_HELPER_2(lcsr_rdcsr, i64, env, tl)
+DEF_HELPER_2(lcsr_drdcsr, i64, env, tl)
+DEF_HELPER_3(lcsr_wrcsr, void, env, tl, tl)
+DEF_HELPER_3(lcsr_dwrcsr, void, env, tl, tl)
+#endif
diff --git a/target/mips/tcg/translate.c b/target/mips/tcg/translate.c
index 74af91e4f5..9bb40f1849 100644
--- a/target/mips/tcg/translate.c
+++ b/target/mips/tcg/translate.c
@@ -14644,12 +14644,9 @@ static bool decode_opc_legacy(CPUMIPSState *env, DisasContext *ctx)
         }
 #endif
         if (TARGET_LONG_BITS == 32 && (ctx->insn_flags & ASE_MXU)) {
-            if (MASK_SPECIAL2(ctx->opcode) == OPC_MUL) {
-                gen_arith(ctx, OPC_MUL, rd, rs, rt);
-            } else {
-                decode_ase_mxu(ctx, ctx->opcode);
+            if (decode_ase_mxu(ctx, ctx->opcode)) {
+                break;
             }
-            break;
         }
         decode_opc_special2_legacy(env, ctx);
         break;
@@ -15352,6 +15349,9 @@ static void decode_opc(CPUMIPSState *env, DisasContext *ctx)
         return;
     }
 #if defined(TARGET_MIPS64)
+    if (ase_lcsr_available(env) && decode_ase_lcsr(ctx, ctx->opcode)) {
+        return;
+    }
     if (cpu_supports_isa(env, INSN_OCTEON) && decode_ext_octeon(ctx, ctx->opcode)) {
         return;
     }
diff --git a/target/mips/tcg/translate.h b/target/mips/tcg/translate.h
index 3b0498a47a..db3dc932c7 100644
--- a/target/mips/tcg/translate.h
+++ b/target/mips/tcg/translate.h
@@ -221,6 +221,7 @@ bool decode_isa_rel6(DisasContext *ctx, uint32_t insn);
 bool decode_ase_msa(DisasContext *ctx, uint32_t insn);
 bool decode_ext_txx9(DisasContext *ctx, uint32_t insn);
 #if defined(TARGET_MIPS64)
+bool decode_ase_lcsr(DisasContext *ctx, uint32_t insn);
 bool decode_ext_tx79(DisasContext *ctx, uint32_t insn);
 bool decode_ext_octeon(DisasContext *ctx, uint32_t insn);
 #endif