21 files changed, 1980 insertions, 437 deletions
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 9b1b98705f..3b365a78cb 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4742,7 +4742,7 @@ static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
     bool secure = arm_is_secure_below_el3(env);
     int mask = secure ? ARMMMUIdxBit_SE2 : ARMMMUIdxBit_E2;
-    int bits = tlbbits_for_regime(env, secure ? ARMMMUIdx_E2 : ARMMMUIdx_SE2,
+    int bits = tlbbits_for_regime(env, secure ? ARMMMUIdx_SE2 : ARMMMUIdx_E2,
                                   pageaddr);
 
     tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, bits);
diff --git a/target/arm/meson.build b/target/arm/meson.build
index 15b936c101..5bfaf43b50 100644
--- a/target/arm/meson.build
+++ b/target/arm/meson.build
@@ -1,11 +1,11 @@
 gen = [
   decodetree.process('sve.decode', extra_args: '--decode=disas_sve'),
-  decodetree.process('neon-shared.decode', extra_args: '--static-decode=disas_neon_shared'),
-  decodetree.process('neon-dp.decode', extra_args: '--static-decode=disas_neon_dp'),
-  decodetree.process('neon-ls.decode', extra_args: '--static-decode=disas_neon_ls'),
-  decodetree.process('vfp.decode', extra_args: '--static-decode=disas_vfp'),
-  decodetree.process('vfp-uncond.decode', extra_args: '--static-decode=disas_vfp_uncond'),
-  decodetree.process('m-nocp.decode', extra_args: '--static-decode=disas_m_nocp'),
+  decodetree.process('neon-shared.decode', extra_args: '--decode=disas_neon_shared'),
+  decodetree.process('neon-dp.decode', extra_args: '--decode=disas_neon_dp'),
+  decodetree.process('neon-ls.decode', extra_args: '--decode=disas_neon_ls'),
+  decodetree.process('vfp.decode', extra_args: '--decode=disas_vfp'),
+  decodetree.process('vfp-uncond.decode', extra_args: '--decode=disas_vfp_uncond'),
+  decodetree.process('m-nocp.decode', extra_args: '--decode=disas_m_nocp'),
   decodetree.process('a32.decode', extra_args: '--static-decode=disas_a32'),
   decodetree.process('a32-uncond.decode', extra_args: '--static-decode=disas_a32_uncond'),
   decodetree.process('t32.decode', extra_args: '--static-decode=disas_t32'),
@@ -26,6 +26,9 @@ arm_ss.add(files(
   'op_helper.c',
   'tlb_helper.c',
   'translate.c',
+  'translate-m-nocp.c',
+  'translate-neon.c',
+  'translate-vfp.c',
   'vec_helper.c',
   'vfp_helper.c',
   'cpu_tcg.c',
diff --git a/target/arm/op_helper.c b/target/arm/op_helper.c
index 78b831f181..efcb600992 100644
--- a/target/arm/op_helper.c
+++ b/target/arm/op_helper.c
@@ -228,6 +228,7 @@ void HELPER(setend)(CPUARMState *env)
     arm_rebuild_hflags(env);
 }
 
+#ifndef CONFIG_USER_ONLY
 /* Function checks whether WFx (WFI/WFE) instructions are set up to be trapped.
  * The function returns the target EL (1-3) if the instruction is to be trapped;
  * otherwise it returns 0 indicating it is not trapped.
@@ -282,9 +283,21 @@ static inline int check_wfx_trap(CPUARMState *env, bool is_wfe)
 
     return 0;
 }
+#endif
 
 void HELPER(wfi)(CPUARMState *env, uint32_t insn_len)
 {
+#ifdef CONFIG_USER_ONLY
+    /*
+     * WFI in the user-mode emulator is technically permitted but not
+     * something any real-world code would do. AArch64 Linux kernels
+     * trap it via SCTRL_EL1.nTWI and make it an (expensive) NOP;
+     * AArch32 kernels don't trap it so it will delay a bit.
+     * For QEMU, make it NOP here, because trying to raise EXCP_HLT
+     * would trigger an abort.
+     */
+    return;
+#else
     CPUState *cs = env_cpu(env);
     int target_el = check_wfx_trap(env, false);
 
@@ -309,6 +322,7 @@ void HELPER(wfi)(CPUARMState *env, uint32_t insn_len)
     cs->exception_index = EXCP_HLT;
     cs->halted = 1;
     cpu_loop_exit(cs);
+#endif
 }
 
 void HELPER(wfe)(CPUARMState *env)
diff --git a/target/arm/translate-a32.h b/target/arm/translate-a32.h
new file mode 100644
index 0000000000..c997f4e321
--- /dev/null
+++ b/target/arm/translate-a32.h
@@ -0,0 +1,144 @@
+/*
+ *  AArch32 translation, common definitions.
+ *
+ * Copyright (c) 2021 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TARGET_ARM_TRANSLATE_A64_H
+#define TARGET_ARM_TRANSLATE_A64_H
+
+/* Prototypes for autogenerated disassembler functions */
+bool disas_m_nocp(DisasContext *dc, uint32_t insn);
+bool disas_vfp(DisasContext *s, uint32_t insn);
+bool disas_vfp_uncond(DisasContext *s, uint32_t insn);
+bool disas_neon_dp(DisasContext *s, uint32_t insn);
+bool disas_neon_ls(DisasContext *s, uint32_t insn);
+bool disas_neon_shared(DisasContext *s, uint32_t insn);
+
+void load_reg_var(DisasContext *s, TCGv_i32 var, int reg);
+void arm_gen_condlabel(DisasContext *s);
+bool vfp_access_check(DisasContext *s);
+void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop);
+void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop);
+void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop);
+void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop);
+TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs);
+void gen_set_cpsr(TCGv_i32 var, uint32_t mask);
+void gen_set_condexec(DisasContext *s);
+void gen_set_pc_im(DisasContext *s, target_ulong val);
+void gen_lookup_tb(DisasContext *s);
+long vfp_reg_offset(bool dp, unsigned reg);
+long neon_full_reg_offset(unsigned reg);
+long neon_element_offset(int reg, int element, MemOp memop);
+void gen_rev16(TCGv_i32 dest, TCGv_i32 var);
+
+static inline TCGv_i32 load_cpu_offset(int offset)
+{
+    TCGv_i32 tmp = tcg_temp_new_i32();
+    tcg_gen_ld_i32(tmp, cpu_env, offset);
+    return tmp;
+}
+
+#define load_cpu_field(name) load_cpu_offset(offsetof(CPUARMState, name))
+
+static inline void store_cpu_offset(TCGv_i32 var, int offset)
+{
+    tcg_gen_st_i32(var, cpu_env, offset);
+    tcg_temp_free_i32(var);
+}
+
+#define store_cpu_field(var, name) \
+    store_cpu_offset(var, offsetof(CPUARMState, name))
+
+/* Create a new temporary and set it to the value of a CPU register.  */
+static inline TCGv_i32 load_reg(DisasContext *s, int reg)
+{
+    TCGv_i32 tmp = tcg_temp_new_i32();
+    load_reg_var(s, tmp, reg);
+    return tmp;
+}
+
+void store_reg(DisasContext *s, int reg, TCGv_i32 var);
+
+void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val,
+                              TCGv_i32 a32, int index, MemOp opc);
+void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val,
+                              TCGv_i32 a32, int index, MemOp opc);
+void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val,
+                              TCGv_i32 a32, int index, MemOp opc);
+void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val,
+                              TCGv_i32 a32, int index, MemOp opc);
+void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+                     int index, MemOp opc);
+void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+                     int index, MemOp opc);
+void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+                     int index, MemOp opc);
+void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+                     int index, MemOp opc);
+
+#define DO_GEN_LD(SUFF, OPC)                                            \
+    static inline void gen_aa32_ld##SUFF(DisasContext *s, TCGv_i32 val, \
+                                         TCGv_i32 a32, int index)       \
+    {                                                                   \
+        gen_aa32_ld_i32(s, val, a32, index, OPC);                       \
+    }
+
+#define DO_GEN_ST(SUFF, OPC)                                            \
+    static inline void gen_aa32_st##SUFF(DisasContext *s, TCGv_i32 val, \
+                                         TCGv_i32 a32, int index)       \
+    {                                                                   \
+        gen_aa32_st_i32(s, val, a32, index, OPC);                       \
+    }
+
+static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val,
+                                 TCGv_i32 a32, int index)
+{
+    gen_aa32_ld_i64(s, val, a32, index, MO_Q);
+}
+
+static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val,
+                                 TCGv_i32 a32, int index)
+{
+    gen_aa32_st_i64(s, val, a32, index, MO_Q);
+}
+
+DO_GEN_LD(8u, MO_UB)
+DO_GEN_LD(16u, MO_UW)
+DO_GEN_LD(32u, MO_UL)
+DO_GEN_ST(8, MO_UB)
+DO_GEN_ST(16, MO_UW)
+DO_GEN_ST(32, MO_UL)
+
+#undef DO_GEN_LD
+#undef DO_GEN_ST
+
+#if defined(CONFIG_USER_ONLY)
+#define IS_USER(s) 1
+#else
+#define IS_USER(s) (s->user)
+#endif
+
+/* Set NZCV flags from the high 4 bits of var.  */
+#define gen_set_nzcv(var) gen_set_cpsr(var, CPSR_NZCV)
+
+/* Swap low and high halfwords.  */
+static inline void gen_swap_half(TCGv_i32 dest, TCGv_i32 var)
+{
+    tcg_gen_rotri_i32(dest, var, 16);
+}
+
+#endif
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 95897e63af..0c80d0b505 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -359,14 +359,6 @@ static void gen_exception_internal_insn(DisasContext *s, uint64_t pc, int excp)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_exception_insn(DisasContext *s, uint64_t pc, int excp,
-                               uint32_t syndrome, uint32_t target_el)
-{
-    gen_a64_set_pc_im(pc);
-    gen_exception(excp, syndrome, target_el);
-    s->base.is_jmp = DISAS_NORETURN;
-}
-
 static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syndrome)
 {
     TCGv_i32 tcg_syn;
@@ -437,13 +429,6 @@ static inline void gen_goto_tb(DisasContext *s, int n, uint64_t dest)
     }
 }
 
-void unallocated_encoding(DisasContext *s)
-{
-    /* Unallocated and reserved encodings are uncategorized */
-    gen_exception_insn(s, s->pc_curr, EXCP_UDEF, syn_uncategorized(),
-                       default_exception_el(s));
-}
-
 static void init_tmp_a64_array(DisasContext *s)
 {
 #ifdef CONFIG_DEBUG_TCG
diff --git a/target/arm/translate-a64.h b/target/arm/translate-a64.h
index 868d355048..89437276e7 100644
--- a/target/arm/translate-a64.h
+++ b/target/arm/translate-a64.h
@@ -18,8 +18,6 @@
 #ifndef TARGET_ARM_TRANSLATE_A64_H
 #define TARGET_ARM_TRANSLATE_A64_H
 
-void unallocated_encoding(DisasContext *s);
-
 #define unsupported_encoding(s, insn)                                    \
     do {                                                                 \
         qemu_log_mask(LOG_UNIMP,                                         \
diff --git a/target/arm/translate-m-nocp.c b/target/arm/translate-m-nocp.c
new file mode 100644
index 0000000000..d47eb8e153
--- /dev/null
+++ b/target/arm/translate-m-nocp.c
@@ -0,0 +1,221 @@
+/*
+ *  ARM translation: M-profile NOCP special-case instructions
+ *
+ *  Copyright (c) 2020 Linaro, Ltd.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "translate.h"
+#include "translate-a32.h"
+
+#include "decode-m-nocp.c.inc"
+
+/*
+ * Decode VLLDM and VLSTM are nonstandard because:
+ *  * if there is no FPU then these insns must NOP in
+ *    Secure state and UNDEF in Nonsecure state
+ *  * if there is an FPU then these insns do not have
+ *    the usual behaviour that vfp_access_check() provides of
+ *    being controlled by CPACR/NSACR enable bits or the
+ *    lazy-stacking logic.
+ */
+static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
+{
+    TCGv_i32 fptr;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_M) ||
+        !arm_dc_feature(s, ARM_FEATURE_V8)) {
+        return false;
+    }
+
+    if (a->op) {
+        /*
+         * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not
+         * to take the IMPDEF option to make memory accesses to the stack
+         * slots that correspond to the D16-D31 registers (discarding
+         * read data and writing UNKNOWN values), so for us the T2
+         * encoding behaves identically to the T1 encoding.
+         */
+        if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+            return false;
+        }
+    } else {
+        /*
+         * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs.
+         * This is currently architecturally impossible, but we add the
+         * check to stay in line with the pseudocode. Note that we must
+         * emit code for the UNDEF so it takes precedence over the NOCP.
+         */
+        if (dc_isar_feature(aa32_simd_r32, s)) {
+            unallocated_encoding(s);
+            return true;
+        }
+    }
+
+    /*
+     * If not secure, UNDEF. We must emit code for this
+     * rather than returning false so that this takes
+     * precedence over the m-nocp.decode NOCP fallback.
+     */
+    if (!s->v8m_secure) {
+        unallocated_encoding(s);
+        return true;
+    }
+    /* If no fpu, NOP. */
+    if (!dc_isar_feature(aa32_vfp, s)) {
+        return true;
+    }
+
+    fptr = load_reg(s, a->rn);
+    if (a->l) {
+        gen_helper_v7m_vlldm(cpu_env, fptr);
+    } else {
+        gen_helper_v7m_vlstm(cpu_env, fptr);
+    }
+    tcg_temp_free_i32(fptr);
+
+    /* End the TB, because we have updated FP control bits */
+    s->base.is_jmp = DISAS_UPDATE_EXIT;
+    return true;
+}
+
+static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
+{
+    int btmreg, topreg;
+    TCGv_i64 zero;
+    TCGv_i32 aspen, sfpa;
+
+    if (!dc_isar_feature(aa32_m_sec_state, s)) {
+        /* Before v8.1M, fall through in decode to NOCP check */
+        return false;
+    }
+
+    /* Explicitly UNDEF because this takes precedence over NOCP */
+    if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) {
+        unallocated_encoding(s);
+        return true;
+    }
+
+    if (!dc_isar_feature(aa32_vfp_simd, s)) {
+        /* NOP if we have neither FP nor MVE */
+        return true;
+    }
+
+    /*
+     * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no
+     * active floating point context so we must NOP (without doing
+     * any lazy state preservation or the NOCP check).
+     */
+    aspen = load_cpu_field(v7m.fpccr[M_REG_S]);
+    sfpa = load_cpu_field(v7m.control[M_REG_S]);
+    tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+    tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
+    tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK);
+    tcg_gen_or_i32(sfpa, sfpa, aspen);
+    arm_gen_condlabel(s);
+    tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel);
+
+    if (s->fp_excp_el != 0) {
+        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
+                           syn_uncategorized(), s->fp_excp_el);
+        return true;
+    }
+
+    topreg = a->vd + a->imm - 1;
+    btmreg = a->vd;
+
+    /* Convert to Sreg numbers if the insn specified in Dregs */
+    if (a->size == 3) {
+        topreg = topreg * 2 + 1;
+        btmreg *= 2;
+    }
+
+    if (topreg > 63 || (topreg > 31 && !(topreg & 1))) {
+        /* UNPREDICTABLE: we choose to undef */
+        unallocated_encoding(s);
+        return true;
+    }
+
+    /* Silently ignore requests to clear D16-D31 if they don't exist */
+    if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) {
+        topreg = 31;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /* Zero the Sregs from btmreg to topreg inclusive. */
+    zero = tcg_const_i64(0);
+    if (btmreg & 1) {
+        write_neon_element64(zero, btmreg >> 1, 1, MO_32);
+        btmreg++;
+    }
+    for (; btmreg + 1 <= topreg; btmreg += 2) {
+        write_neon_element64(zero, btmreg >> 1, 0, MO_64);
+    }
+    if (btmreg == topreg) {
+        write_neon_element64(zero, btmreg >> 1, 0, MO_32);
+        btmreg++;
+    }
+    assert(btmreg == topreg + 1);
+    /* TODO: when MVE is implemented, zero VPR here */
+    return true;
+}
+
+static bool trans_NOCP(DisasContext *s, arg_nocp *a)
+{
+    /*
+     * Handle M-profile early check for disabled coprocessor:
+     * all we need to do here is emit the NOCP exception if
+     * the coprocessor is disabled. Otherwise we return false
+     * and the real VFP/etc decode will handle the insn.
+     */
+    assert(arm_dc_feature(s, ARM_FEATURE_M));
+
+    if (a->cp == 11) {
+        a->cp = 10;
+    }
+    if (arm_dc_feature(s, ARM_FEATURE_V8_1M) &&
+        (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) {
+        /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */
+        a->cp = 10;
+    }
+
+    if (a->cp != 10) {
+        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
+                           syn_uncategorized(), default_exception_el(s));
+        return true;
+    }
+
+    if (s->fp_excp_el != 0) {
+        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
+                           syn_uncategorized(), s->fp_excp_el);
+        return true;
+    }
+
+    return false;
+}
+
+static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a)
+{
+    /* This range needs a coprocessor check for v8.1M and later only */
+    if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
+        return false;
+    }
+    return trans_NOCP(s, a);
+}
diff --git a/target/arm/translate-neon.c.inc b/target/arm/translate-neon.c
index a02b8369a1..658bd275da 100644
--- a/target/arm/translate-neon.c.inc
+++ b/target/arm/translate-neon.c
@@ -20,11 +20,13 @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
-/*
- * This file is intended to be included from translate.c; it uses
- * some macros and definitions provided by that file.
- * It might be possible to convert it to a standalone .c file eventually.
- */
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
 
 static inline int plus1(DisasContext *s, int x)
 {
@@ -60,6 +62,13 @@ static inline int neon_3same_fp_size(DisasContext *s, int x)
 #include "decode-neon-ls.c.inc"
 #include "decode-neon-shared.c.inc"
 
+static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
+{
+    TCGv_ptr ret = tcg_temp_new_ptr();
+    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
+    return ret;
+}
+
 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
 {
     long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
diff --git a/target/arm/translate-vfp.c.inc b/target/arm/translate-vfp.c
index e20d9c7ba6..3da84f30a0 100644
--- a/target/arm/translate-vfp.c.inc
+++ b/target/arm/translate-vfp.c
@@ -20,16 +20,38 @@
  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  */
 
-/*
- * This file is intended to be included from translate.c; it uses
- * some macros and definitions provided by that file.
- * It might be possible to convert it to a standalone .c file eventually.
- */
+#include "qemu/osdep.h"
+#include "tcg/tcg-op.h"
+#include "tcg/tcg-op-gvec.h"
+#include "exec/exec-all.h"
+#include "exec/gen-icount.h"
+#include "translate.h"
+#include "translate-a32.h"
 
 /* Include the generated VFP decoder */
 #include "decode-vfp.c.inc"
 #include "decode-vfp-uncond.c.inc"
 
+static inline void vfp_load_reg64(TCGv_i64 var, int reg)
+{
+    tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg));
+}
+
+static inline void vfp_store_reg64(TCGv_i64 var, int reg)
+{
+    tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg));
+}
+
+static inline void vfp_load_reg32(TCGv_i32 var, int reg)
+{
+    tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg));
+}
+
+static inline void vfp_store_reg32(TCGv_i32 var, int reg)
+{
+    tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg));
+}
+
 /*
  * The imm8 encodes the sign bit, enough bits to represent an exponent in
  * the range 01....1xx to 10....0xx, and the most significant 4 bits of
@@ -191,7 +213,7 @@ static bool full_vfp_access_check(DisasContext *s, bool ignore_vfp_enabled)
  * The most usual kind of VFP access check, for everything except
  * FMXR/FMRX to the always-available special registers.
  */
-static bool vfp_access_check(DisasContext *s)
+bool vfp_access_check(DisasContext *s)
 {
     return full_vfp_access_check(s, false);
 }
@@ -3800,202 +3822,6 @@ static bool trans_VCVT_dp_int(DisasContext *s, arg_VCVT_dp_int *a)
     return true;
 }
 
-/*
- * Decode VLLDM and VLSTM are nonstandard because:
- *  * if there is no FPU then these insns must NOP in
- *    Secure state and UNDEF in Nonsecure state
- *  * if there is an FPU then these insns do not have
- *    the usual behaviour that vfp_access_check() provides of
- *    being controlled by CPACR/NSACR enable bits or the
- *    lazy-stacking logic.
- */
-static bool trans_VLLDM_VLSTM(DisasContext *s, arg_VLLDM_VLSTM *a)
-{
-    TCGv_i32 fptr;
-
-    if (!arm_dc_feature(s, ARM_FEATURE_M) ||
-        !arm_dc_feature(s, ARM_FEATURE_V8)) {
-        return false;
-    }
-
-    if (a->op) {
-        /*
-         * T2 encoding ({D0-D31} reglist): v8.1M and up. We choose not
-         * to take the IMPDEF option to make memory accesses to the stack
-         * slots that correspond to the D16-D31 registers (discarding
-         * read data and writing UNKNOWN values), so for us the T2
-         * encoding behaves identically to the T1 encoding.
-         */
-        if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
-            return false;
-        }
-    } else {
-        /*
-         * T1 encoding ({D0-D15} reglist); undef if we have 32 Dregs.
-         * This is currently architecturally impossible, but we add the
-         * check to stay in line with the pseudocode. Note that we must
-         * emit code for the UNDEF so it takes precedence over the NOCP.
-         */
-        if (dc_isar_feature(aa32_simd_r32, s)) {
-            unallocated_encoding(s);
-            return true;
-        }
-    }
-
-    /*
-     * If not secure, UNDEF. We must emit code for this
-     * rather than returning false so that this takes
-     * precedence over the m-nocp.decode NOCP fallback.
-     */
-    if (!s->v8m_secure) {
-        unallocated_encoding(s);
-        return true;
-    }
-    /* If no fpu, NOP. */
-    if (!dc_isar_feature(aa32_vfp, s)) {
-        return true;
-    }
-
-    fptr = load_reg(s, a->rn);
-    if (a->l) {
-        gen_helper_v7m_vlldm(cpu_env, fptr);
-    } else {
-        gen_helper_v7m_vlstm(cpu_env, fptr);
-    }
-    tcg_temp_free_i32(fptr);
-
-    /* End the TB, because we have updated FP control bits */
-    s->base.is_jmp = DISAS_UPDATE_EXIT;
-    return true;
-}
-
-static bool trans_VSCCLRM(DisasContext *s, arg_VSCCLRM *a)
-{
-    int btmreg, topreg;
-    TCGv_i64 zero;
-    TCGv_i32 aspen, sfpa;
-
-    if (!dc_isar_feature(aa32_m_sec_state, s)) {
-        /* Before v8.1M, fall through in decode to NOCP check */
-        return false;
-    }
-
-    /* Explicitly UNDEF because this takes precedence over NOCP */
-    if (!arm_dc_feature(s, ARM_FEATURE_M_MAIN) || !s->v8m_secure) {
-        unallocated_encoding(s);
-        return true;
-    }
-
-    if (!dc_isar_feature(aa32_vfp_simd, s)) {
-        /* NOP if we have neither FP nor MVE */
-        return true;
-    }
-
-    /*
-     * If FPCCR.ASPEN != 0 && CONTROL_S.SFPA == 0 then there is no
-     * active floating point context so we must NOP (without doing
-     * any lazy state preservation or the NOCP check).
-     */
-    aspen = load_cpu_field(v7m.fpccr[M_REG_S]);
-    sfpa = load_cpu_field(v7m.control[M_REG_S]);
-    tcg_gen_andi_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
-    tcg_gen_xori_i32(aspen, aspen, R_V7M_FPCCR_ASPEN_MASK);
-    tcg_gen_andi_i32(sfpa, sfpa, R_V7M_CONTROL_SFPA_MASK);
-    tcg_gen_or_i32(sfpa, sfpa, aspen);
-    arm_gen_condlabel(s);
-    tcg_gen_brcondi_i32(TCG_COND_EQ, sfpa, 0, s->condlabel);
-
-    if (s->fp_excp_el != 0) {
-        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
-                           syn_uncategorized(), s->fp_excp_el);
-        return true;
-    }
-
-    topreg = a->vd + a->imm - 1;
-    btmreg = a->vd;
-
-    /* Convert to Sreg numbers if the insn specified in Dregs */
-    if (a->size == 3) {
-        topreg = topreg * 2 + 1;
-        btmreg *= 2;
-    }
-
-    if (topreg > 63 || (topreg > 31 && !(topreg & 1))) {
-        /* UNPREDICTABLE: we choose to undef */
-        unallocated_encoding(s);
-        return true;
-    }
-
-    /* Silently ignore requests to clear D16-D31 if they don't exist */
-    if (topreg > 31 && !dc_isar_feature(aa32_simd_r32, s)) {
-        topreg = 31;
-    }
-
-    if (!vfp_access_check(s)) {
-        return true;
-    }
-
-    /* Zero the Sregs from btmreg to topreg inclusive. */
-    zero = tcg_const_i64(0);
-    if (btmreg & 1) {
-        write_neon_element64(zero, btmreg >> 1, 1, MO_32);
-        btmreg++;
-    }
-    for (; btmreg + 1 <= topreg; btmreg += 2) {
-        write_neon_element64(zero, btmreg >> 1, 0, MO_64);
-    }
-    if (btmreg == topreg) {
-        write_neon_element64(zero, btmreg >> 1, 0, MO_32);
-        btmreg++;
-    }
-    assert(btmreg == topreg + 1);
-    /* TODO: when MVE is implemented, zero VPR here */
-    return true;
-}
-
-static bool trans_NOCP(DisasContext *s, arg_nocp *a)
-{
-    /*
-     * Handle M-profile early check for disabled coprocessor:
-     * all we need to do here is emit the NOCP exception if
-     * the coprocessor is disabled. Otherwise we return false
-     * and the real VFP/etc decode will handle the insn.
-     */
-    assert(arm_dc_feature(s, ARM_FEATURE_M));
-
-    if (a->cp == 11) {
-        a->cp = 10;
-    }
-    if (arm_dc_feature(s, ARM_FEATURE_V8_1M) &&
-        (a->cp == 8 || a->cp == 9 || a->cp == 14 || a->cp == 15)) {
-        /* in v8.1M cp 8, 9, 14, 15 also are governed by the cp10 enable */
-        a->cp = 10;
-    }
-
-    if (a->cp != 10) {
-        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
-                           syn_uncategorized(), default_exception_el(s));
-        return true;
-    }
-
-    if (s->fp_excp_el != 0) {
-        gen_exception_insn(s, s->pc_curr, EXCP_NOCP,
-                           syn_uncategorized(), s->fp_excp_el);
-        return true;
-    }
-
-    return false;
-}
-
-static bool trans_NOCP_8_1(DisasContext *s, arg_nocp *a)
-{
-    /* This range needs a coprocessor check for v8.1M and later only */
-    if (!arm_dc_feature(s, ARM_FEATURE_V8_1M)) {
-        return false;
-    }
-    return trans_NOCP(s, a);
-}
-
 static bool trans_VINS(DisasContext *s, arg_VINS *a)
 {
     TCGv_i32 rd, rm;
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 43ff0d4b8a..455352bcf6 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -50,12 +50,7 @@
 #define ENABLE_ARCH_8     arm_dc_feature(s, ARM_FEATURE_V8)
 
 #include "translate.h"
-
-#if defined(CONFIG_USER_ONLY)
-#define IS_USER(s) 1
-#else
-#define IS_USER(s) (s->user)
-#endif
+#include "translate-a32.h"
 
 /* These are TCG temporaries used only by the legacy iwMMXt decoder */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
@@ -71,11 +66,6 @@ static const char * const regnames[] =
     { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 
-/* Function prototypes for gen_ functions calling Neon helpers.  */
-typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
-                                 TCGv_i32, TCGv_i32);
-/* Function prototypes for gen_ functions for fix point conversions */
-typedef void VFPGenFixPointFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
 
 /* initialize TCG globals.  */
 void arm_translate_init(void)
@@ -101,7 +91,7 @@ void arm_translate_init(void)
 }
 
 /* Generate a label used for skipping this instruction */
-static void arm_gen_condlabel(DisasContext *s)
+void arm_gen_condlabel(DisasContext *s)
 {
     if (!s->condjmp) {
         s->condlabel = gen_new_label();
@@ -109,30 +99,6 @@ static void arm_gen_condlabel(DisasContext *s)
     }
 }
 
-/*
- * Constant expanders for the decoders.
- */
-
-static int negate(DisasContext *s, int x)
-{
-    return -x;
-}
-
-static int plus_2(DisasContext *s, int x)
-{
-    return x + 2;
-}
-
-static int times_2(DisasContext *s, int x)
-{
-    return x * 2;
-}
-
-static int times_4(DisasContext *s, int x)
-{
-    return x * 4;
-}
-
 /* Flags for the disas_set_da_iss info argument:
  * lower bits hold the Rt register number, higher bits are flags.
  */
@@ -211,24 +177,6 @@ static inline int get_a32_user_mem_index(DisasContext *s)
     }
 }
 
-static inline TCGv_i32 load_cpu_offset(int offset)
-{
-    TCGv_i32 tmp = tcg_temp_new_i32();
-    tcg_gen_ld_i32(tmp, cpu_env, offset);
-    return tmp;
-}
-
-#define load_cpu_field(name) load_cpu_offset(offsetof(CPUARMState, name))
-
-static inline void store_cpu_offset(TCGv_i32 var, int offset)
-{
-    tcg_gen_st_i32(var, cpu_env, offset);
-    tcg_temp_free_i32(var);
-}
-
-#define store_cpu_field(var, name) \
-    store_cpu_offset(var, offsetof(CPUARMState, name))
-
 /* The architectural value of PC.  */
 static uint32_t read_pc(DisasContext *s)
 {
@@ -236,7 +184,7 @@ static uint32_t read_pc(DisasContext *s)
 }
 
 /* Set a variable to the value of a CPU register.  */
-static void load_reg_var(DisasContext *s, TCGv_i32 var, int reg)
+void load_reg_var(DisasContext *s, TCGv_i32 var, int reg)
 {
     if (reg == 15) {
         tcg_gen_movi_i32(var, read_pc(s));
@@ -245,20 +193,12 @@ static void load_reg_var(DisasContext *s, TCGv_i32 var, int reg)
     }
 }
 
-/* Create a new temporary and set it to the value of a CPU register.  */
-static inline TCGv_i32 load_reg(DisasContext *s, int reg)
-{
-    TCGv_i32 tmp = tcg_temp_new_i32();
-    load_reg_var(s, tmp, reg);
-    return tmp;
-}
-
 /*
  * Create a new temp, REG + OFS, except PC is ALIGN(PC, 4).
  * This is used for load/store for which use of PC implies (literal),
  * or ADD that implies ADR.
  */
-static TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs)
+TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs)
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
 
@@ -272,7 +212,7 @@ static TCGv_i32 add_reg_for_lit(DisasContext *s, int reg, int ofs)
 
 /* Set a CPU register.  The source must be a temporary and will be
    marked as dead.  */
-static void store_reg(DisasContext *s, int reg, TCGv_i32 var)
+void store_reg(DisasContext *s, int reg, TCGv_i32 var)
 {
     if (reg == 15) {
         /* In Thumb mode, we must ignore bit 0.
@@ -313,15 +253,12 @@ static void store_sp_checked(DisasContext *s, TCGv_i32 var)
 #define gen_sxtb16(var) gen_helper_sxtb16(var, var)
 #define gen_uxtb16(var) gen_helper_uxtb16(var, var)
 
-
-static inline void gen_set_cpsr(TCGv_i32 var, uint32_t mask)
+void gen_set_cpsr(TCGv_i32 var, uint32_t mask)
 {
     TCGv_i32 tmp_mask = tcg_const_i32(mask);
     gen_helper_cpsr_write(cpu_env, var, tmp_mask);
     tcg_temp_free_i32(tmp_mask);
 }
-/* Set NZCV flags from the high 4 bits of var.  */
-#define gen_set_nzcv(var) gen_set_cpsr(var, CPSR_NZCV)
 
 static void gen_exception_internal(int excp)
 {
@@ -388,7 +325,7 @@ static void gen_smul_dual(TCGv_i32 a, TCGv_i32 b)
 }
 
 /* Byteswap each halfword.  */
-static void gen_rev16(TCGv_i32 dest, TCGv_i32 var)
+void gen_rev16(TCGv_i32 dest, TCGv_i32 var)
 {
     TCGv_i32 tmp = tcg_temp_new_i32();
     TCGv_i32 mask = tcg_const_i32(0x00ff00ff);
@@ -409,12 +346,6 @@ static void gen_revsh(TCGv_i32 dest, TCGv_i32 var)
     tcg_gen_ext16s_i32(dest, var);
 }
 
-/* Swap low and high halfwords.  */
-static void gen_swap_half(TCGv_i32 dest, TCGv_i32 var)
-{
-    tcg_gen_rotri_i32(dest, var, 16);
-}
-
 /* Dual 16-bit add.  Result placed in t0 and t1 is marked as dead.
     tmp = (t0 ^ t1) & 0x8000;
     t0 &= ~0x8000;
@@ -746,7 +677,7 @@ void arm_gen_test_cc(int cc, TCGLabel *label)
     arm_free_cc(&cmp);
 }
 
-static inline void gen_set_condexec(DisasContext *s)
+void gen_set_condexec(DisasContext *s)
 {
     if (s->condexec_mask) {
         uint32_t val = (s->condexec_cond << 4) | (s->condexec_mask >> 1);
@@ -756,7 +687,7 @@ static inline void gen_set_condexec(DisasContext *s)
     }
 }
 
-static inline void gen_set_pc_im(DisasContext *s, target_ulong val)
+void gen_set_pc_im(DisasContext *s, target_ulong val)
 {
     tcg_gen_movi_i32(cpu_R[15], val);
 }
@@ -948,24 +879,24 @@ static TCGv gen_aa32_addr(DisasContext *s, TCGv_i32 a32, MemOp op)
  * Internal routines are used for NEON cases where the endianness
  * and/or alignment has already been taken into account and manipulated.
  */
-static void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val,
-                                     TCGv_i32 a32, int index, MemOp opc)
+void gen_aa32_ld_internal_i32(DisasContext *s, TCGv_i32 val,
+                              TCGv_i32 a32, int index, MemOp opc)
 {
     TCGv addr = gen_aa32_addr(s, a32, opc);
     tcg_gen_qemu_ld_i32(val, addr, index, opc);
     tcg_temp_free(addr);
 }
 
-static void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val,
-                                     TCGv_i32 a32, int index, MemOp opc)
+void gen_aa32_st_internal_i32(DisasContext *s, TCGv_i32 val,
+                              TCGv_i32 a32, int index, MemOp opc)
 {
     TCGv addr = gen_aa32_addr(s, a32, opc);
     tcg_gen_qemu_st_i32(val, addr, index, opc);
     tcg_temp_free(addr);
 }
 
-static void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val,
-                                     TCGv_i32 a32, int index, MemOp opc)
+void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val,
+                              TCGv_i32 a32, int index, MemOp opc)
 {
     TCGv addr = gen_aa32_addr(s, a32, opc);
 
@@ -978,8 +909,8 @@ static void gen_aa32_ld_internal_i64(DisasContext *s, TCGv_i64 val,
     tcg_temp_free(addr);
 }
 
-static void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val,
-                                     TCGv_i32 a32, int index, MemOp opc)
+void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val,
+                              TCGv_i32 a32, int index, MemOp opc)
 {
     TCGv addr = gen_aa32_addr(s, a32, opc);
 
@@ -995,26 +926,26 @@ static void gen_aa32_st_internal_i64(DisasContext *s, TCGv_i64 val,
     tcg_temp_free(addr);
 }
 
-static void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
-                            int index, MemOp opc)
+void gen_aa32_ld_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+                     int index, MemOp opc)
 {
     gen_aa32_ld_internal_i32(s, val, a32, index, finalize_memop(s, opc));
 }
 
-static void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
-                            int index, MemOp opc)
+void gen_aa32_st_i32(DisasContext *s, TCGv_i32 val, TCGv_i32 a32,
+                     int index, MemOp opc)
 {
     gen_aa32_st_internal_i32(s, val, a32, index, finalize_memop(s, opc));
 }
 
-static void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
-                            int index, MemOp opc)
+void gen_aa32_ld_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+                     int index, MemOp opc)
 {
     gen_aa32_ld_internal_i64(s, val, a32, index, finalize_memop(s, opc));
 }
 
-static void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
-                            int index, MemOp opc)
+void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
+                     int index, MemOp opc)
 {
     gen_aa32_st_internal_i64(s, val, a32, index, finalize_memop(s, opc));
 }
@@ -1033,25 +964,6 @@ static void gen_aa32_st_i64(DisasContext *s, TCGv_i64 val, TCGv_i32 a32,
         gen_aa32_st_i32(s, val, a32, index, OPC);                       \
     }
 
-static inline void gen_aa32_ld64(DisasContext *s, TCGv_i64 val,
-                                 TCGv_i32 a32, int index)
-{
-    gen_aa32_ld_i64(s, val, a32, index, MO_Q);
-}
-
-static inline void gen_aa32_st64(DisasContext *s, TCGv_i64 val,
-                                 TCGv_i32 a32, int index)
-{
-    gen_aa32_st_i64(s, val, a32, index, MO_Q);
-}
-
-DO_GEN_LD(8u, MO_UB)
-DO_GEN_LD(16u, MO_UW)
-DO_GEN_LD(32u, MO_UL)
-DO_GEN_ST(8, MO_UB)
-DO_GEN_ST(16, MO_UW)
-DO_GEN_ST(32, MO_UL)
-
 static inline void gen_hvc(DisasContext *s, int imm16)
 {
     /* The pre HVC helper handles cases when HVC gets trapped
@@ -1093,11 +1005,15 @@ static void gen_exception_internal_insn(DisasContext *s, uint32_t pc, int excp)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void gen_exception_insn(DisasContext *s, uint32_t pc, int excp,
-                               int syn, uint32_t target_el)
+void gen_exception_insn(DisasContext *s, uint64_t pc, int excp,
+                        uint32_t syn, uint32_t target_el)
 {
-    gen_set_condexec(s);
-    gen_set_pc_im(s, pc);
+    if (s->aarch64) {
+        gen_a64_set_pc_im(pc);
+    } else {
+        gen_set_condexec(s);
+        gen_set_pc_im(s, pc);
+    }
     gen_exception(excp, syn, target_el);
     s->base.is_jmp = DISAS_NORETURN;
 }
@@ -1114,7 +1030,7 @@ static void gen_exception_bkpt_insn(DisasContext *s, uint32_t syn)
     s->base.is_jmp = DISAS_NORETURN;
 }
 
-static void unallocated_encoding(DisasContext *s)
+void unallocated_encoding(DisasContext *s)
 {
     /* Unallocated and reserved encodings are uncategorized */
     gen_exception_insn(s, s->pc_curr, EXCP_UDEF, syn_uncategorized(),
@@ -1138,7 +1054,7 @@ static void gen_exception_el(DisasContext *s, int excp, uint32_t syn,
 }
 
 /* Force a TB lookup after an instruction that changes the CPU state.  */
-static inline void gen_lookup_tb(DisasContext *s)
+void gen_lookup_tb(DisasContext *s)
 {
     tcg_gen_movi_i32(cpu_R[15], s->base.pc_next);
     s->base.is_jmp = DISAS_EXIT;
@@ -1173,7 +1089,7 @@ static inline void gen_hlt(DisasContext *s, int imm)
 /*
  * Return the offset of a "full" NEON Dreg.
  */
-static long neon_full_reg_offset(unsigned reg)
+long neon_full_reg_offset(unsigned reg)
 {
     return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]);
 }
@@ -1182,7 +1098,7 @@ static long neon_full_reg_offset(unsigned reg)
  * Return the offset of a 2**SIZE piece of a NEON register, at index ELE,
  * where 0 is the least significant end of the register.
  */
-static long neon_element_offset(int reg, int element, MemOp memop)
+long neon_element_offset(int reg, int element, MemOp memop)
 {
     int element_size = 1 << (memop & MO_SIZE);
     int ofs = element * element_size;
@@ -1199,7 +1115,7 @@ static long neon_element_offset(int reg, int element, MemOp memop)
 }
 
 /* Return the offset of a VFP Dreg (dp = true) or VFP Sreg (dp = false). */
-static long vfp_reg_offset(bool dp, unsigned reg)
+long vfp_reg_offset(bool dp, unsigned reg)
 {
     if (dp) {
         return neon_element_offset(reg, 0, MO_64);
@@ -1208,27 +1124,7 @@ static long vfp_reg_offset(bool dp, unsigned reg)
     }
 }
 
-static inline void vfp_load_reg64(TCGv_i64 var, int reg)
-{
-    tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(true, reg));
-}
-
-static inline void vfp_store_reg64(TCGv_i64 var, int reg)
-{
-    tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(true, reg));
-}
-
-static inline void vfp_load_reg32(TCGv_i32 var, int reg)
-{
-    tcg_gen_ld_i32(var, cpu_env, vfp_reg_offset(false, reg));
-}
-
-static inline void vfp_store_reg32(TCGv_i32 var, int reg)
-{
-    tcg_gen_st_i32(var, cpu_env, vfp_reg_offset(false, reg));
-}
-
-static void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
+void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
 {
     long off = neon_element_offset(reg, ele, memop);
 
@@ -1254,7 +1150,7 @@ static void read_neon_element32(TCGv_i32 dest, int reg, int ele, MemOp memop)
     }
 }
 
-static void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop)
+void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop)
 {
     long off = neon_element_offset(reg, ele, memop);
 
@@ -1273,7 +1169,7 @@ static void read_neon_element64(TCGv_i64 dest, int reg, int ele, MemOp memop)
     }
 }
 
-static void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop)
+void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop)
 {
     long off = neon_element_offset(reg, ele, memop);
 
@@ -1292,7 +1188,7 @@ static void write_neon_element32(TCGv_i32 src, int reg, int ele, MemOp memop)
     }
 }
 
-static void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop)
+void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop)
 {
     long off = neon_element_offset(reg, ele, memop);
 
@@ -1308,20 +1204,8 @@ static void write_neon_element64(TCGv_i64 src, int reg, int ele, MemOp memop)
     }
 }
 
-static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
-{
-    TCGv_ptr ret = tcg_temp_new_ptr();
-    tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
-    return ret;
-}
-
 #define ARM_CP_RW_BIT   (1 << 20)
 
-/* Include the VFP and Neon decoders */
-#include "decode-m-nocp.c.inc"
-#include "translate-vfp.c.inc"
-#include "translate-neon.c.inc"
-
 static inline void iwmmxt_load_reg(TCGv_i64 var, int reg)
 {
     tcg_gen_ld_i64(var, cpu_env, offsetof(CPUARMState, iwmmxt.regs[reg]));
diff --git a/target/arm/translate.h b/target/arm/translate.h
index ccf60c96d8..12c28b0d32 100644
--- a/target/arm/translate.h
+++ b/target/arm/translate.h
@@ -118,6 +118,30 @@ extern TCGv_i32 cpu_NF, cpu_ZF, cpu_CF, cpu_VF;
 extern TCGv_i64 cpu_exclusive_addr;
 extern TCGv_i64 cpu_exclusive_val;
 
+/*
+ * Constant expanders for the decoders.
+ */
+
+static inline int negate(DisasContext *s, int x)
+{
+    return -x;
+}
+
+static inline int plus_2(DisasContext *s, int x)
+{
+    return x + 2;
+}
+
+static inline int times_2(DisasContext *s, int x)
+{
+    return x * 2;
+}
+
+static inline int times_4(DisasContext *s, int x)
+{
+    return x * 4;
+}
+
 static inline int arm_dc_feature(DisasContext *dc, int feature)
 {
     return (dc->features & (1ULL << feature)) != 0;
@@ -205,6 +229,9 @@ void arm_free_cc(DisasCompare *cmp);
 void arm_jump_cc(DisasCompare *cmp, TCGLabel *label);
 void arm_gen_test_cc(int cc, TCGLabel *label);
 MemOp pow2_align(unsigned i);
+void unallocated_encoding(DisasContext *s);
+void gen_exception_insn(DisasContext *s, uint64_t pc, int excp,
+                        uint32_t syn, uint32_t target_el);
 
 /* Return state of Alternate Half-precision flag, caller frees result */
 static inline TCGv_i32 get_ahp_flag(void)
@@ -382,6 +409,8 @@ typedef void NeonGenOneOpFn(TCGv_i32, TCGv_i32);
 typedef void NeonGenOneOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32);
 typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32);
 typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32);
+typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
+                                 TCGv_i32, TCGv_i32);
 typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
 typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
 typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 618ad1c409..8c180b5b2b 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -495,7 +495,7 @@ void cpu_report_tpr_access(CPUX86State *env, TPRAccess access)
     X86CPU *cpu = env_archcpu(env);
     CPUState *cs = env_cpu(env);
 
-    if (kvm_enabled() || whpx_enabled()) {
+    if (kvm_enabled() || whpx_enabled() || nvmm_enabled()) {
         env->tpr_access_type = access;
 
         cpu_interrupt(cs, CPU_INTERRUPT_TPR);
diff --git a/target/i386/meson.build b/target/i386/meson.build
index c4bf20b319..b0c04f3d89 100644
--- a/target/i386/meson.build
+++ b/target/i386/meson.build
@@ -19,6 +19,7 @@ i386_softmmu_ss.add(files(
 subdir('kvm')
 subdir('hax')
 subdir('whpx')
+subdir('nvmm')
 subdir('hvf')
 subdir('tcg')
 
diff --git a/target/i386/nvmm/meson.build b/target/i386/nvmm/meson.build
new file mode 100644
index 0000000000..733e334083
--- /dev/null
+++ b/target/i386/nvmm/meson.build
@@ -0,0 +1,8 @@
+i386_softmmu_ss.add(when: 'CONFIG_NVMM', if_true:
+  files(
+  'nvmm-all.c',
+  'nvmm-accel-ops.c',
+  )
+)
+
+i386_softmmu_ss.add(when: 'CONFIG_NVMM', if_true: nvmm)
diff --git a/target/i386/nvmm/nvmm-accel-ops.c b/target/i386/nvmm/nvmm-accel-ops.c
new file mode 100644
index 0000000000..f788f75289
--- /dev/null
+++ b/target/i386/nvmm/nvmm-accel-ops.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
+ *
+ * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "sysemu/kvm_int.h"
+#include "qemu/main-loop.h"
+#include "sysemu/cpus.h"
+#include "qemu/guest-random.h"
+
+#include "sysemu/nvmm.h"
+#include "nvmm-accel-ops.h"
+
+static void *qemu_nvmm_cpu_thread_fn(void *arg)
+{
+    CPUState *cpu = arg;
+    int r;
+
+    assert(nvmm_enabled());
+
+    rcu_register_thread();
+
+    qemu_mutex_lock_iothread();
+    qemu_thread_get_self(cpu->thread);
+    cpu->thread_id = qemu_get_thread_id();
+    current_cpu = cpu;
+
+    r = nvmm_init_vcpu(cpu);
+    if (r < 0) {
+        fprintf(stderr, "nvmm_init_vcpu failed: %s\n", strerror(-r));
+        exit(1);
+    }
+
+    /* signal CPU creation */
+    cpu_thread_signal_created(cpu);
+    qemu_guest_random_seed_thread_part2(cpu->random_seed);
+
+    do {
+        if (cpu_can_run(cpu)) {
+            r = nvmm_vcpu_exec(cpu);
+            if (r == EXCP_DEBUG) {
+                cpu_handle_guest_debug(cpu);
+            }
+        }
+        while (cpu_thread_is_idle(cpu)) {
+            qemu_cond_wait_iothread(cpu->halt_cond);
+        }
+        qemu_wait_io_event_common(cpu);
+    } while (!cpu->unplug || cpu_can_run(cpu));
+
+    nvmm_destroy_vcpu(cpu);
+    cpu_thread_signal_destroyed(cpu);
+    qemu_mutex_unlock_iothread();
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void nvmm_start_vcpu_thread(CPUState *cpu)
+{
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+
+    cpu->thread = g_malloc0(sizeof(QemuThread));
+    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+    qemu_cond_init(cpu->halt_cond);
+    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/NVMM",
+             cpu->cpu_index);
+    qemu_thread_create(cpu->thread, thread_name, qemu_nvmm_cpu_thread_fn,
+                       cpu, QEMU_THREAD_JOINABLE);
+}
+
+/*
+ * Abort the call to run the virtual processor by another thread, and to
+ * return the control to that thread.
+ */
+static void nvmm_kick_vcpu_thread(CPUState *cpu)
+{
+    cpu->exit_request = 1;
+    cpus_kick_thread(cpu);
+}
+
+static void nvmm_accel_ops_class_init(ObjectClass *oc, void *data)
+{
+    AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
+
+    ops->create_vcpu_thread = nvmm_start_vcpu_thread;
+    ops->kick_vcpu_thread = nvmm_kick_vcpu_thread;
+
+    ops->synchronize_post_reset = nvmm_cpu_synchronize_post_reset;
+    ops->synchronize_post_init = nvmm_cpu_synchronize_post_init;
+    ops->synchronize_state = nvmm_cpu_synchronize_state;
+    ops->synchronize_pre_loadvm = nvmm_cpu_synchronize_pre_loadvm;
+}
+
+static const TypeInfo nvmm_accel_ops_type = {
+    .name = ACCEL_OPS_NAME("nvmm"),
+
+    .parent = TYPE_ACCEL_OPS,
+    .class_init = nvmm_accel_ops_class_init,
+    .abstract = true,
+};
+
+static void nvmm_accel_ops_register_types(void)
+{
+    type_register_static(&nvmm_accel_ops_type);
+}
+type_init(nvmm_accel_ops_register_types);
diff --git a/target/i386/nvmm/nvmm-accel-ops.h b/target/i386/nvmm/nvmm-accel-ops.h
new file mode 100644
index 0000000000..43e24adcaf
--- /dev/null
+++ b/target/i386/nvmm/nvmm-accel-ops.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
+ *
+ * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef NVMM_CPUS_H
+#define NVMM_CPUS_H
+
+#include "sysemu/cpus.h"
+
+int nvmm_init_vcpu(CPUState *cpu);
+int nvmm_vcpu_exec(CPUState *cpu);
+void nvmm_destroy_vcpu(CPUState *cpu);
+
+void nvmm_cpu_synchronize_state(CPUState *cpu);
+void nvmm_cpu_synchronize_post_reset(CPUState *cpu);
+void nvmm_cpu_synchronize_post_init(CPUState *cpu);
+void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu);
+
+#endif /* NVMM_CPUS_H */
diff --git a/target/i386/nvmm/nvmm-all.c b/target/i386/nvmm/nvmm-all.c
new file mode 100644
index 0000000000..dfa690d65d
--- /dev/null
+++ b/target/i386/nvmm/nvmm-all.c
@@ -0,0 +1,1226 @@
+/*
+ * Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
+ *
+ * NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/address-spaces.h"
+#include "exec/ioport.h"
+#include "qemu-common.h"
+#include "qemu/accel.h"
+#include "sysemu/nvmm.h"
+#include "sysemu/cpus.h"
+#include "sysemu/runstate.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/queue.h"
+#include "migration/blocker.h"
+#include "strings.h"
+
+#include "nvmm-accel-ops.h"
+
+#include <nvmm.h>
+
+struct qemu_vcpu {
+    struct nvmm_vcpu vcpu;
+    uint8_t tpr;
+    bool stop;
+
+    /* Window-exiting for INTs/NMIs. */
+    bool int_window_exit;
+    bool nmi_window_exit;
+
+    /* The guest is in an interrupt shadow (POP SS, etc). */
+    bool int_shadow;
+};
+
+struct qemu_machine {
+    struct nvmm_capability cap;
+    struct nvmm_machine mach;
+};
+
+/* -------------------------------------------------------------------------- */
+
+static bool nvmm_allowed;
+static struct qemu_machine qemu_mach;
+
+static struct qemu_vcpu *
+get_qemu_vcpu(CPUState *cpu)
+{
+    return (struct qemu_vcpu *)cpu->hax_vcpu;
+}
+
+static struct nvmm_machine *
+get_nvmm_mach(void)
+{
+    return &qemu_mach.mach;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg)
+{
+    uint32_t attrib = qseg->flags;
+
+    nseg->selector = qseg->selector;
+    nseg->limit = qseg->limit;
+    nseg->base = qseg->base;
+    nseg->attrib.type = __SHIFTOUT(attrib, DESC_TYPE_MASK);
+    nseg->attrib.s = __SHIFTOUT(attrib, DESC_S_MASK);
+    nseg->attrib.dpl = __SHIFTOUT(attrib, DESC_DPL_MASK);
+    nseg->attrib.p = __SHIFTOUT(attrib, DESC_P_MASK);
+    nseg->attrib.avl = __SHIFTOUT(attrib, DESC_AVL_MASK);
+    nseg->attrib.l = __SHIFTOUT(attrib, DESC_L_MASK);
+    nseg->attrib.def = __SHIFTOUT(attrib, DESC_B_MASK);
+    nseg->attrib.g = __SHIFTOUT(attrib, DESC_G_MASK);
+}
+
+static void
+nvmm_set_registers(CPUState *cpu)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    struct nvmm_x64_state *state = vcpu->state;
+    uint64_t bitmap;
+    size_t i;
+    int ret;
+
+    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
+
+    /* GPRs. */
+    state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX];
+    state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX];
+    state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX];
+    state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX];
+    state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP];
+    state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP];
+    state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI];
+    state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI];
+#ifdef TARGET_X86_64
+    state->gprs[NVMM_X64_GPR_R8]  = env->regs[R_R8];
+    state->gprs[NVMM_X64_GPR_R9]  = env->regs[R_R9];
+    state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10];
+    state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11];
+    state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12];
+    state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13];
+    state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14];
+    state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15];
+#endif
+
+    /* RIP and RFLAGS. */
+    state->gprs[NVMM_X64_GPR_RIP] = env->eip;
+    state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags;
+
+    /* Segments. */
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]);
+
+    /* Special segments. */
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr);
+    nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt);
+
+    /* Control registers. */
+    state->crs[NVMM_X64_CR_CR0] = env->cr[0];
+    state->crs[NVMM_X64_CR_CR2] = env->cr[2];
+    state->crs[NVMM_X64_CR_CR3] = env->cr[3];
+    state->crs[NVMM_X64_CR_CR4] = env->cr[4];
+    state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
+    state->crs[NVMM_X64_CR_XCR0] = env->xcr0;
+
+    /* Debug registers. */
+    state->drs[NVMM_X64_DR_DR0] = env->dr[0];
+    state->drs[NVMM_X64_DR_DR1] = env->dr[1];
+    state->drs[NVMM_X64_DR_DR2] = env->dr[2];
+    state->drs[NVMM_X64_DR_DR3] = env->dr[3];
+    state->drs[NVMM_X64_DR_DR6] = env->dr[6];
+    state->drs[NVMM_X64_DR_DR7] = env->dr[7];
+
+    /* FPU. */
+    state->fpu.fx_cw = env->fpuc;
+    state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11);
+    state->fpu.fx_tw = 0;
+    for (i = 0; i < 8; i++) {
+        state->fpu.fx_tw |= (!env->fptags[i]) << i;
+    }
+    state->fpu.fx_opcode = env->fpop;
+    state->fpu.fx_ip.fa_64 = env->fpip;
+    state->fpu.fx_dp.fa_64 = env->fpdp;
+    state->fpu.fx_mxcsr = env->mxcsr;
+    state->fpu.fx_mxcsr_mask = 0x0000FFFF;
+    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
+    memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs));
+    for (i = 0; i < CPU_NB_REGS; i++) {
+        memcpy(&state->fpu.fx_xmm[i].xmm_bytes[0],
+            &env->xmm_regs[i].ZMM_Q(0), 8);
+        memcpy(&state->fpu.fx_xmm[i].xmm_bytes[8],
+            &env->xmm_regs[i].ZMM_Q(1), 8);
+    }
+
+    /* MSRs. */
+    state->msrs[NVMM_X64_MSR_EFER] = env->efer;
+    state->msrs[NVMM_X64_MSR_STAR] = env->star;
+#ifdef TARGET_X86_64
+    state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar;
+    state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar;
+    state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask;
+    state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase;
+#endif
+    state->msrs[NVMM_X64_MSR_SYSENTER_CS]  = env->sysenter_cs;
+    state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp;
+    state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip;
+    state->msrs[NVMM_X64_MSR_PAT] = env->pat;
+    state->msrs[NVMM_X64_MSR_TSC] = env->tsc;
+
+    bitmap =
+        NVMM_X64_STATE_SEGS |
+        NVMM_X64_STATE_GPRS |
+        NVMM_X64_STATE_CRS  |
+        NVMM_X64_STATE_DRS  |
+        NVMM_X64_STATE_MSRS |
+        NVMM_X64_STATE_FPU;
+
+    ret = nvmm_vcpu_setstate(mach, vcpu, bitmap);
+    if (ret == -1) {
+        error_report("NVMM: Failed to set virtual processor context,"
+            " error=%d", errno);
+    }
+}
+
+static void
+nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg)
+{
+    qseg->selector = nseg->selector;
+    qseg->limit = nseg->limit;
+    qseg->base = nseg->base;
+
+    qseg->flags =
+        __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK) |
+        __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK);
+}
+
+static void
+nvmm_get_registers(CPUState *cpu)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    struct nvmm_x64_state *state = vcpu->state;
+    uint64_t bitmap, tpr;
+    size_t i;
+    int ret;
+
+    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
+
+    bitmap =
+        NVMM_X64_STATE_SEGS |
+        NVMM_X64_STATE_GPRS |
+        NVMM_X64_STATE_CRS  |
+        NVMM_X64_STATE_DRS  |
+        NVMM_X64_STATE_MSRS |
+        NVMM_X64_STATE_FPU;
+
+    ret = nvmm_vcpu_getstate(mach, vcpu, bitmap);
+    if (ret == -1) {
+        error_report("NVMM: Failed to get virtual processor context,"
+            " error=%d", errno);
+    }
+
+    /* GPRs. */
+    env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX];
+    env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX];
+    env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX];
+    env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX];
+    env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP];
+    env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP];
+    env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI];
+    env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI];
+#ifdef TARGET_X86_64
+    env->regs[R_R8]  = state->gprs[NVMM_X64_GPR_R8];
+    env->regs[R_R9]  = state->gprs[NVMM_X64_GPR_R9];
+    env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10];
+    env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11];
+    env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12];
+    env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13];
+    env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14];
+    env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15];
+#endif
+
+    /* RIP and RFLAGS. */
+    env->eip = state->gprs[NVMM_X64_GPR_RIP];
+    env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS];
+
+    /* Segments. */
+    nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]);
+    nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]);
+    nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]);
+    nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]);
+    nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]);
+    nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]);
+
+    /* Special segments. */
+    nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]);
+    nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]);
+    nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]);
+    nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]);
+
+    /* Control registers. */
+    env->cr[0] = state->crs[NVMM_X64_CR_CR0];
+    env->cr[2] = state->crs[NVMM_X64_CR_CR2];
+    env->cr[3] = state->crs[NVMM_X64_CR_CR3];
+    env->cr[4] = state->crs[NVMM_X64_CR_CR4];
+    tpr = state->crs[NVMM_X64_CR_CR8];
+    if (tpr != qcpu->tpr) {
+        qcpu->tpr = tpr;
+        cpu_set_apic_tpr(x86_cpu->apic_state, tpr);
+    }
+    env->xcr0 = state->crs[NVMM_X64_CR_XCR0];
+
+    /* Debug registers. */
+    env->dr[0] = state->drs[NVMM_X64_DR_DR0];
+    env->dr[1] = state->drs[NVMM_X64_DR_DR1];
+    env->dr[2] = state->drs[NVMM_X64_DR_DR2];
+    env->dr[3] = state->drs[NVMM_X64_DR_DR3];
+    env->dr[6] = state->drs[NVMM_X64_DR_DR6];
+    env->dr[7] = state->drs[NVMM_X64_DR_DR7];
+
+    /* FPU. */
+    env->fpuc = state->fpu.fx_cw;
+    env->fpstt = (state->fpu.fx_sw >> 11) & 0x7;
+    env->fpus = state->fpu.fx_sw & ~0x3800;
+    for (i = 0; i < 8; i++) {
+        env->fptags[i] = !((state->fpu.fx_tw >> i) & 1);
+    }
+    env->fpop = state->fpu.fx_opcode;
+    env->fpip = state->fpu.fx_ip.fa_64;
+    env->fpdp = state->fpu.fx_dp.fa_64;
+    env->mxcsr = state->fpu.fx_mxcsr;
+    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
+    memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs));
+    for (i = 0; i < CPU_NB_REGS; i++) {
+        memcpy(&env->xmm_regs[i].ZMM_Q(0),
+            &state->fpu.fx_xmm[i].xmm_bytes[0], 8);
+        memcpy(&env->xmm_regs[i].ZMM_Q(1),
+            &state->fpu.fx_xmm[i].xmm_bytes[8], 8);
+    }
+
+    /* MSRs. */
+    env->efer = state->msrs[NVMM_X64_MSR_EFER];
+    env->star = state->msrs[NVMM_X64_MSR_STAR];
+#ifdef TARGET_X86_64
+    env->lstar = state->msrs[NVMM_X64_MSR_LSTAR];
+    env->cstar = state->msrs[NVMM_X64_MSR_CSTAR];
+    env->fmask = state->msrs[NVMM_X64_MSR_SFMASK];
+    env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE];
+#endif
+    env->sysenter_cs  = state->msrs[NVMM_X64_MSR_SYSENTER_CS];
+    env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
+    env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
+    env->pat = state->msrs[NVMM_X64_MSR_PAT];
+    env->tsc = state->msrs[NVMM_X64_MSR_TSC];
+
+    x86_update_hflags(env);
+}
+
+static bool
+nvmm_can_take_int(CPUState *cpu)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    struct nvmm_machine *mach = get_nvmm_mach();
+
+    if (qcpu->int_window_exit) {
+        return false;
+    }
+
+    if (qcpu->int_shadow || !(env->eflags & IF_MASK)) {
+        struct nvmm_x64_state *state = vcpu->state;
+
+        /* Exit on interrupt window. */
+        nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR);
+        state->intr.int_window_exiting = 1;
+        nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR);
+
+        return false;
+    }
+
+    return true;
+}
+
+static bool
+nvmm_can_take_nmi(CPUState *cpu)
+{
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+
+    /*
+     * Contrary to INTs, NMIs always schedule an exit when they are
+     * completed. Therefore, if window-exiting is enabled, it means
+     * NMIs are blocked.
+     */
+    if (qcpu->nmi_window_exit) {
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Called before the VCPU is run. We inject events generated by the I/O
+ * thread, and synchronize the guest TPR.
+ */
+static void
+nvmm_vcpu_pre_run(CPUState *cpu)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    struct nvmm_x64_state *state = vcpu->state;
+    struct nvmm_vcpu_event *event = vcpu->event;
+    bool has_event = false;
+    bool sync_tpr = false;
+    uint8_t tpr;
+    int ret;
+
+    qemu_mutex_lock_iothread();
+
+    tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
+    if (tpr != qcpu->tpr) {
+        qcpu->tpr = tpr;
+        sync_tpr = true;
+    }
+
+    /*
+     * Force the VCPU out of its inner loop to process any INIT requests
+     * or commit pending TPR access.
+     */
+    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
+        cpu->exit_request = 1;
+    }
+
+    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+        if (nvmm_can_take_nmi(cpu)) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
+            event->type = NVMM_VCPU_EVENT_INTR;
+            event->vector = 2;
+            has_event = true;
+        }
+    }
+
+    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
+        if (nvmm_can_take_int(cpu)) {
+            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
+            event->type = NVMM_VCPU_EVENT_INTR;
+            event->vector = cpu_get_pic_interrupt(env);
+            has_event = true;
+        }
+    }
+
+    /* Don't want SMIs. */
+    if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
+        cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
+    }
+
+    if (sync_tpr) {
+        ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS);
+        if (ret == -1) {
+            error_report("NVMM: Failed to get CPU state,"
+                " error=%d", errno);
+        }
+
+        state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
+
+        ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS);
+        if (ret == -1) {
+            error_report("NVMM: Failed to set CPU state,"
+                " error=%d", errno);
+        }
+    }
+
+    if (has_event) {
+        ret = nvmm_vcpu_inject(mach, vcpu);
+        if (ret == -1) {
+            error_report("NVMM: Failed to inject event,"
+                " error=%d", errno);
+        }
+    }
+
+    qemu_mutex_unlock_iothread();
+}
+
+/*
+ * Called after the VCPU ran. We synchronize the host view of the TPR and
+ * RFLAGS.
+ */
+static void
+nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit)
+{
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    uint64_t tpr;
+
+    env->eflags = exit->exitstate.rflags;
+    qcpu->int_shadow = exit->exitstate.int_shadow;
+    qcpu->int_window_exit = exit->exitstate.int_window_exiting;
+    qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting;
+
+    tpr = exit->exitstate.cr8;
+    if (qcpu->tpr != tpr) {
+        qcpu->tpr = tpr;
+        qemu_mutex_lock_iothread();
+        cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr);
+        qemu_mutex_unlock_iothread();
+    }
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_io_callback(struct nvmm_io *io)
+{
+    MemTxAttrs attrs = { 0 };
+    int ret;
+
+    ret = address_space_rw(&address_space_io, io->port, attrs, io->data,
+        io->size, !io->in);
+    if (ret != MEMTX_OK) {
+        error_report("NVMM: I/O Transaction Failed "
+            "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"),
+            io->port, io->size);
+    }
+
+    /* Needed, otherwise infinite loop. */
+    current_cpu->vcpu_dirty = false;
+}
+
+static void
+nvmm_mem_callback(struct nvmm_mem *mem)
+{
+    cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write);
+
+    /* Needed, otherwise infinite loop. */
+    current_cpu->vcpu_dirty = false;
+}
+
+static struct nvmm_assist_callbacks nvmm_callbacks = {
+    .io = nvmm_io_callback,
+    .mem = nvmm_mem_callback
+};
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+    int ret;
+
+    ret = nvmm_assist_mem(mach, vcpu);
+    if (ret == -1) {
+        error_report("NVMM: Mem Assist Failed [gpa=%p]",
+            (void *)vcpu->exit->u.mem.gpa);
+    }
+
+    return ret;
+}
+
+static int
+nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+    int ret;
+
+    ret = nvmm_assist_io(mach, vcpu);
+    if (ret == -1) {
+        error_report("NVMM: I/O Assist Failed [port=%d]",
+            (int)vcpu->exit->u.io.port);
+    }
+
+    return ret;
+}
+
+static int
+nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu,
+    struct nvmm_vcpu_exit *exit)
+{
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    struct nvmm_x64_state *state = vcpu->state;
+    uint64_t val;
+    int ret;
+
+    switch (exit->u.rdmsr.msr) {
+    case MSR_IA32_APICBASE:
+        val = cpu_get_apic_base(x86_cpu->apic_state);
+        break;
+    case MSR_MTRRcap:
+    case MSR_MTRRdefType:
+    case MSR_MCG_CAP:
+    case MSR_MCG_STATUS:
+        val = 0;
+        break;
+    default: /* More MSRs to add? */
+        val = 0;
+        error_report("NVMM: Unexpected RDMSR 0x%x, ignored",
+            exit->u.rdmsr.msr);
+        break;
+    }
+
+    ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+    if (ret == -1) {
+        return -1;
+    }
+
+    state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);
+    state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+    state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc;
+
+    ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+    if (ret == -1) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static int
+nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu,
+    struct nvmm_vcpu_exit *exit)
+{
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    struct nvmm_x64_state *state = vcpu->state;
+    uint64_t val;
+    int ret;
+
+    val = exit->u.wrmsr.val;
+
+    switch (exit->u.wrmsr.msr) {
+    case MSR_IA32_APICBASE:
+        cpu_set_apic_base(x86_cpu->apic_state, val);
+        break;
+    case MSR_MTRRdefType:
+    case MSR_MCG_STATUS:
+        break;
+    default: /* More MSRs to add? */
+        error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%lx], ignored",
+            exit->u.wrmsr.msr, val);
+        break;
+    }
+
+    ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+    if (ret == -1) {
+        return -1;
+    }
+
+    state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc;
+
+    ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
+    if (ret == -1) {
+        return -1;
+    }
+
+    return 0;
+}
+
+static int
+nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu,
+    struct nvmm_vcpu_exit *exit)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    int ret = 0;
+
+    qemu_mutex_lock_iothread();
+
+    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+          (env->eflags & IF_MASK)) &&
+        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+        cpu->exception_index = EXCP_HLT;
+        cpu->halted = true;
+        ret = 1;
+    }
+
+    qemu_mutex_unlock_iothread();
+
+    return ret;
+}
+
+static int
+nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
+{
+    struct nvmm_vcpu_event *event = vcpu->event;
+
+    event->type = NVMM_VCPU_EVENT_EXCP;
+    event->vector = 6;
+    event->u.excp.error = 0;
+
+    return nvmm_vcpu_inject(mach, vcpu);
+}
+
+static int
+nvmm_vcpu_loop(CPUState *cpu)
+{
+    struct CPUX86State *env = (CPUArchState *)cpu->env_ptr;
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    struct nvmm_vcpu_exit *exit = vcpu->exit;
+    int ret;
+
+    /*
+     * Some asynchronous events must be handled outside of the inner
+     * VCPU loop. They are handled here.
+     */
+    if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
+        nvmm_cpu_synchronize_state(cpu);
+        do_cpu_init(x86_cpu);
+        /* set int/nmi windows back to the reset state */
+    }
+    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
+        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
+        apic_poll_irq(x86_cpu->apic_state);
+    }
+    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
+         (env->eflags & IF_MASK)) ||
+        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
+        cpu->halted = false;
+    }
+    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
+        nvmm_cpu_synchronize_state(cpu);
+        do_cpu_sipi(x86_cpu);
+    }
+    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
+        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
+        nvmm_cpu_synchronize_state(cpu);
+        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
+            env->tpr_access_type);
+    }
+
+    if (cpu->halted) {
+        cpu->exception_index = EXCP_HLT;
+        qatomic_set(&cpu->exit_request, false);
+        return 0;
+    }
+
+    qemu_mutex_unlock_iothread();
+    cpu_exec_start(cpu);
+
+    /*
+     * Inner VCPU loop.
+     */
+    do {
+        if (cpu->vcpu_dirty) {
+            nvmm_set_registers(cpu);
+            cpu->vcpu_dirty = false;
+        }
+
+        if (qcpu->stop) {
+            cpu->exception_index = EXCP_INTERRUPT;
+            qcpu->stop = false;
+            ret = 1;
+            break;
+        }
+
+        nvmm_vcpu_pre_run(cpu);
+
+        if (qatomic_read(&cpu->exit_request)) {
+            nvmm_vcpu_stop(vcpu);
+        }
+
+        /* Read exit_request before the kernel reads the immediate exit flag */
+        smp_rmb();
+        ret = nvmm_vcpu_run(mach, vcpu);
+        if (ret == -1) {
+            error_report("NVMM: Failed to exec a virtual processor,"
+                " error=%d", errno);
+            break;
+        }
+
+        nvmm_vcpu_post_run(cpu, exit);
+
+        switch (exit->reason) {
+        case NVMM_VCPU_EXIT_NONE:
+            break;
+        case NVMM_VCPU_EXIT_STOPPED:
+            /*
+             * The kernel cleared the immediate exit flag; cpu->exit_request
+             * must be cleared after
+             */
+            smp_wmb();
+            qcpu->stop = true;
+            break;
+        case NVMM_VCPU_EXIT_MEMORY:
+            ret = nvmm_handle_mem(mach, vcpu);
+            break;
+        case NVMM_VCPU_EXIT_IO:
+            ret = nvmm_handle_io(mach, vcpu);
+            break;
+        case NVMM_VCPU_EXIT_INT_READY:
+        case NVMM_VCPU_EXIT_NMI_READY:
+        case NVMM_VCPU_EXIT_TPR_CHANGED:
+            break;
+        case NVMM_VCPU_EXIT_HALTED:
+            ret = nvmm_handle_halted(mach, cpu, exit);
+            break;
+        case NVMM_VCPU_EXIT_SHUTDOWN:
+            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
+            cpu->exception_index = EXCP_INTERRUPT;
+            ret = 1;
+            break;
+        case NVMM_VCPU_EXIT_RDMSR:
+            ret = nvmm_handle_rdmsr(mach, cpu, exit);
+            break;
+        case NVMM_VCPU_EXIT_WRMSR:
+            ret = nvmm_handle_wrmsr(mach, cpu, exit);
+            break;
+        case NVMM_VCPU_EXIT_MONITOR:
+        case NVMM_VCPU_EXIT_MWAIT:
+            ret = nvmm_inject_ud(mach, vcpu);
+            break;
+        default:
+            error_report("NVMM: Unexpected VM exit code 0x%lx [hw=0x%lx]",
+                exit->reason, exit->u.inv.hwcode);
+            nvmm_get_registers(cpu);
+            qemu_mutex_lock_iothread();
+            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
+            qemu_mutex_unlock_iothread();
+            ret = -1;
+            break;
+        }
+    } while (ret == 0);
+
+    cpu_exec_end(cpu);
+    qemu_mutex_lock_iothread();
+
+    qatomic_set(&cpu->exit_request, false);
+
+    return ret < 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
+{
+    nvmm_get_registers(cpu);
+    cpu->vcpu_dirty = true;
+}
+
+static void
+do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
+{
+    nvmm_set_registers(cpu);
+    cpu->vcpu_dirty = false;
+}
+
+static void
+do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
+{
+    nvmm_set_registers(cpu);
+    cpu->vcpu_dirty = false;
+}
+
+static void
+do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
+{
+    cpu->vcpu_dirty = true;
+}
+
+void nvmm_cpu_synchronize_state(CPUState *cpu)
+{
+    if (!cpu->vcpu_dirty) {
+        run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL);
+    }
+}
+
+void nvmm_cpu_synchronize_post_reset(CPUState *cpu)
+{
+    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
+}
+
+void nvmm_cpu_synchronize_post_init(CPUState *cpu)
+{
+    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
+}
+
+void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu)
+{
+    run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static Error *nvmm_migration_blocker;
+
+/*
+ * The nvmm_vcpu_stop() mechanism breaks races between entering the VMM
+ * and another thread signaling the vCPU thread to exit.
+ */
+
+static void
+nvmm_ipi_signal(int sigcpu)
+{
+    if (current_cpu) {
+        struct qemu_vcpu *qcpu = get_qemu_vcpu(current_cpu);
+        struct nvmm_vcpu *vcpu = &qcpu->vcpu;
+        nvmm_vcpu_stop(vcpu);
+    }
+}
+
+static void
+nvmm_init_cpu_signals(void)
+{
+    struct sigaction sigact;
+    sigset_t set;
+
+    /* Install the IPI handler. */
+    memset(&sigact, 0, sizeof(sigact));
+    sigact.sa_handler = nvmm_ipi_signal;
+    sigaction(SIG_IPI, &sigact, NULL);
+
+    /* Allow IPIs on the current thread. */
+    sigprocmask(SIG_BLOCK, NULL, &set);
+    sigdelset(&set, SIG_IPI);
+    pthread_sigmask(SIG_SETMASK, &set, NULL);
+}
+
+int
+nvmm_init_vcpu(CPUState *cpu)
+{
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct nvmm_vcpu_conf_cpuid cpuid;
+    struct nvmm_vcpu_conf_tpr tpr;
+    Error *local_error = NULL;
+    struct qemu_vcpu *qcpu;
+    int ret, err;
+
+    nvmm_init_cpu_signals();
+
+    if (nvmm_migration_blocker == NULL) {
+        error_setg(&nvmm_migration_blocker,
+            "NVMM: Migration not supported");
+
+        (void)migrate_add_blocker(nvmm_migration_blocker, &local_error);
+        if (local_error) {
+            error_report_err(local_error);
+            migrate_del_blocker(nvmm_migration_blocker);
+            error_free(nvmm_migration_blocker);
+            return -EINVAL;
+        }
+    }
+
+    qcpu = g_malloc0(sizeof(*qcpu));
+    if (qcpu == NULL) {
+        error_report("NVMM: Failed to allocate VCPU context.");
+        return -ENOMEM;
+    }
+
+    ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu);
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Failed to create a virtual processor,"
+            " error=%d", err);
+        g_free(qcpu);
+        return -err;
+    }
+
+    memset(&cpuid, 0, sizeof(cpuid));
+    cpuid.mask = 1;
+    cpuid.leaf = 0x00000001;
+    cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR;
+    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID,
+        &cpuid);
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Failed to configure a virtual processor,"
+            " error=%d", err);
+        g_free(qcpu);
+        return -err;
+    }
+
+    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS,
+        &nvmm_callbacks);
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Failed to configure a virtual processor,"
+            " error=%d", err);
+        g_free(qcpu);
+        return -err;
+    }
+
+    if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) {
+        memset(&tpr, 0, sizeof(tpr));
+        tpr.exit_changed = 1;
+        ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr);
+        if (ret == -1) {
+            err = errno;
+            error_report("NVMM: Failed to configure a virtual processor,"
+                " error=%d", err);
+            g_free(qcpu);
+            return -err;
+        }
+    }
+
+    cpu->vcpu_dirty = true;
+    cpu->hax_vcpu = (struct hax_vcpu_state *)qcpu;
+
+    return 0;
+}
+
+int
+nvmm_vcpu_exec(CPUState *cpu)
+{
+    int ret, fatal;
+
+    while (1) {
+        if (cpu->exception_index >= EXCP_INTERRUPT) {
+            ret = cpu->exception_index;
+            cpu->exception_index = -1;
+            break;
+        }
+
+        fatal = nvmm_vcpu_loop(cpu);
+
+        if (fatal) {
+            error_report("NVMM: Failed to execute a VCPU.");
+            abort();
+        }
+    }
+
+    return ret;
+}
+
+void
+nvmm_destroy_vcpu(CPUState *cpu)
+{
+    struct nvmm_machine *mach = get_nvmm_mach();
+    struct qemu_vcpu *qcpu = get_qemu_vcpu(cpu);
+
+    nvmm_vcpu_destroy(mach, &qcpu->vcpu);
+    g_free(cpu->hax_vcpu);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static void
+nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva,
+    bool add, bool rom, const char *name)
+{
+    struct nvmm_machine *mach = get_nvmm_mach();
+    int ret, prot;
+
+    if (add) {
+        prot = PROT_READ | PROT_EXEC;
+        if (!rom) {
+            prot |= PROT_WRITE;
+        }
+        ret = nvmm_gpa_map(mach, hva, start_pa, size, prot);
+    } else {
+        ret = nvmm_gpa_unmap(mach, hva, start_pa, size);
+    }
+
+    if (ret == -1) {
+        error_report("NVMM: Failed to %s GPA range '%s' PA:%p, "
+            "Size:%p bytes, HostVA:%p, error=%d",
+            (add ? "map" : "unmap"), name, (void *)(uintptr_t)start_pa,
+            (void *)size, (void *)hva, errno);
+    }
+}
+
+static void
+nvmm_process_section(MemoryRegionSection *section, int add)
+{
+    MemoryRegion *mr = section->mr;
+    hwaddr start_pa = section->offset_within_address_space;
+    ram_addr_t size = int128_get64(section->size);
+    unsigned int delta;
+    uintptr_t hva;
+
+    if (!memory_region_is_ram(mr)) {
+        return;
+    }
+
+    /* Adjust start_pa and size so that they are page-aligned. */
+    delta = qemu_real_host_page_size - (start_pa & ~qemu_real_host_page_mask);
+    delta &= ~qemu_real_host_page_mask;
+    if (delta > size) {
+        return;
+    }
+    start_pa += delta;
+    size -= delta;
+    size &= qemu_real_host_page_mask;
+    if (!size || (start_pa & ~qemu_real_host_page_mask)) {
+        return;
+    }
+
+    hva = (uintptr_t)memory_region_get_ram_ptr(mr) +
+        section->offset_within_region + delta;
+
+    nvmm_update_mapping(start_pa, size, hva, add,
+        memory_region_is_rom(mr), mr->name);
+}
+
+static void
+nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section)
+{
+    memory_region_ref(section->mr);
+    nvmm_process_section(section, 1);
+}
+
+static void
+nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section)
+{
+    nvmm_process_section(section, 0);
+    memory_region_unref(section->mr);
+}
+
+static void
+nvmm_transaction_begin(MemoryListener *listener)
+{
+    /* nothing */
+}
+
+static void
+nvmm_transaction_commit(MemoryListener *listener)
+{
+    /* nothing */
+}
+
+static void
+nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section)
+{
+    MemoryRegion *mr = section->mr;
+
+    if (!memory_region_is_ram(mr)) {
+        return;
+    }
+
+    memory_region_set_dirty(mr, 0, int128_get64(section->size));
+}
+
+static MemoryListener nvmm_memory_listener = {
+    .begin = nvmm_transaction_begin,
+    .commit = nvmm_transaction_commit,
+    .region_add = nvmm_region_add,
+    .region_del = nvmm_region_del,
+    .log_sync = nvmm_log_sync,
+    .priority = 10,
+};
+
+static void
+nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size)
+{
+    struct nvmm_machine *mach = get_nvmm_mach();
+    uintptr_t hva = (uintptr_t)host;
+    int ret;
+
+    ret = nvmm_hva_map(mach, hva, size);
+
+    if (ret == -1) {
+        error_report("NVMM: Failed to map HVA, HostVA:%p "
+            "Size:%p bytes, error=%d",
+            (void *)hva, (void *)size, errno);
+    }
+}
+
+static struct RAMBlockNotifier nvmm_ram_notifier = {
+    .ram_block_added = nvmm_ram_block_added
+};
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_accel_init(MachineState *ms)
+{
+    int ret, err;
+
+    ret = nvmm_init();
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Initialization failed, error=%d", errno);
+        return -err;
+    }
+
+    ret = nvmm_capability(&qemu_mach.cap);
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Unable to fetch capability, error=%d", errno);
+        return -err;
+    }
+    if (qemu_mach.cap.version < NVMM_KERN_VERSION) {
+        error_report("NVMM: Unsupported version %u", qemu_mach.cap.version);
+        return -EPROGMISMATCH;
+    }
+    if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) {
+        error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size);
+        return -EPROGMISMATCH;
+    }
+
+    ret = nvmm_machine_create(&qemu_mach.mach);
+    if (ret == -1) {
+        err = errno;
+        error_report("NVMM: Machine creation failed, error=%d", errno);
+        return -err;
+    }
+
+    memory_listener_register(&nvmm_memory_listener, &address_space_memory);
+    ram_block_notifier_add(&nvmm_ram_notifier);
+
+    printf("NetBSD Virtual Machine Monitor accelerator is operational\n");
+    return 0;
+}
+
+int
+nvmm_enabled(void)
+{
+    return nvmm_allowed;
+}
+
+static void
+nvmm_accel_class_init(ObjectClass *oc, void *data)
+{
+    AccelClass *ac = ACCEL_CLASS(oc);
+    ac->name = "NVMM";
+    ac->init_machine = nvmm_accel_init;
+    ac->allowed = &nvmm_allowed;
+}
+
+static const TypeInfo nvmm_accel_type = {
+    .name = ACCEL_CLASS_NAME("nvmm"),
+    .parent = TYPE_ACCEL,
+    .class_init = nvmm_accel_class_init,
+};
+
+static void
+nvmm_type_init(void)
+{
+    type_register_static(&nvmm_accel_type);
+}
+
+type_init(nvmm_type_init);
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
index 4b2290650b..ff8ae73002 100644
--- a/target/sparc/cpu.h
+++ b/target/sparc/cpu.h
@@ -615,15 +615,9 @@ int cpu_cwp_inc(CPUSPARCState *env1, int cwp);
 int cpu_cwp_dec(CPUSPARCState *env1, int cwp);
 void cpu_set_cwp(CPUSPARCState *env1, int new_cwp);
 
-/* int_helper.c */
-void leon3_irq_manager(CPUSPARCState *env, void *irq_manager, int intno);
-
 /* sun4m.c, sun4u.c */
 void cpu_check_irqs(CPUSPARCState *env);
 
-/* leon3.c */
-void leon3_irq_ack(void *irq_manager, int intno);
-
 #if defined (TARGET_SPARC64)
 
 static inline int compare_masked(uint64_t x, uint64_t y, uint64_t mask)
diff --git a/target/sparc/int32_helper.c b/target/sparc/int32_helper.c
index 817a463a17..82e8418e46 100644
--- a/target/sparc/int32_helper.c
+++ b/target/sparc/int32_helper.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/main-loop.h"
 #include "cpu.h"
 #include "trace.h"
 #include "exec/log.h"
@@ -64,6 +65,38 @@ static const char *excp_name_str(int32_t exception_index)
     return excp_names[exception_index];
 }
 
+void cpu_check_irqs(CPUSPARCState *env)
+{
+    CPUState *cs;
+
+    /* We should be holding the BQL before we mess with IRQs */
+    g_assert(qemu_mutex_iothread_locked());
+
+    if (env->pil_in && (env->interrupt_index == 0 ||
+                        (env->interrupt_index & ~15) == TT_EXTINT)) {
+        unsigned int i;
+
+        for (i = 15; i > 0; i--) {
+            if (env->pil_in & (1 << i)) {
+                int old_interrupt = env->interrupt_index;
+
+                env->interrupt_index = TT_EXTINT | i;
+                if (old_interrupt != env->interrupt_index) {
+                    cs = env_cpu(env);
+                    trace_sun4m_cpu_interrupt(i);
+                    cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+                }
+                break;
+            }
+        }
+    } else if (!env->pil_in && (env->interrupt_index & ~15) == TT_EXTINT) {
+        cs = env_cpu(env);
+        trace_sun4m_cpu_reset_interrupt(env->interrupt_index & 15);
+        env->interrupt_index = 0;
+        cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
+    }
+}
+
 void sparc_cpu_do_interrupt(CPUState *cs)
 {
     SPARCCPU *cpu = SPARC_CPU(cs);
@@ -136,40 +169,3 @@ void sparc_cpu_do_interrupt(CPUState *cs)
     }
 #endif
 }
-
-#if !defined(CONFIG_USER_ONLY)
-static void leon3_cache_control_int(CPUSPARCState *env)
-{
-    uint32_t state = 0;
-
-    if (env->cache_control & CACHE_CTRL_IF) {
-        /* Instruction cache state */
-        state = env->cache_control & CACHE_STATE_MASK;
-        if (state == CACHE_ENABLED) {
-            state = CACHE_FROZEN;
-            trace_int_helper_icache_freeze();
-        }
-
-        env->cache_control &= ~CACHE_STATE_MASK;
-        env->cache_control |= state;
-    }
-
-    if (env->cache_control & CACHE_CTRL_DF) {
-        /* Data cache state */
-        state = (env->cache_control >> 2) & CACHE_STATE_MASK;
-        if (state == CACHE_ENABLED) {
-            state = CACHE_FROZEN;
-            trace_int_helper_dcache_freeze();
-        }
-
-        env->cache_control &= ~(CACHE_STATE_MASK << 2);
-        env->cache_control |= (state << 2);
-    }
-}
-
-void leon3_irq_manager(CPUSPARCState *env, void *irq_manager, int intno)
-{
-    leon3_irq_ack(irq_manager, intno);
-    leon3_cache_control_int(env);
-}
-#endif
diff --git a/target/sparc/int64_helper.c b/target/sparc/int64_helper.c
index 7fb8ab211c..793e57c536 100644
--- a/target/sparc/int64_helper.c
+++ b/target/sparc/int64_helper.c
@@ -62,6 +62,72 @@ static const char * const excp_names[0x80] = {
 };
 #endif
 
+void cpu_check_irqs(CPUSPARCState *env)
+{
+    CPUState *cs;
+    uint32_t pil = env->pil_in |
+                  (env->softint & ~(SOFTINT_TIMER | SOFTINT_STIMER));
+
+    /* We should be holding the BQL before we mess with IRQs */
+    g_assert(qemu_mutex_iothread_locked());
+
+    /* TT_IVEC has a higher priority (16) than TT_EXTINT (31..17) */
+    if (env->ivec_status & 0x20) {
+        return;
+    }
+    cs = env_cpu(env);
+    /*
+     * check if TM or SM in SOFTINT are set
+     * setting these also causes interrupt 14
+     */
+    if (env->softint & (SOFTINT_TIMER | SOFTINT_STIMER)) {
+        pil |= 1 << 14;
+    }
+
+    /*
+     * The bit corresponding to psrpil is (1<< psrpil),
+     * the next bit is (2 << psrpil).
+     */
+    if (pil < (2 << env->psrpil)) {
+        if (cs->interrupt_request & CPU_INTERRUPT_HARD) {
+            trace_sparc64_cpu_check_irqs_reset_irq(env->interrupt_index);
+            env->interrupt_index = 0;
+            cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
+        }
+        return;
+    }
+
+    if (cpu_interrupts_enabled(env)) {
+
+        unsigned int i;
+
+        for (i = 15; i > env->psrpil; i--) {
+            if (pil & (1 << i)) {
+                int old_interrupt = env->interrupt_index;
+                int new_interrupt = TT_EXTINT | i;
+
+                if (unlikely(env->tl > 0 && cpu_tsptr(env)->tt > new_interrupt
+                  && ((cpu_tsptr(env)->tt & 0x1f0) == TT_EXTINT))) {
+                    trace_sparc64_cpu_check_irqs_noset_irq(env->tl,
+                                                      cpu_tsptr(env)->tt,
+                                                      new_interrupt);
+                } else if (old_interrupt != new_interrupt) {
+                    env->interrupt_index = new_interrupt;
+                    trace_sparc64_cpu_check_irqs_set_irq(i, old_interrupt,
+                                                         new_interrupt);
+                    cpu_interrupt(cs, CPU_INTERRUPT_HARD);
+                }
+                break;
+            }
+        }
+    } else if (cs->interrupt_request & CPU_INTERRUPT_HARD) {
+        trace_sparc64_cpu_check_irqs_disabled(pil, env->pil_in, env->softint,
+                                              env->interrupt_index);
+        env->interrupt_index = 0;
+        cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD);
+    }
+}
+
 void sparc_cpu_do_interrupt(CPUState *cs)
 {
     SPARCCPU *cpu = SPARC_CPU(cs);
diff --git a/target/sparc/trace-events b/target/sparc/trace-events
index 6a064e2327..75e7093d5f 100644
--- a/target/sparc/trace-events
+++ b/target/sparc/trace-events
@@ -10,14 +10,18 @@ mmu_helper_get_phys_addr_code(uint32_t tl, int mmu_idx, uint64_t prim_context, u
 mmu_helper_get_phys_addr_data(uint32_t tl, int mmu_idx, uint64_t prim_context, uint64_t sec_context, uint64_t address) "tl=%d mmu_idx=%d primary context=0x%"PRIx64" secondary context=0x%"PRIx64" address=0x%"PRIx64
 mmu_helper_mmu_fault(uint64_t address, uint64_t paddr, int mmu_idx, uint32_t tl, uint64_t prim_context, uint64_t sec_context) "Translate at 0x%"PRIx64" -> 0x%"PRIx64", mmu_idx=%d tl=%d primary context=0x%"PRIx64" secondary context=0x%"PRIx64
 
+# int32_helper.c
+sun4m_cpu_interrupt(unsigned int level) "Set CPU IRQ %d"
+sun4m_cpu_reset_interrupt(unsigned int level) "Reset CPU IRQ %d"
+
 # int64_helper.c
 int_helper_set_softint(uint32_t softint) "new 0x%08x"
 int_helper_clear_softint(uint32_t softint) "new 0x%08x"
 int_helper_write_softint(uint32_t softint) "new 0x%08x"
-
-# int32_helper.c
-int_helper_icache_freeze(void) "Instruction cache: freeze"
-int_helper_dcache_freeze(void) "Data cache: freeze"
+sparc64_cpu_check_irqs_reset_irq(int intno) "Reset CPU IRQ (current interrupt 0x%x)"
+sparc64_cpu_check_irqs_noset_irq(uint32_t tl, uint32_t tt, int intno) "Not setting CPU IRQ: TL=%d current 0x%x >= pending 0x%x"
+sparc64_cpu_check_irqs_set_irq(unsigned int i, int old, int new) "Set CPU IRQ %d old=0x%x new=0x%x"
+sparc64_cpu_check_irqs_disabled(uint32_t pil, uint32_t pil_in, uint32_t softint, int intno) "Interrupts disabled, pil=0x%08x pil_in=0x%08x softint=0x%08x current interrupt 0x%x"
 
 # win_helper.c
 win_helper_gregset_error(uint32_t pstate) "ERROR in get_gregset: active pstate bits=0x%x"