summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorBlue Swirl <blauwirbel@gmail.com>2011-07-20 20:23:01 +0000
committerBlue Swirl <blauwirbel@gmail.com>2011-07-20 20:23:01 +0000
commita3ce3668ccff7d350a4f795ad99a012a6d41caef (patch)
tree01f5f907ceaba5daaf3509b0a051dd8c76339271
parent03ff09580ef6cbc4a893b6e3e6bbff33180ec70a (diff)
parent82845826e89fdc02f6f000fca5d5019ec9be4ab3 (diff)
downloadfocaccia-qemu-a3ce3668ccff7d350a4f795ad99a012a6d41caef.tar.gz
focaccia-qemu-a3ce3668ccff7d350a4f795ad99a012a6d41caef.zip
Merge branch 'for-upstream' of git://git.linaro.org/people/pmaydell/qemu-arm
* 'for-upstream' of git://git.linaro.org/people/pmaydell/qemu-arm:
  target-arm: Fix BASEPRI, BASEPRI_MAX, and FAULTMASK access
  target-arm: Minimal implementation of performance counters
  Revert "Makefile.target: Allow target helpers to be in any *_helper.c file"
  Revert "target-arm: Use global env in neon_helper.c helpers"
  target-arm: Pass fp status pointer explicitly to neon fp helpers
  target-arm: Make VFP binop helpers take pointer to fpstatus, not CPUState
  target-arm: Add helper function to generate code to get fpstatus pointer
  Revert "target-arm: Use global env in iwmmxt_helper.c helpers"

Conflicts:
	Makefile.target
-rw-r--r--Makefile.target2
-rw-r--r--target-arm/cpu.h8
-rw-r--r--target-arm/helper.c193
-rw-r--r--target-arm/helper.h293
-rw-r--r--target-arm/iwmmxt_helper.c80
-rw-r--r--target-arm/machine.c12
-rw-r--r--target-arm/neon_helper.c201
-rw-r--r--target-arm/translate.c457
8 files changed, 753 insertions, 493 deletions
diff --git a/Makefile.target b/Makefile.target
index c566eb1149..fe785161cb 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -95,7 +95,7 @@ tcg/tcg.o: cpu.h
 
 # HELPER_CFLAGS is used for all the code compiled with static register
 # variables
-%_helper.o user-exec.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
+op_helper.o user-exec.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
 
 # Note: this is a workaround. The real fix is to avoid compiling
 # cpu_signal_handler() in user-exec.c.
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 1022a03753..adef42785c 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -133,6 +133,12 @@ typedef struct CPUARMState {
         uint32_t c7_par;  /* Translation result. */
         uint32_t c9_insn; /* Cache lockdown registers.  */
         uint32_t c9_data;
+        uint32_t c9_pmcr; /* performance monitor control register */
+        uint32_t c9_pmcnten; /* perf monitor counter enables */
+        uint32_t c9_pmovsr; /* perf monitor overflow status */
+        uint32_t c9_pmxevtyper; /* perf monitor event type */
+        uint32_t c9_pmuserenr; /* perf monitor user enable */
+        uint32_t c9_pminten; /* perf monitor interrupt enables */
         uint32_t c13_fcse; /* FCSE PID.  */
         uint32_t c13_context; /* Context ID.  */
         uint32_t c13_tls1; /* User RW Thread register.  */
@@ -438,7 +444,7 @@ void cpu_arm_set_cp_io(CPUARMState *env, int cpnum,
 #define cpu_signal_handler cpu_arm_signal_handler
 #define cpu_list arm_cpu_list
 
-#define CPU_SAVE_VERSION 3
+#define CPU_SAVE_VERSION 4
 
 /* MMU modes definitions */
 #define MMU_MODE0_SUFFIX _kernel
diff --git a/target-arm/helper.c b/target-arm/helper.c
index f4d12aab55..ae4f334e40 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -269,6 +269,10 @@ void cpu_reset(CPUARMState *env)
     }
     env->vfp.xregs[ARM_VFP_FPEXC] = 0;
     env->cp15.c2_base_mask = 0xffffc000u;
+    /* v7 performance monitor control register: same implementor
+     * field as main ID register, and we implement no event counters.
+     */
+    env->cp15.c9_pmcr = (id & 0xff000000);
 #endif
     set_flush_to_zero(1, &env->vfp.standard_fp_status);
     set_flush_inputs_to_zero(1, &env->vfp.standard_fp_status);
@@ -1587,6 +1591,81 @@ void HELPER(set_cp15)(CPUState *env, uint32_t insn, uint32_t val)
         case 1: /* TCM memory region registers.  */
             /* Not implemented.  */
             goto bad_reg;
+        case 12: /* Performance monitor control */
+            /* Performance monitors are implementation defined in v7,
+             * but with an ARM recommended set of registers, which we
+             * follow (although we don't actually implement any counters)
+             */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
+                goto bad_reg;
+            }
+            switch (op2) {
+            case 0: /* performance monitor control register */
+                /* only the DP, X, D and E bits are writable */
+                env->cp15.c9_pmcr &= ~0x39;
+                env->cp15.c9_pmcr |= (val & 0x39);
+                break;
+            case 1: /* Count enable set register */
+                val &= (1 << 31);
+                env->cp15.c9_pmcnten |= val;
+                break;
+            case 2: /* Count enable clear */
+                val &= (1 << 31);
+                env->cp15.c9_pmcnten &= ~val;
+                break;
+            case 3: /* Overflow flag status */
+                env->cp15.c9_pmovsr &= ~val;
+                break;
+            case 4: /* Software increment */
+                /* RAZ/WI since we don't implement the software-count event */
+                break;
+            case 5: /* Event counter selection register */
+                /* Since we don't implement any events, writing to this register
+                 * is actually UNPREDICTABLE. So we choose to RAZ/WI.
+                 */
+                break;
+            default:
+                goto bad_reg;
+            }
+            break;
+        case 13: /* Performance counters */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
+                goto bad_reg;
+            }
+            switch (op2) {
+            case 0: /* Cycle count register: not implemented, so RAZ/WI */
+                break;
+            case 1: /* Event type select */
+                env->cp15.c9_pmxevtyper = val & 0xff;
+                break;
+            case 2: /* Event count register */
+                /* Unimplemented (we have no events), RAZ/WI */
+                break;
+            default:
+                goto bad_reg;
+            }
+            break;
+        case 14: /* Performance monitor control */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
+                goto bad_reg;
+            }
+            switch (op2) {
+            case 0: /* user enable */
+                env->cp15.c9_pmuserenr = val & 1;
+                /* changes access rights for cp registers, so flush tbs */
+                tb_flush(env);
+                break;
+            case 1: /* interrupt enable set */
+                /* We have no event counters so only the C bit can be changed */
+                val &= (1 << 31);
+                env->cp15.c9_pminten |= val;
+                break;
+            case 2: /* interrupt enable clear */
+                val &= (1 << 31);
+                env->cp15.c9_pminten &= ~val;
+                break;
+            }
+            break;
         default:
             goto bad_reg;
         }
@@ -1878,27 +1957,81 @@ uint32_t HELPER(get_cp15)(CPUState *env, uint32_t insn)
         return 0;
     case 8: /* MMU TLB control.  */
         goto bad_reg;
-    case 9: /* Cache lockdown.  */
-        switch (op1) {
-        case 0: /* L1 cache.  */
-	    if (arm_feature(env, ARM_FEATURE_OMAPCP))
-		return 0;
+    case 9:
+        switch (crm) {
+        case 0: /* Cache lockdown */
+            switch (op1) {
+            case 0: /* L1 cache.  */
+                if (arm_feature(env, ARM_FEATURE_OMAPCP)) {
+                    return 0;
+                }
+                switch (op2) {
+                case 0:
+                    return env->cp15.c9_data;
+                case 1:
+                    return env->cp15.c9_insn;
+                default:
+                    goto bad_reg;
+                }
+            case 1: /* L2 cache */
+                if (crm != 0) {
+                    goto bad_reg;
+                }
+                /* L2 Lockdown and Auxiliary control.  */
+                return 0;
+            default:
+                goto bad_reg;
+            }
+            break;
+        case 12: /* Performance monitor control */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
+                goto bad_reg;
+            }
             switch (op2) {
-            case 0:
-                return env->cp15.c9_data;
-            case 1:
-                return env->cp15.c9_insn;
+            case 0: /* performance monitor control register */
+                return env->cp15.c9_pmcr;
+            case 1: /* count enable set */
+            case 2: /* count enable clear */
+                return env->cp15.c9_pmcnten;
+            case 3: /* overflow flag status */
+                return env->cp15.c9_pmovsr;
+            case 4: /* software increment */
+            case 5: /* event counter selection register */
+                return 0; /* Unimplemented, RAZ/WI */
             default:
                 goto bad_reg;
             }
-        case 1: /* L2 cache */
-            if (crm != 0)
+        case 13: /* Performance counters */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
+                goto bad_reg;
+            }
+            switch (op2) {
+            case 1: /* Event type select */
+                return env->cp15.c9_pmxevtyper;
+            case 0: /* Cycle count register */
+            case 2: /* Event count register */
+                /* Unimplemented, so RAZ/WI */
+                return 0;
+            default:
+                goto bad_reg;
+            }
+        case 14: /* Performance monitor control */
+            if (!arm_feature(env, ARM_FEATURE_V7)) {
                 goto bad_reg;
-            /* L2 Lockdown and Auxiliary control.  */
-            return 0;
+            }
+            switch (op2) {
+            case 0: /* user enable */
+                return env->cp15.c9_pmuserenr;
+            case 1: /* interrupt enable set */
+            case 2: /* interrupt enable clear */
+                return env->cp15.c9_pminten;
+            default:
+                goto bad_reg;
+            }
         default:
             goto bad_reg;
         }
+        break;
     case 10: /* MMU TLB lockdown.  */
         /* ??? TLB lockdown not implemented.  */
         return 0;
@@ -1994,11 +2127,11 @@ uint32_t HELPER(v7m_mrs)(CPUState *env, uint32_t reg)
         return env->v7m.current_sp ? env->regs[13] : env->v7m.other_sp;
     case 16: /* PRIMASK */
         return (env->uncached_cpsr & CPSR_I) != 0;
-    case 17: /* FAULTMASK */
-        return (env->uncached_cpsr & CPSR_F) != 0;
-    case 18: /* BASEPRI */
-    case 19: /* BASEPRI_MAX */
+    case 17: /* BASEPRI */
+    case 18: /* BASEPRI_MAX */
         return env->v7m.basepri;
+    case 19: /* FAULTMASK */
+        return (env->uncached_cpsr & CPSR_F) != 0;
     case 20: /* CONTROL */
         return env->v7m.control;
     default:
@@ -2050,20 +2183,20 @@ void HELPER(v7m_msr)(CPUState *env, uint32_t reg, uint32_t val)
         else
             env->uncached_cpsr &= ~CPSR_I;
         break;
-    case 17: /* FAULTMASK */
-        if (val & 1)
-            env->uncached_cpsr |= CPSR_F;
-        else
-            env->uncached_cpsr &= ~CPSR_F;
-        break;
-    case 18: /* BASEPRI */
+    case 17: /* BASEPRI */
         env->v7m.basepri = val & 0xff;
         break;
-    case 19: /* BASEPRI_MAX */
+    case 18: /* BASEPRI_MAX */
         val &= 0xff;
         if (val != 0 && (val < env->v7m.basepri || env->v7m.basepri == 0))
             env->v7m.basepri = val;
         break;
+    case 19: /* FAULTMASK */
+        if (val & 1)
+            env->uncached_cpsr |= CPSR_F;
+        else
+            env->uncached_cpsr &= ~CPSR_F;
+        break;
     case 20: /* CONTROL */
         env->v7m.control = val & 3;
         switch_v7m_sp(env, (val & 2) != 0);
@@ -2452,13 +2585,15 @@ void vfp_set_fpscr(CPUState *env, uint32_t val)
 #define VFP_HELPER(name, p) HELPER(glue(glue(vfp_,name),p))
 
 #define VFP_BINOP(name) \
-float32 VFP_HELPER(name, s)(float32 a, float32 b, CPUState *env) \
+float32 VFP_HELPER(name, s)(float32 a, float32 b, void *fpstp) \
 { \
-    return float32_ ## name (a, b, &env->vfp.fp_status); \
+    float_status *fpst = fpstp; \
+    return float32_ ## name(a, b, fpst); \
 } \
-float64 VFP_HELPER(name, d)(float64 a, float64 b, CPUState *env) \
+float64 VFP_HELPER(name, d)(float64 a, float64 b, void *fpstp) \
 { \
-    return float64_ ## name (a, b, &env->vfp.fp_status); \
+    float_status *fpst = fpstp; \
+    return float64_ ## name(a, b, fpst); \
 }
 VFP_BINOP(add)
 VFP_BINOP(sub)
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 7d5533f613..3ad1cb0881 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -74,14 +74,14 @@ DEF_HELPER_2(set_user_reg, void, i32, i32)
 DEF_HELPER_1(vfp_get_fpscr, i32, env)
 DEF_HELPER_2(vfp_set_fpscr, void, env, i32)
 
-DEF_HELPER_3(vfp_adds, f32, f32, f32, env)
-DEF_HELPER_3(vfp_addd, f64, f64, f64, env)
-DEF_HELPER_3(vfp_subs, f32, f32, f32, env)
-DEF_HELPER_3(vfp_subd, f64, f64, f64, env)
-DEF_HELPER_3(vfp_muls, f32, f32, f32, env)
-DEF_HELPER_3(vfp_muld, f64, f64, f64, env)
-DEF_HELPER_3(vfp_divs, f32, f32, f32, env)
-DEF_HELPER_3(vfp_divd, f64, f64, f64, env)
+DEF_HELPER_3(vfp_adds, f32, f32, f32, ptr)
+DEF_HELPER_3(vfp_addd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_subs, f32, f32, f32, ptr)
+DEF_HELPER_3(vfp_subd, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_muls, f32, f32, f32, ptr)
+DEF_HELPER_3(vfp_muld, f64, f64, f64, ptr)
+DEF_HELPER_3(vfp_divs, f32, f32, f32, ptr)
+DEF_HELPER_3(vfp_divd, f64, f64, f64, ptr)
 DEF_HELPER_1(vfp_negs, f32, f32)
 DEF_HELPER_1(vfp_negd, f64, f64)
 DEF_HELPER_1(vfp_abss, f32, f32)
@@ -154,22 +154,22 @@ DEF_HELPER_2(sar_cc, i32, i32, i32)
 DEF_HELPER_2(ror_cc, i32, i32, i32)
 
 /* neon_helper.c */
-DEF_HELPER_2(neon_qadd_u8, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_s8, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_u16, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_u32, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_s32, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_u8, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_s8, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_u16, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_u32, i32, i32, i32)
-DEF_HELPER_2(neon_qsub_s32, i32, i32, i32)
-DEF_HELPER_2(neon_qadd_u64, i64, i64, i64)
-DEF_HELPER_2(neon_qadd_s64, i64, i64, i64)
-DEF_HELPER_2(neon_qsub_u64, i64, i64, i64)
-DEF_HELPER_2(neon_qsub_s64, i64, i64, i64)
+DEF_HELPER_3(neon_qadd_u8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_s8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_u16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_u32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_s32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_u8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_s8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_u16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_u32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qsub_s32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qadd_u64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qadd_s64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qsub_u64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qsub_s64, i64, env, i64, i64)
 
 DEF_HELPER_2(neon_hadd_s8, i32, i32, i32)
 DEF_HELPER_2(neon_hadd_u8, i32, i32, i32)
@@ -247,26 +247,26 @@ DEF_HELPER_2(neon_rshl_u32, i32, i32, i32)
 DEF_HELPER_2(neon_rshl_s32, i32, i32, i32)
 DEF_HELPER_2(neon_rshl_u64, i64, i64, i64)
 DEF_HELPER_2(neon_rshl_s64, i64, i64, i64)
-DEF_HELPER_2(neon_qshl_u8, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_s8, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_u16, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_u32, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_s32, i32, i32, i32)
-DEF_HELPER_2(neon_qshl_u64, i64, i64, i64)
-DEF_HELPER_2(neon_qshl_s64, i64, i64, i64)
-DEF_HELPER_2(neon_qshlu_s8, i32, i32, i32);
-DEF_HELPER_2(neon_qshlu_s16, i32, i32, i32);
-DEF_HELPER_2(neon_qshlu_s32, i32, i32, i32);
-DEF_HELPER_2(neon_qshlu_s64, i64, i64, i64);
-DEF_HELPER_2(neon_qrshl_u8, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_s8, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_u16, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_u32, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_s32, i32, i32, i32)
-DEF_HELPER_2(neon_qrshl_u64, i64, i64, i64)
-DEF_HELPER_2(neon_qrshl_s64, i64, i64, i64)
+DEF_HELPER_3(neon_qshl_u8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_s8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_u16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_u32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_s32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qshl_u64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qshl_s64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qshlu_s8, i32, env, i32, i32);
+DEF_HELPER_3(neon_qshlu_s16, i32, env, i32, i32);
+DEF_HELPER_3(neon_qshlu_s32, i32, env, i32, i32);
+DEF_HELPER_3(neon_qshlu_s64, i64, env, i64, i64);
+DEF_HELPER_3(neon_qrshl_u8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_s8, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_u16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_u32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_s32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrshl_u64, i64, env, i64, i64)
+DEF_HELPER_3(neon_qrshl_s64, i64, env, i64, i64)
 
 DEF_HELPER_2(neon_add_u8, i32, i32, i32)
 DEF_HELPER_2(neon_add_u16, i32, i32, i32)
@@ -295,22 +295,22 @@ DEF_HELPER_1(neon_cls_s16, i32, i32)
 DEF_HELPER_1(neon_cls_s32, i32, i32)
 DEF_HELPER_1(neon_cnt_u8, i32, i32)
 
-DEF_HELPER_2(neon_qdmulh_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qrdmulh_s16, i32, i32, i32)
-DEF_HELPER_2(neon_qdmulh_s32, i32, i32, i32)
-DEF_HELPER_2(neon_qrdmulh_s32, i32, i32, i32)
+DEF_HELPER_3(neon_qdmulh_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrdmulh_s16, i32, env, i32, i32)
+DEF_HELPER_3(neon_qdmulh_s32, i32, env, i32, i32)
+DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
 
 DEF_HELPER_1(neon_narrow_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_u16, i32, i64)
-DEF_HELPER_1(neon_unarrow_sat8, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_u8, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_s8, i32, i64)
-DEF_HELPER_1(neon_unarrow_sat16, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_u16, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_s16, i32, i64)
-DEF_HELPER_1(neon_unarrow_sat32, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_u32, i32, i64)
-DEF_HELPER_1(neon_narrow_sat_s32, i32, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
 DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_high_u16, i32, i64)
 DEF_HELPER_1(neon_narrow_round_high_u8, i32, i64)
@@ -326,8 +326,8 @@ DEF_HELPER_2(neon_paddl_u16, i64, i64, i64)
 DEF_HELPER_2(neon_paddl_u32, i64, i64, i64)
 DEF_HELPER_2(neon_subl_u16, i64, i64, i64)
 DEF_HELPER_2(neon_subl_u32, i64, i64, i64)
-DEF_HELPER_2(neon_addl_saturate_s32, i64, i64, i64)
-DEF_HELPER_2(neon_addl_saturate_s64, i64, i64, i64)
+DEF_HELPER_3(neon_addl_saturate_s32, i64, env, i64, i64)
+DEF_HELPER_3(neon_addl_saturate_s64, i64, env, i64, i64)
 DEF_HELPER_2(neon_abdl_u16, i64, i32, i32)
 DEF_HELPER_2(neon_abdl_s16, i64, i32, i32)
 DEF_HELPER_2(neon_abdl_u32, i64, i32, i32)
@@ -343,24 +343,21 @@ DEF_HELPER_1(neon_negl_u16, i64, i64)
 DEF_HELPER_1(neon_negl_u32, i64, i64)
 DEF_HELPER_1(neon_negl_u64, i64, i64)
 
-DEF_HELPER_1(neon_qabs_s8, i32, i32)
-DEF_HELPER_1(neon_qabs_s16, i32, i32)
-DEF_HELPER_1(neon_qabs_s32, i32, i32)
-DEF_HELPER_1(neon_qneg_s8, i32, i32)
-DEF_HELPER_1(neon_qneg_s16, i32, i32)
-DEF_HELPER_1(neon_qneg_s32, i32, i32)
-
-DEF_HELPER_2(neon_min_f32, i32, i32, i32)
-DEF_HELPER_2(neon_max_f32, i32, i32, i32)
-DEF_HELPER_2(neon_abd_f32, i32, i32, i32)
-DEF_HELPER_2(neon_add_f32, i32, i32, i32)
-DEF_HELPER_2(neon_sub_f32, i32, i32, i32)
-DEF_HELPER_2(neon_mul_f32, i32, i32, i32)
-DEF_HELPER_2(neon_ceq_f32, i32, i32, i32)
-DEF_HELPER_2(neon_cge_f32, i32, i32, i32)
-DEF_HELPER_2(neon_cgt_f32, i32, i32, i32)
-DEF_HELPER_2(neon_acge_f32, i32, i32, i32)
-DEF_HELPER_2(neon_acgt_f32, i32, i32, i32)
+DEF_HELPER_2(neon_qabs_s8, i32, env, i32)
+DEF_HELPER_2(neon_qabs_s16, i32, env, i32)
+DEF_HELPER_2(neon_qabs_s32, i32, env, i32)
+DEF_HELPER_2(neon_qneg_s8, i32, env, i32)
+DEF_HELPER_2(neon_qneg_s16, i32, env, i32)
+DEF_HELPER_2(neon_qneg_s32, i32, env, i32)
+
+DEF_HELPER_3(neon_min_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_max_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_abd_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_ceq_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_cge_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_cgt_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_acge_f32, i32, i32, i32, ptr)
+DEF_HELPER_3(neon_acgt_f32, i32, i32, i32, ptr)
 
 /* iwmmxt_helper.c */
 DEF_HELPER_2(iwmmxt_maddsq, i64, i64, i64)
@@ -375,47 +372,47 @@ DEF_HELPER_2(iwmmxt_macsw, i64, i64, i64)
 DEF_HELPER_2(iwmmxt_macuw, i64, i64, i64)
 DEF_HELPER_1(iwmmxt_setpsr_nz, i32, i64)
 
-#define DEF_IWMMXT_HELPER_SIZE(name) \
-DEF_HELPER_2(iwmmxt_##name##b, i64, i64, i64) \
-DEF_HELPER_2(iwmmxt_##name##w, i64, i64, i64) \
-DEF_HELPER_2(iwmmxt_##name##l, i64, i64, i64) \
-
-DEF_IWMMXT_HELPER_SIZE(unpackl)
-DEF_IWMMXT_HELPER_SIZE(unpackh)
-
-DEF_HELPER_1(iwmmxt_unpacklub, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackluw, i64, i64)
-DEF_HELPER_1(iwmmxt_unpacklul, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhub, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhuw, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhul, i64, i64)
-DEF_HELPER_1(iwmmxt_unpacklsb, i64, i64)
-DEF_HELPER_1(iwmmxt_unpacklsw, i64, i64)
-DEF_HELPER_1(iwmmxt_unpacklsl, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhsb, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhsw, i64, i64)
-DEF_HELPER_1(iwmmxt_unpackhsl, i64, i64)
-
-DEF_IWMMXT_HELPER_SIZE(cmpeq)
-DEF_IWMMXT_HELPER_SIZE(cmpgtu)
-DEF_IWMMXT_HELPER_SIZE(cmpgts)
-
-DEF_IWMMXT_HELPER_SIZE(mins)
-DEF_IWMMXT_HELPER_SIZE(minu)
-DEF_IWMMXT_HELPER_SIZE(maxs)
-DEF_IWMMXT_HELPER_SIZE(maxu)
-
-DEF_IWMMXT_HELPER_SIZE(subn)
-DEF_IWMMXT_HELPER_SIZE(addn)
-DEF_IWMMXT_HELPER_SIZE(subu)
-DEF_IWMMXT_HELPER_SIZE(addu)
-DEF_IWMMXT_HELPER_SIZE(subs)
-DEF_IWMMXT_HELPER_SIZE(adds)
-
-DEF_HELPER_2(iwmmxt_avgb0, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_avgb1, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_avgw0, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_avgw1, i64, i64, i64)
+#define DEF_IWMMXT_HELPER_SIZE_ENV(name) \
+DEF_HELPER_3(iwmmxt_##name##b, i64, env, i64, i64) \
+DEF_HELPER_3(iwmmxt_##name##w, i64, env, i64, i64) \
+DEF_HELPER_3(iwmmxt_##name##l, i64, env, i64, i64) \
+
+DEF_IWMMXT_HELPER_SIZE_ENV(unpackl)
+DEF_IWMMXT_HELPER_SIZE_ENV(unpackh)
+
+DEF_HELPER_2(iwmmxt_unpacklub, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackluw, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpacklul, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhub, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhuw, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhul, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpacklsb, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpacklsw, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpacklsl, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhsb, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhsw, i64, env, i64)
+DEF_HELPER_2(iwmmxt_unpackhsl, i64, env, i64)
+
+DEF_IWMMXT_HELPER_SIZE_ENV(cmpeq)
+DEF_IWMMXT_HELPER_SIZE_ENV(cmpgtu)
+DEF_IWMMXT_HELPER_SIZE_ENV(cmpgts)
+
+DEF_IWMMXT_HELPER_SIZE_ENV(mins)
+DEF_IWMMXT_HELPER_SIZE_ENV(minu)
+DEF_IWMMXT_HELPER_SIZE_ENV(maxs)
+DEF_IWMMXT_HELPER_SIZE_ENV(maxu)
+
+DEF_IWMMXT_HELPER_SIZE_ENV(subn)
+DEF_IWMMXT_HELPER_SIZE_ENV(addn)
+DEF_IWMMXT_HELPER_SIZE_ENV(subu)
+DEF_IWMMXT_HELPER_SIZE_ENV(addu)
+DEF_IWMMXT_HELPER_SIZE_ENV(subs)
+DEF_IWMMXT_HELPER_SIZE_ENV(adds)
+
+DEF_HELPER_3(iwmmxt_avgb0, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_avgb1, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_avgw0, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_avgw1, i64, env, i64, i64)
 
 DEF_HELPER_2(iwmmxt_msadb, i64, i64, i64)
 
@@ -434,26 +431,26 @@ DEF_HELPER_1(iwmmxt_msbb, i32, i64)
 DEF_HELPER_1(iwmmxt_msbw, i32, i64)
 DEF_HELPER_1(iwmmxt_msbl, i32, i64)
 
-DEF_HELPER_2(iwmmxt_srlw, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_srll, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_srlq, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_sllw, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_slll, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_sllq, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_sraw, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_sral, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_sraq, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_rorw, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_rorl, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_rorq, i64, i64, i32)
-DEF_HELPER_2(iwmmxt_shufh, i64, i64, i32)
-
-DEF_HELPER_2(iwmmxt_packuw, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_packul, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_packuq, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_packsw, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_packsl, i64, i64, i64)
-DEF_HELPER_2(iwmmxt_packsq, i64, i64, i64)
+DEF_HELPER_3(iwmmxt_srlw, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_srll, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_srlq, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_sllw, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_slll, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_sllq, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_sraw, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_sral, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_sraq, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_rorw, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_rorl, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_rorq, i64, env, i64, i32)
+DEF_HELPER_3(iwmmxt_shufh, i64, env, i64, i32)
+
+DEF_HELPER_3(iwmmxt_packuw, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_packul, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_packuq, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_packsw, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_packsl, i64, env, i64, i64)
+DEF_HELPER_3(iwmmxt_packsq, i64, env, i64, i64)
 
 DEF_HELPER_3(iwmmxt_muladdsl, i64, i64, i32, i32)
 DEF_HELPER_3(iwmmxt_muladdsw, i64, i64, i32, i32)
@@ -461,15 +458,15 @@ DEF_HELPER_3(iwmmxt_muladdswl, i64, i64, i32, i32)
 
 DEF_HELPER_2(set_teecr, void, env, i32)
 
-DEF_HELPER_2(neon_unzip8, void, i32, i32)
-DEF_HELPER_2(neon_unzip16, void, i32, i32)
-DEF_HELPER_2(neon_qunzip8, void, i32, i32)
-DEF_HELPER_2(neon_qunzip16, void, i32, i32)
-DEF_HELPER_2(neon_qunzip32, void, i32, i32)
-DEF_HELPER_2(neon_zip8, void, i32, i32)
-DEF_HELPER_2(neon_zip16, void, i32, i32)
-DEF_HELPER_2(neon_qzip8, void, i32, i32)
-DEF_HELPER_2(neon_qzip16, void, i32, i32)
-DEF_HELPER_2(neon_qzip32, void, i32, i32)
+DEF_HELPER_3(neon_unzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_unzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qunzip32, void, env, i32, i32)
+DEF_HELPER_3(neon_zip8, void, env, i32, i32)
+DEF_HELPER_3(neon_zip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip8, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip16, void, env, i32, i32)
+DEF_HELPER_3(neon_qzip32, void, env, i32, i32)
 
 #include "def-helper.h"
diff --git a/target-arm/iwmmxt_helper.c b/target-arm/iwmmxt_helper.c
index ebe6eb9fae..843994d8d5 100644
--- a/target-arm/iwmmxt_helper.c
+++ b/target-arm/iwmmxt_helper.c
@@ -23,7 +23,7 @@
 #include <stdio.h>
 
 #include "cpu.h"
-#include "exec.h"
+#include "exec-all.h"
 #include "helper.h"
 
 /* iwMMXt macros extracted from GNU gdb.  */
@@ -162,7 +162,8 @@ uint64_t HELPER(iwmmxt_macuw)(uint64_t a, uint64_t b)
     SIMD64_SET(NBIT64(x), SIMD_NBIT) | \
     SIMD64_SET(ZBIT64(x), SIMD_ZBIT)
 #define IWMMXT_OP_UNPACK(S, SH0, SH1, SH2, SH3)			\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(CPUState *env, \
+                                                 uint64_t a, uint64_t b) \
 {								\
     a =							        \
         (((a >> SH0) & 0xff) << 0) | (((b >> SH0) & 0xff) << 8) |	\
@@ -176,7 +177,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, b)))(uint64_t a, uint64_t b) \
         NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);		\
     return a;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(CPUState *env, \
+                                        uint64_t a, uint64_t b) \
 {								\
     a =							        \
         (((a >> SH0) & 0xffff) << 0) |				\
@@ -188,7 +190,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, w)))(uint64_t a, uint64_t b) \
         NZBIT8(a >> 32, 2) | NZBIT8(a >> 48, 3);		\
     return a;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(CPUState *env, \
+                                        uint64_t a, uint64_t b) \
 {								\
     a =							        \
         (((a >> SH0) & 0xffffffff) << 0) |			\
@@ -197,7 +200,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, l)))(uint64_t a, uint64_t b) \
         NZBIT32(a >> 0, 0) | NZBIT32(a >> 32, 1);		\
     return a;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x =							        \
         (((x >> SH0) & 0xff) << 0) |				\
@@ -209,7 +213,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ub)))(uint64_t x)   \
         NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);		\
     return x;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x =							        \
         (((x >> SH0) & 0xffff) << 0) |				\
@@ -218,13 +223,15 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, uw)))(uint64_t x)   \
         NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);		\
     return x;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, ul)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x = (((x >> SH0) & 0xffffffff) << 0);			\
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0);	\
     return x;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x =							        \
         ((uint64_t) EXTEND8H((x >> SH0) & 0xff) << 0) |	        \
@@ -236,7 +243,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sb)))(uint64_t x)   \
         NZBIT16(x >> 32, 2) | NZBIT16(x >> 48, 3);		\
     return x;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x =							        \
         ((uint64_t) EXTEND16((x >> SH0) & 0xffff) << 0) |	\
@@ -245,7 +253,8 @@ uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sw)))(uint64_t x)   \
         NZBIT32(x >> 0, 0) | NZBIT32(x >> 32, 1);		\
     return x;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(uint64_t x)   \
+uint64_t HELPER(glue(iwmmxt_unpack, glue(S, sl)))(CPUState *env, \
+                                                  uint64_t x)   \
 {								\
     x = EXTEND32((x >> SH0) & 0xffffffff);			\
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x >> 0);	\
@@ -255,7 +264,8 @@ IWMMXT_OP_UNPACK(l, 0, 8, 16, 24)
 IWMMXT_OP_UNPACK(h, 32, 40, 48, 56)
 
 #define IWMMXT_OP_CMP(SUFF, Tb, Tw, Tl, O)			\
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(CPUState *env,    \
+                                        uint64_t a, uint64_t b) \
 {								\
     a =							        \
         CMP(0, Tb, O, 0xff) | CMP(8, Tb, O, 0xff) |		\
@@ -269,7 +279,8 @@ uint64_t HELPER(glue(iwmmxt_, glue(SUFF, b)))(uint64_t a, uint64_t b) \
         NZBIT8(a >> 48, 6) | NZBIT8(a >> 56, 7);		\
     return a;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(CPUState *env,    \
+                                        uint64_t a, uint64_t b) \
 {								\
     a = CMP(0, Tw, O, 0xffff) | CMP(16, Tw, O, 0xffff) |	\
         CMP(32, Tw, O, 0xffff) | CMP(48, Tw, O, 0xffff);	\
@@ -278,7 +289,8 @@ uint64_t HELPER(glue(iwmmxt_, glue(SUFF, w)))(uint64_t a, uint64_t b) \
         NZBIT16(a >> 32, 2) | NZBIT16(a >> 48, 3);		\
     return a;                                                   \
 }								\
-uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(uint64_t a, uint64_t b) \
+uint64_t HELPER(glue(iwmmxt_, glue(SUFF, l)))(CPUState *env,    \
+                                        uint64_t a, uint64_t b) \
 {								\
     a = CMP(0, Tl, O, 0xffffffff) |				\
         CMP(32, Tl, O, 0xffffffff);				\
@@ -317,7 +329,7 @@ IWMMXT_OP_CMP(adds, int8_t, int16_t, int32_t, +)
 #define AVGB(SHR) ((( \
         ((a >> SHR) & 0xff) + ((b >> SHR) & 0xff) + round) >> 1) << SHR)
 #define IWMMXT_OP_AVGB(r)                                                 \
-uint64_t HELPER(iwmmxt_avgb##r)(uint64_t a, uint64_t b)                   \
+uint64_t HELPER(iwmmxt_avgb##r)(CPUState *env, uint64_t a, uint64_t b)    \
 {                                                                         \
     const int round = r;                                                  \
     a = AVGB(0) | AVGB(8) | AVGB(16) | AVGB(24) |                         \
@@ -341,7 +353,7 @@ IWMMXT_OP_AVGB(1)
 #define AVGW(SHR) ((( \
         ((a >> SHR) & 0xffff) + ((b >> SHR) & 0xffff) + round) >> 1) << SHR)
 #define IWMMXT_OP_AVGW(r)                                               \
-uint64_t HELPER(iwmmxt_avgw##r)(uint64_t a, uint64_t b)                 \
+uint64_t HELPER(iwmmxt_avgw##r)(CPUState *env, uint64_t a, uint64_t b)  \
 {                                                                       \
     const int round = r;                                                \
     a = AVGW(0) | AVGW(16) | AVGW(32) | AVGW(48);                       \
@@ -452,7 +464,7 @@ uint32_t HELPER(iwmmxt_msbl)(uint64_t x)
 }
 
 /* FIXME: Split wCASF setting into a separate op to avoid env use.  */
-uint64_t HELPER(iwmmxt_srlw)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_srlw)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (((x & (0xffffll << 0)) >> n) & (0xffffll << 0)) |
         (((x & (0xffffll << 16)) >> n) & (0xffffll << 16)) |
@@ -464,7 +476,7 @@ uint64_t HELPER(iwmmxt_srlw)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_srll)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_srll)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = ((x & (0xffffffffll << 0)) >> n) |
         ((x >> n) & (0xffffffffll << 32));
@@ -473,14 +485,14 @@ uint64_t HELPER(iwmmxt_srll)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_srlq)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_srlq)(CPUState *env, uint64_t x, uint32_t n)
 {
     x >>= n;
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
     return x;
 }
 
-uint64_t HELPER(iwmmxt_sllw)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_sllw)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (((x & (0xffffll << 0)) << n) & (0xffffll << 0)) |
         (((x & (0xffffll << 16)) << n) & (0xffffll << 16)) |
@@ -492,7 +504,7 @@ uint64_t HELPER(iwmmxt_sllw)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_slll)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_slll)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = ((x << n) & (0xffffffffll << 0)) |
         ((x & (0xffffffffll << 32)) << n);
@@ -501,14 +513,14 @@ uint64_t HELPER(iwmmxt_slll)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_sllq)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_sllq)(CPUState *env, uint64_t x, uint32_t n)
 {
     x <<= n;
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
     return x;
 }
 
-uint64_t HELPER(iwmmxt_sraw)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_sraw)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = ((uint64_t) ((EXTEND16(x >> 0) >> n) & 0xffff) << 0) |
         ((uint64_t) ((EXTEND16(x >> 16) >> n) & 0xffff) << 16) |
@@ -520,7 +532,7 @@ uint64_t HELPER(iwmmxt_sraw)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_sral)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_sral)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (((EXTEND32(x >> 0) >> n) & 0xffffffff) << 0) |
         (((EXTEND32(x >> 32) >> n) & 0xffffffff) << 32);
@@ -529,14 +541,14 @@ uint64_t HELPER(iwmmxt_sral)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_sraq)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_sraq)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (int64_t) x >> n;
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
     return x;
 }
 
-uint64_t HELPER(iwmmxt_rorw)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_rorw)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = ((((x & (0xffffll << 0)) >> n) |
           ((x & (0xffffll << 0)) << (16 - n))) & (0xffffll << 0)) |
@@ -552,7 +564,7 @@ uint64_t HELPER(iwmmxt_rorw)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_rorl)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_rorl)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = ((x & (0xffffffffll << 0)) >> n) |
         ((x >> n) & (0xffffffffll << 32)) |
@@ -563,14 +575,14 @@ uint64_t HELPER(iwmmxt_rorl)(uint64_t x, uint32_t n)
     return x;
 }
 
-uint64_t HELPER(iwmmxt_rorq)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_rorq)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (x >> n) | (x << (64 - n));
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] = NZBIT64(x);
     return x;
 }
 
-uint64_t HELPER(iwmmxt_shufh)(uint64_t x, uint32_t n)
+uint64_t HELPER(iwmmxt_shufh)(CPUState *env, uint64_t x, uint32_t n)
 {
     x = (((x >> ((n << 4) & 0x30)) & 0xffff) << 0) |
         (((x >> ((n << 2) & 0x30)) & 0xffff) << 16) |
@@ -583,7 +595,7 @@ uint64_t HELPER(iwmmxt_shufh)(uint64_t x, uint32_t n)
 }
 
 /* TODO: Unsigned-Saturation */
-uint64_t HELPER(iwmmxt_packuw)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packuw)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
         (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
@@ -597,7 +609,7 @@ uint64_t HELPER(iwmmxt_packuw)(uint64_t a, uint64_t b)
     return a;
 }
 
-uint64_t HELPER(iwmmxt_packul)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packul)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
         (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
@@ -607,7 +619,7 @@ uint64_t HELPER(iwmmxt_packul)(uint64_t a, uint64_t b)
     return a;
 }
 
-uint64_t HELPER(iwmmxt_packuq)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packuq)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
@@ -616,7 +628,7 @@ uint64_t HELPER(iwmmxt_packuq)(uint64_t a, uint64_t b)
 }
 
 /* TODO: Signed-Saturation */
-uint64_t HELPER(iwmmxt_packsw)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packsw)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (((a >> 0) & 0xff) << 0) | (((a >> 16) & 0xff) << 8) |
         (((a >> 32) & 0xff) << 16) | (((a >> 48) & 0xff) << 24) |
@@ -630,7 +642,7 @@ uint64_t HELPER(iwmmxt_packsw)(uint64_t a, uint64_t b)
     return a;
 }
 
-uint64_t HELPER(iwmmxt_packsl)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packsl)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (((a >> 0) & 0xffff) << 0) | (((a >> 32) & 0xffff) << 16) |
         (((b >> 0) & 0xffff) << 32) | (((b >> 32) & 0xffff) << 48);
@@ -640,7 +652,7 @@ uint64_t HELPER(iwmmxt_packsl)(uint64_t a, uint64_t b)
     return a;
 }
 
-uint64_t HELPER(iwmmxt_packsq)(uint64_t a, uint64_t b)
+uint64_t HELPER(iwmmxt_packsq)(CPUState *env, uint64_t a, uint64_t b)
 {
     a = (a & 0xffffffff) | ((b & 0xffffffff) << 32);
     env->iwmmxt.cregs[ARM_IWMMXT_wCASF] =
diff --git a/target-arm/machine.c b/target-arm/machine.c
index a18b7dc67f..7d4fc545a6 100644
--- a/target-arm/machine.c
+++ b/target-arm/machine.c
@@ -44,6 +44,12 @@ void cpu_save(QEMUFile *f, void *opaque)
     qemu_put_be32(f, env->cp15.c7_par);
     qemu_put_be32(f, env->cp15.c9_insn);
     qemu_put_be32(f, env->cp15.c9_data);
+    qemu_put_be32(f, env->cp15.c9_pmcr);
+    qemu_put_be32(f, env->cp15.c9_pmcnten);
+    qemu_put_be32(f, env->cp15.c9_pmovsr);
+    qemu_put_be32(f, env->cp15.c9_pmxevtyper);
+    qemu_put_be32(f, env->cp15.c9_pmuserenr);
+    qemu_put_be32(f, env->cp15.c9_pminten);
     qemu_put_be32(f, env->cp15.c13_fcse);
     qemu_put_be32(f, env->cp15.c13_context);
     qemu_put_be32(f, env->cp15.c13_tls1);
@@ -152,6 +158,12 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
     env->cp15.c7_par = qemu_get_be32(f);
     env->cp15.c9_insn = qemu_get_be32(f);
     env->cp15.c9_data = qemu_get_be32(f);
+    env->cp15.c9_pmcr = qemu_get_be32(f);
+    env->cp15.c9_pmcnten = qemu_get_be32(f);
+    env->cp15.c9_pmovsr = qemu_get_be32(f);
+    env->cp15.c9_pmxevtyper = qemu_get_be32(f);
+    env->cp15.c9_pmuserenr = qemu_get_be32(f);
+    env->cp15.c9_pminten = qemu_get_be32(f);
     env->cp15.c13_fcse = qemu_get_be32(f);
     env->cp15.c13_context = qemu_get_be32(f);
     env->cp15.c13_tls1 = qemu_get_be32(f);
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 9165519236..28306279a8 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -10,7 +10,7 @@
 #include <stdio.h>
 
 #include "cpu.h"
-#include "exec.h"
+#include "exec-all.h"
 #include "helper.h"
 
 #define SIGNBIT (uint32_t)0x80000000
@@ -18,8 +18,6 @@
 
 #define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
 
-#define NFS (&env->vfp.standard_fp_status)
-
 #define NEON_TYPE1(name, type) \
 typedef struct \
 { \
@@ -115,6 +113,10 @@ NEON_TYPE1(u32, uint32_t)
 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
 NEON_VOP_BODY(vtype, n)
 
+#define NEON_VOP_ENV(name, vtype, n) \
+uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
+NEON_VOP_BODY(vtype, n)
+
 /* Pairwise operations.  */
 /* For 32-bit elements each segment only contains a single element, so
    the elementwise and pairwise operations are the same.  */
@@ -163,14 +165,14 @@ uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
         dest = tmp; \
     }} while(0)
 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
-NEON_VOP(qadd_u8, neon_u8, 4)
+NEON_VOP_ENV(qadd_u8, neon_u8, 4)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
-NEON_VOP(qadd_u16, neon_u16, 2)
+NEON_VOP_ENV(qadd_u16, neon_u16, 2)
 #undef NEON_FN
 #undef NEON_USAT
 
-uint32_t HELPER(neon_qadd_u32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_qadd_u32)(CPUState *env, uint32_t a, uint32_t b)
 {
     uint32_t res = a + b;
     if (res < a) {
@@ -180,7 +182,7 @@ uint32_t HELPER(neon_qadd_u32)(uint32_t a, uint32_t b)
     return res;
 }
 
-uint64_t HELPER(neon_qadd_u64)(uint64_t src1, uint64_t src2)
+uint64_t HELPER(neon_qadd_u64)(CPUState *env, uint64_t src1, uint64_t src2)
 {
     uint64_t res;
 
@@ -205,14 +207,14 @@ uint64_t HELPER(neon_qadd_u64)(uint64_t src1, uint64_t src2)
     dest = tmp; \
     } while(0)
 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
-NEON_VOP(qadd_s8, neon_s8, 4)
+NEON_VOP_ENV(qadd_s8, neon_s8, 4)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
-NEON_VOP(qadd_s16, neon_s16, 2)
+NEON_VOP_ENV(qadd_s16, neon_s16, 2)
 #undef NEON_FN
 #undef NEON_SSAT
 
-uint32_t HELPER(neon_qadd_s32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_qadd_s32)(CPUState *env, uint32_t a, uint32_t b)
 {
     uint32_t res = a + b;
     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
@@ -222,7 +224,7 @@ uint32_t HELPER(neon_qadd_s32)(uint32_t a, uint32_t b)
     return res;
 }
 
-uint64_t HELPER(neon_qadd_s64)(uint64_t src1, uint64_t src2)
+uint64_t HELPER(neon_qadd_s64)(CPUState *env, uint64_t src1, uint64_t src2)
 {
     uint64_t res;
 
@@ -243,14 +245,14 @@ uint64_t HELPER(neon_qadd_s64)(uint64_t src1, uint64_t src2)
         dest = tmp; \
     }} while(0)
 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
-NEON_VOP(qsub_u8, neon_u8, 4)
+NEON_VOP_ENV(qsub_u8, neon_u8, 4)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
-NEON_VOP(qsub_u16, neon_u16, 2)
+NEON_VOP_ENV(qsub_u16, neon_u16, 2)
 #undef NEON_FN
 #undef NEON_USAT
 
-uint32_t HELPER(neon_qsub_u32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_qsub_u32)(CPUState *env, uint32_t a, uint32_t b)
 {
     uint32_t res = a - b;
     if (res > a) {
@@ -260,7 +262,7 @@ uint32_t HELPER(neon_qsub_u32)(uint32_t a, uint32_t b)
     return res;
 }
 
-uint64_t HELPER(neon_qsub_u64)(uint64_t src1, uint64_t src2)
+uint64_t HELPER(neon_qsub_u64)(CPUState *env, uint64_t src1, uint64_t src2)
 {
     uint64_t res;
 
@@ -286,14 +288,14 @@ uint64_t HELPER(neon_qsub_u64)(uint64_t src1, uint64_t src2)
     dest = tmp; \
     } while(0)
 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
-NEON_VOP(qsub_s8, neon_s8, 4)
+NEON_VOP_ENV(qsub_s8, neon_s8, 4)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
-NEON_VOP(qsub_s16, neon_s16, 2)
+NEON_VOP_ENV(qsub_s16, neon_s16, 2)
 #undef NEON_FN
 #undef NEON_SSAT
 
-uint32_t HELPER(neon_qsub_s32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_qsub_s32)(CPUState *env, uint32_t a, uint32_t b)
 {
     uint32_t res = a - b;
     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
@@ -303,7 +305,7 @@ uint32_t HELPER(neon_qsub_s32)(uint32_t a, uint32_t b)
     return res;
 }
 
-uint64_t HELPER(neon_qsub_s64)(uint64_t src1, uint64_t src2)
+uint64_t HELPER(neon_qsub_s64)(CPUState *env, uint64_t src1, uint64_t src2)
 {
     uint64_t res;
 
@@ -654,12 +656,12 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
             dest = ~0; \
         } \
     }} while (0)
-NEON_VOP(qshl_u8, neon_u8, 4)
-NEON_VOP(qshl_u16, neon_u16, 2)
-NEON_VOP(qshl_u32, neon_u32, 1)
+NEON_VOP_ENV(qshl_u8, neon_u8, 4)
+NEON_VOP_ENV(qshl_u16, neon_u16, 2)
+NEON_VOP_ENV(qshl_u32, neon_u32, 1)
 #undef NEON_FN
 
-uint64_t HELPER(neon_qshl_u64)(uint64_t val, uint64_t shiftop)
+uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift >= 64) {
@@ -709,12 +711,12 @@ uint64_t HELPER(neon_qshl_u64)(uint64_t val, uint64_t shiftop)
             } \
         } \
     }} while (0)
-NEON_VOP(qshl_s8, neon_s8, 4)
-NEON_VOP(qshl_s16, neon_s16, 2)
-NEON_VOP(qshl_s32, neon_s32, 1)
+NEON_VOP_ENV(qshl_s8, neon_s8, 4)
+NEON_VOP_ENV(qshl_s16, neon_s16, 2)
+NEON_VOP_ENV(qshl_s32, neon_s32, 1)
 #undef NEON_FN
 
-uint64_t HELPER(neon_qshl_s64)(uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
     int64_t val = valop;
@@ -764,26 +766,26 @@ uint64_t HELPER(neon_qshl_s64)(uint64_t valop, uint64_t shiftop)
             } \
         } \
     }} while (0)
-NEON_VOP(qshlu_s8, neon_u8, 4)
-NEON_VOP(qshlu_s16, neon_u16, 2)
+NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
+NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
 #undef NEON_FN
 
-uint32_t HELPER(neon_qshlu_s32)(uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_qshlu_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
 {
     if ((int32_t)valop < 0) {
         SET_QC();
         return 0;
     }
-    return helper_neon_qshl_u32(valop, shiftop);
+    return helper_neon_qshl_u32(env, valop, shiftop);
 }
 
-uint64_t HELPER(neon_qshlu_s64)(uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
     if ((int64_t)valop < 0) {
         SET_QC();
         return 0;
     }
-    return helper_neon_qshl_u64(valop, shiftop);
+    return helper_neon_qshl_u64(env, valop, shiftop);
 }
 
 /* FIXME: This is wrong.  */
@@ -810,13 +812,13 @@ uint64_t HELPER(neon_qshlu_s64)(uint64_t valop, uint64_t shiftop)
             dest = ~0; \
         } \
     }} while (0)
-NEON_VOP(qrshl_u8, neon_u8, 4)
-NEON_VOP(qrshl_u16, neon_u16, 2)
+NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
+NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
 #undef NEON_FN
 
 /* The addition of the rounding constant may overflow, so we use an
  * intermediate 64 bits accumulator.  */
-uint32_t HELPER(neon_qrshl_u32)(uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
 {
     uint32_t dest;
     int8_t shift = (int8_t)shiftop;
@@ -846,7 +848,7 @@ uint32_t HELPER(neon_qrshl_u32)(uint32_t val, uint32_t shiftop)
 
 /* Handling addition overflow with 64 bits inputs values is more
  * tricky than with 32 bits values.  */
-uint64_t HELPER(neon_qrshl_u64)(uint64_t val, uint64_t shiftop)
+uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift >= 64) {
@@ -907,13 +909,13 @@ uint64_t HELPER(neon_qrshl_u64)(uint64_t val, uint64_t shiftop)
             } \
         } \
     }} while (0)
-NEON_VOP(qrshl_s8, neon_s8, 4)
-NEON_VOP(qrshl_s16, neon_s16, 2)
+NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
+NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
 #undef NEON_FN
 
 /* The addition of the rounding constant may overflow, so we use an
  * intermediate 64 bits accumulator.  */
-uint32_t HELPER(neon_qrshl_s32)(uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
 {
     int32_t dest;
     int32_t val = (int32_t)valop;
@@ -942,7 +944,7 @@ uint32_t HELPER(neon_qrshl_s32)(uint32_t valop, uint32_t shiftop)
 
 /* Handling addition overflow with 64 bits inputs values is more
  * tricky than with 32 bits values.  */
-uint64_t HELPER(neon_qrshl_s64)(uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
     int64_t val = valop;
@@ -1151,10 +1153,10 @@ uint32_t HELPER(neon_cnt_u8)(uint32_t x)
     dest = tmp >> 16; \
     } while(0)
 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
-NEON_VOP(qdmulh_s16, neon_s16, 2)
+NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
-NEON_VOP(qrdmulh_s16, neon_s16, 2)
+NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
 #undef NEON_FN
 #undef NEON_QDMULH16
 
@@ -1177,10 +1179,10 @@ NEON_VOP(qrdmulh_s16, neon_s16, 2)
     dest = tmp >> 32; \
     } while(0)
 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
-NEON_VOP(qdmulh_s32, neon_s32, 1)
+NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
 #undef NEON_FN
 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
-NEON_VOP(qrdmulh_s32, neon_s32, 1)
+NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
 #undef NEON_FN
 #undef NEON_QDMULH32
 
@@ -1221,7 +1223,7 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 }
 
-uint32_t HELPER(neon_unarrow_sat8)(uint64_t x)
+uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -1248,7 +1250,7 @@ uint32_t HELPER(neon_unarrow_sat8)(uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_u8)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -1271,7 +1273,7 @@ uint32_t HELPER(neon_narrow_sat_u8)(uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_s8)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
 {
     int16_t s;
     uint8_t d;
@@ -1294,7 +1296,7 @@ uint32_t HELPER(neon_narrow_sat_s8)(uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_unarrow_sat16)(uint64_t x)
+uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -1317,7 +1319,7 @@ uint32_t HELPER(neon_unarrow_sat16)(uint64_t x)
     return low | (high << 16);
 }
 
-uint32_t HELPER(neon_narrow_sat_u16)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -1334,7 +1336,7 @@ uint32_t HELPER(neon_narrow_sat_u16)(uint64_t x)
     return low | (high << 16);
 }
 
-uint32_t HELPER(neon_narrow_sat_s16)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
 {
     int32_t low;
     int32_t high;
@@ -1351,7 +1353,7 @@ uint32_t HELPER(neon_narrow_sat_s16)(uint64_t x)
     return (uint16_t)low | (high << 16);
 }
 
-uint32_t HELPER(neon_unarrow_sat32)(uint64_t x)
+uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
 {
     if (x & 0x8000000000000000ull) {
         SET_QC();
@@ -1364,7 +1366,7 @@ uint32_t HELPER(neon_unarrow_sat32)(uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_u32)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
 {
     if (x > 0xffffffffu) {
         SET_QC();
@@ -1373,7 +1375,7 @@ uint32_t HELPER(neon_narrow_sat_u32)(uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_s32)(uint64_t x)
+uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
 {
     if ((int64_t)x != (int32_t)x) {
         SET_QC();
@@ -1480,7 +1482,7 @@ uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
     return (a - b) ^ mask;
 }
 
-uint64_t HELPER(neon_addl_saturate_s32)(uint64_t a, uint64_t b)
+uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
 {
     uint32_t x, y;
     uint32_t low, high;
@@ -1502,7 +1504,7 @@ uint64_t HELPER(neon_addl_saturate_s32)(uint64_t a, uint64_t b)
     return low | ((uint64_t)high << 32);
 }
 
-uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b)
+uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
 {
     uint64_t result;
 
@@ -1678,7 +1680,7 @@ uint64_t HELPER(neon_negl_u64)(uint64_t x)
     } else if (x < 0) { \
         x = -x; \
     }} while (0)
-uint32_t HELPER(neon_qabs_s8)(uint32_t x)
+uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
 {
     neon_s8 vec;
     NEON_UNPACK(neon_s8, vec, x);
@@ -1698,7 +1700,7 @@ uint32_t HELPER(neon_qabs_s8)(uint32_t x)
     } else { \
         x = -x; \
     }} while (0)
-uint32_t HELPER(neon_qneg_s8)(uint32_t x)
+uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
 {
     neon_s8 vec;
     NEON_UNPACK(neon_s8, vec, x);
@@ -1718,7 +1720,7 @@ uint32_t HELPER(neon_qneg_s8)(uint32_t x)
     } else if (x < 0) { \
         x = -x; \
     }} while (0)
-uint32_t HELPER(neon_qabs_s16)(uint32_t x)
+uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
 {
     neon_s16 vec;
     NEON_UNPACK(neon_s16, vec, x);
@@ -1736,7 +1738,7 @@ uint32_t HELPER(neon_qabs_s16)(uint32_t x)
     } else { \
         x = -x; \
     }} while (0)
-uint32_t HELPER(neon_qneg_s16)(uint32_t x)
+uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
 {
     neon_s16 vec;
     NEON_UNPACK(neon_s16, vec, x);
@@ -1747,7 +1749,7 @@ uint32_t HELPER(neon_qneg_s16)(uint32_t x)
 }
 #undef DO_QNEG16
 
-uint32_t HELPER(neon_qabs_s32)(uint32_t x)
+uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
 {
     if (x == SIGNBIT) {
         SET_QC();
@@ -1758,7 +1760,7 @@ uint32_t HELPER(neon_qabs_s32)(uint32_t x)
     return x;
 }
 
-uint32_t HELPER(neon_qneg_s32)(uint32_t x)
+uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
 {
     if (x == SIGNBIT) {
         SET_QC();
@@ -1770,74 +1772,67 @@ uint32_t HELPER(neon_qneg_s32)(uint32_t x)
 }
 
 /* NEON Float helpers.  */
-uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
-    return float32_val(float32_min(make_float32(a), make_float32(b), NFS));
+    float_status *fpst = fpstp;
+    return float32_val(float32_min(make_float32(a), make_float32(b), fpst));
 }
 
-uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
-    return float32_val(float32_max(make_float32(a), make_float32(b), NFS));
+    float_status *fpst = fpstp;
+    return float32_val(float32_max(make_float32(a), make_float32(b), fpst));
 }
 
-uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
+    float_status *fpst = fpstp;
     float32 f0 = make_float32(a);
     float32 f1 = make_float32(b);
-    return float32_val(float32_abs(float32_sub(f0, f1, NFS)));
-}
-
-uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
-{
-    return float32_val(float32_add(make_float32(a), make_float32(b), NFS));
-}
-
-uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
-{
-    return float32_val(float32_sub(make_float32(a), make_float32(b), NFS));
-}
-
-uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
-{
-    return float32_val(float32_mul(make_float32(a), make_float32(b), NFS));
+    return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
 }
 
 /* Floating point comparisons produce an integer result.
  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
  */
-uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
-    return -float32_eq_quiet(make_float32(a), make_float32(b), NFS);
+    float_status *fpst = fpstp;
+    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
 }
 
-uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
-    return -float32_le(make_float32(b), make_float32(a), NFS);
+    float_status *fpst = fpstp;
+    return -float32_le(make_float32(b), make_float32(a), fpst);
 }
 
-uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
-    return -float32_lt(make_float32(b), make_float32(a), NFS);
+    float_status *fpst = fpstp;
+    return -float32_lt(make_float32(b), make_float32(a), fpst);
 }
 
-uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
+    float_status *fpst = fpstp;
     float32 f0 = float32_abs(make_float32(a));
     float32 f1 = float32_abs(make_float32(b));
-    return -float32_le(f1, f0, NFS);
+    return -float32_le(f1, f0, fpst);
 }
 
-uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
+uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
 {
+    float_status *fpst = fpstp;
     float32 f0 = float32_abs(make_float32(a));
     float32 f1 = float32_abs(make_float32(b));
-    return -float32_lt(f1, f0, NFS);
+    return -float32_lt(f1, f0, fpst);
 }
 
 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
 
-void HELPER(neon_qunzip8)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qunzip8)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1865,7 +1860,7 @@ void HELPER(neon_qunzip8)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_qunzip16)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qunzip16)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1885,7 +1880,7 @@ void HELPER(neon_qunzip16)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_qunzip32)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qunzip32)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1901,7 +1896,7 @@ void HELPER(neon_qunzip32)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_unzip8)(uint32_t rd, uint32_t rm)
+void HELPER(neon_unzip8)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm = float64_val(env->vfp.regs[rm]);
     uint64_t zd = float64_val(env->vfp.regs[rd]);
@@ -1917,7 +1912,7 @@ void HELPER(neon_unzip8)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd] = make_float64(d0);
 }
 
-void HELPER(neon_unzip16)(uint32_t rd, uint32_t rm)
+void HELPER(neon_unzip16)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm = float64_val(env->vfp.regs[rm]);
     uint64_t zd = float64_val(env->vfp.regs[rd]);
@@ -1929,7 +1924,7 @@ void HELPER(neon_unzip16)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd] = make_float64(d0);
 }
 
-void HELPER(neon_qzip8)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qzip8)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1957,7 +1952,7 @@ void HELPER(neon_qzip8)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_qzip16)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qzip16)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1977,7 +1972,7 @@ void HELPER(neon_qzip16)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_qzip32)(uint32_t rd, uint32_t rm)
+void HELPER(neon_qzip32)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
@@ -1993,7 +1988,7 @@ void HELPER(neon_qzip32)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd + 1] = make_float64(d1);
 }
 
-void HELPER(neon_zip8)(uint32_t rd, uint32_t rm)
+void HELPER(neon_zip8)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm = float64_val(env->vfp.regs[rm]);
     uint64_t zd = float64_val(env->vfp.regs[rd]);
@@ -2009,7 +2004,7 @@ void HELPER(neon_zip8)(uint32_t rd, uint32_t rm)
     env->vfp.regs[rd] = make_float64(d0);
 }
 
-void HELPER(neon_zip16)(uint32_t rd, uint32_t rm)
+void HELPER(neon_zip16)(CPUState *env, uint32_t rd, uint32_t rm)
 {
     uint64_t zm = float64_val(env->vfp.regs[rm]);
     uint64_t zd = float64_val(env->vfp.regs[rd]);
diff --git a/target-arm/translate.c b/target-arm/translate.c
index badbc5ff31..34d5e6ef1a 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -892,13 +892,29 @@ static inline void gen_add_datah_offset(DisasContext *s, unsigned int insn,
     }
 }
 
+static TCGv_ptr get_fpstatus_ptr(int neon)
+{
+    TCGv_ptr statusptr = tcg_temp_new_ptr();
+    int offset;
+    if (neon) {
+        offset = offsetof(CPUState, vfp.standard_fp_status);
+    } else {
+        offset = offsetof(CPUState, vfp.fp_status);
+    }
+    tcg_gen_addi_ptr(statusptr, cpu_env, offset);
+    return statusptr;
+}
+
 #define VFP_OP2(name)                                                 \
 static inline void gen_vfp_##name(int dp)                             \
 {                                                                     \
-    if (dp)                                                           \
-        gen_helper_vfp_##name##d(cpu_F0d, cpu_F0d, cpu_F1d, cpu_env); \
-    else                                                              \
-        gen_helper_vfp_##name##s(cpu_F0s, cpu_F0s, cpu_F1s, cpu_env); \
+    TCGv_ptr fpst = get_fpstatus_ptr(0);                              \
+    if (dp) {                                                         \
+        gen_helper_vfp_##name##d(cpu_F0d, cpu_F0d, cpu_F1d, fpst);    \
+    } else {                                                          \
+        gen_helper_vfp_##name##s(cpu_F0s, cpu_F0s, cpu_F1s, fpst);    \
+    }                                                                 \
+    tcg_temp_free_ptr(fpst);                                          \
 }
 
 VFP_OP2(add)
@@ -911,11 +927,13 @@ VFP_OP2(div)
 static inline void gen_vfp_F1_mul(int dp)
 {
     /* Like gen_vfp_mul() but put result in F1 */
+    TCGv_ptr fpst = get_fpstatus_ptr(0);
     if (dp) {
-        gen_helper_vfp_muld(cpu_F1d, cpu_F0d, cpu_F1d, cpu_env);
+        gen_helper_vfp_muld(cpu_F1d, cpu_F0d, cpu_F1d, fpst);
     } else {
-        gen_helper_vfp_muls(cpu_F1s, cpu_F0s, cpu_F1s, cpu_env);
+        gen_helper_vfp_muls(cpu_F1s, cpu_F0s, cpu_F1s, fpst);
     }
+    tcg_temp_free_ptr(fpst);
 }
 
 static inline void gen_vfp_F1_neg(int dp)
@@ -979,14 +997,7 @@ static inline void gen_vfp_F1_ld0(int dp)
 #define VFP_GEN_ITOF(name) \
 static inline void gen_vfp_##name(int dp, int neon) \
 { \
-    TCGv_ptr statusptr = tcg_temp_new_ptr(); \
-    int offset; \
-    if (neon) { \
-        offset = offsetof(CPUState, vfp.standard_fp_status); \
-    } else { \
-        offset = offsetof(CPUState, vfp.fp_status); \
-    } \
-    tcg_gen_addi_ptr(statusptr, cpu_env, offset); \
+    TCGv_ptr statusptr = get_fpstatus_ptr(neon); \
     if (dp) { \
         gen_helper_vfp_##name##d(cpu_F0d, cpu_F0s, statusptr); \
     } else { \
@@ -1002,14 +1013,7 @@ VFP_GEN_ITOF(sito)
 #define VFP_GEN_FTOI(name) \
 static inline void gen_vfp_##name(int dp, int neon) \
 { \
-    TCGv_ptr statusptr = tcg_temp_new_ptr(); \
-    int offset; \
-    if (neon) { \
-        offset = offsetof(CPUState, vfp.standard_fp_status); \
-    } else { \
-        offset = offsetof(CPUState, vfp.fp_status); \
-    } \
-    tcg_gen_addi_ptr(statusptr, cpu_env, offset); \
+    TCGv_ptr statusptr = get_fpstatus_ptr(neon); \
     if (dp) { \
         gen_helper_vfp_##name##d(cpu_F0s, cpu_F0d, statusptr); \
     } else { \
@@ -1028,14 +1032,7 @@ VFP_GEN_FTOI(tosiz)
 static inline void gen_vfp_##name(int dp, int shift, int neon) \
 { \
     TCGv tmp_shift = tcg_const_i32(shift); \
-    TCGv_ptr statusptr = tcg_temp_new_ptr(); \
-    int offset; \
-    if (neon) { \
-        offset = offsetof(CPUState, vfp.standard_fp_status); \
-    } else { \
-        offset = offsetof(CPUState, vfp.fp_status); \
-    } \
-    tcg_gen_addi_ptr(statusptr, cpu_env, offset); \
+    TCGv_ptr statusptr = get_fpstatus_ptr(neon); \
     if (dp) { \
         gen_helper_vfp_##name##d(cpu_F0d, cpu_F0d, tmp_shift, statusptr); \
     } else { \
@@ -1206,15 +1203,22 @@ static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \
     gen_helper_iwmmxt_##name(cpu_M0, cpu_M0, cpu_V1); \
 }
 
-#define IWMMXT_OP_SIZE(name) \
-IWMMXT_OP(name##b) \
-IWMMXT_OP(name##w) \
-IWMMXT_OP(name##l)
+#define IWMMXT_OP_ENV(name) \
+static inline void gen_op_iwmmxt_##name##_M0_wRn(int rn) \
+{ \
+    iwmmxt_load_reg(cpu_V1, rn); \
+    gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0, cpu_V1); \
+}
 
-#define IWMMXT_OP_1(name) \
+#define IWMMXT_OP_ENV_SIZE(name) \
+IWMMXT_OP_ENV(name##b) \
+IWMMXT_OP_ENV(name##w) \
+IWMMXT_OP_ENV(name##l)
+
+#define IWMMXT_OP_ENV1(name) \
 static inline void gen_op_iwmmxt_##name##_M0(void) \
 { \
-    gen_helper_iwmmxt_##name(cpu_M0, cpu_M0); \
+    gen_helper_iwmmxt_##name(cpu_M0, cpu_env, cpu_M0); \
 }
 
 IWMMXT_OP(maddsq)
@@ -1228,51 +1232,51 @@ IWMMXT_OP(muluhw)
 IWMMXT_OP(macsw)
 IWMMXT_OP(macuw)
 
-IWMMXT_OP_SIZE(unpackl)
-IWMMXT_OP_SIZE(unpackh)
-
-IWMMXT_OP_1(unpacklub)
-IWMMXT_OP_1(unpackluw)
-IWMMXT_OP_1(unpacklul)
-IWMMXT_OP_1(unpackhub)
-IWMMXT_OP_1(unpackhuw)
-IWMMXT_OP_1(unpackhul)
-IWMMXT_OP_1(unpacklsb)
-IWMMXT_OP_1(unpacklsw)
-IWMMXT_OP_1(unpacklsl)
-IWMMXT_OP_1(unpackhsb)
-IWMMXT_OP_1(unpackhsw)
-IWMMXT_OP_1(unpackhsl)
-
-IWMMXT_OP_SIZE(cmpeq)
-IWMMXT_OP_SIZE(cmpgtu)
-IWMMXT_OP_SIZE(cmpgts)
-
-IWMMXT_OP_SIZE(mins)
-IWMMXT_OP_SIZE(minu)
-IWMMXT_OP_SIZE(maxs)
-IWMMXT_OP_SIZE(maxu)
-
-IWMMXT_OP_SIZE(subn)
-IWMMXT_OP_SIZE(addn)
-IWMMXT_OP_SIZE(subu)
-IWMMXT_OP_SIZE(addu)
-IWMMXT_OP_SIZE(subs)
-IWMMXT_OP_SIZE(adds)
-
-IWMMXT_OP(avgb0)
-IWMMXT_OP(avgb1)
-IWMMXT_OP(avgw0)
-IWMMXT_OP(avgw1)
+IWMMXT_OP_ENV_SIZE(unpackl)
+IWMMXT_OP_ENV_SIZE(unpackh)
+
+IWMMXT_OP_ENV1(unpacklub)
+IWMMXT_OP_ENV1(unpackluw)
+IWMMXT_OP_ENV1(unpacklul)
+IWMMXT_OP_ENV1(unpackhub)
+IWMMXT_OP_ENV1(unpackhuw)
+IWMMXT_OP_ENV1(unpackhul)
+IWMMXT_OP_ENV1(unpacklsb)
+IWMMXT_OP_ENV1(unpacklsw)
+IWMMXT_OP_ENV1(unpacklsl)
+IWMMXT_OP_ENV1(unpackhsb)
+IWMMXT_OP_ENV1(unpackhsw)
+IWMMXT_OP_ENV1(unpackhsl)
+
+IWMMXT_OP_ENV_SIZE(cmpeq)
+IWMMXT_OP_ENV_SIZE(cmpgtu)
+IWMMXT_OP_ENV_SIZE(cmpgts)
+
+IWMMXT_OP_ENV_SIZE(mins)
+IWMMXT_OP_ENV_SIZE(minu)
+IWMMXT_OP_ENV_SIZE(maxs)
+IWMMXT_OP_ENV_SIZE(maxu)
+
+IWMMXT_OP_ENV_SIZE(subn)
+IWMMXT_OP_ENV_SIZE(addn)
+IWMMXT_OP_ENV_SIZE(subu)
+IWMMXT_OP_ENV_SIZE(addu)
+IWMMXT_OP_ENV_SIZE(subs)
+IWMMXT_OP_ENV_SIZE(adds)
+
+IWMMXT_OP_ENV(avgb0)
+IWMMXT_OP_ENV(avgb1)
+IWMMXT_OP_ENV(avgw0)
+IWMMXT_OP_ENV(avgw1)
 
 IWMMXT_OP(msadb)
 
-IWMMXT_OP(packuw)
-IWMMXT_OP(packul)
-IWMMXT_OP(packuq)
-IWMMXT_OP(packsw)
-IWMMXT_OP(packsl)
-IWMMXT_OP(packsq)
+IWMMXT_OP_ENV(packuw)
+IWMMXT_OP_ENV(packul)
+IWMMXT_OP_ENV(packuq)
+IWMMXT_OP_ENV(packsw)
+IWMMXT_OP_ENV(packsl)
+IWMMXT_OP_ENV(packsq)
 
 static void gen_op_iwmmxt_set_mup(void)
 {
@@ -2006,13 +2010,13 @@ static int disas_iwmmxt_insn(CPUState *env, DisasContext *s, uint32_t insn)
         }
         switch ((insn >> 22) & 3) {
         case 1:
-            gen_helper_iwmmxt_srlw(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_srlw(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 2:
-            gen_helper_iwmmxt_srll(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_srll(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 3:
-            gen_helper_iwmmxt_srlq(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_srlq(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         }
         tcg_temp_free_i32(tmp);
@@ -2034,13 +2038,13 @@ static int disas_iwmmxt_insn(CPUState *env, DisasContext *s, uint32_t insn)
         }
         switch ((insn >> 22) & 3) {
         case 1:
-            gen_helper_iwmmxt_sraw(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_sraw(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 2:
-            gen_helper_iwmmxt_sral(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_sral(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 3:
-            gen_helper_iwmmxt_sraq(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_sraq(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         }
         tcg_temp_free_i32(tmp);
@@ -2062,13 +2066,13 @@ static int disas_iwmmxt_insn(CPUState *env, DisasContext *s, uint32_t insn)
         }
         switch ((insn >> 22) & 3) {
         case 1:
-            gen_helper_iwmmxt_sllw(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_sllw(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 2:
-            gen_helper_iwmmxt_slll(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_slll(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 3:
-            gen_helper_iwmmxt_sllq(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_sllq(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         }
         tcg_temp_free_i32(tmp);
@@ -2090,21 +2094,21 @@ static int disas_iwmmxt_insn(CPUState *env, DisasContext *s, uint32_t insn)
                 tcg_temp_free_i32(tmp);
                 return 1;
             }
-            gen_helper_iwmmxt_rorw(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_rorw(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 2:
             if (gen_iwmmxt_shift(insn, 0x1f, tmp)) {
                 tcg_temp_free_i32(tmp);
                 return 1;
             }
-            gen_helper_iwmmxt_rorl(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_rorl(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         case 3:
             if (gen_iwmmxt_shift(insn, 0x3f, tmp)) {
                 tcg_temp_free_i32(tmp);
                 return 1;
             }
-            gen_helper_iwmmxt_rorq(cpu_M0, cpu_M0, tmp);
+            gen_helper_iwmmxt_rorq(cpu_M0, cpu_env, cpu_M0, tmp);
             break;
         }
         tcg_temp_free_i32(tmp);
@@ -2238,7 +2242,7 @@ static int disas_iwmmxt_insn(CPUState *env, DisasContext *s, uint32_t insn)
         rd0 = (insn >> 16) & 0xf;
         gen_op_iwmmxt_movq_M0_wRn(rd0);
         tmp = tcg_const_i32(((insn >> 16) & 0xf0) | (insn & 0x0f));
-        gen_helper_iwmmxt_shufh(cpu_M0, cpu_M0, tmp);
+        gen_helper_iwmmxt_shufh(cpu_M0, cpu_env, cpu_M0, tmp);
         tcg_temp_free(tmp);
         gen_op_iwmmxt_movq_wRn_M0(wrd);
         gen_op_iwmmxt_set_mup();
@@ -2467,12 +2471,28 @@ static int disas_cp_insn(CPUState *env, DisasContext *s, uint32_t insn)
     return 0;
 }
 
-static int cp15_user_ok(uint32_t insn)
+static int cp15_user_ok(CPUState *env, uint32_t insn)
 {
     int cpn = (insn >> 16) & 0xf;
     int cpm = insn & 0xf;
     int op = ((insn >> 5) & 7) | ((insn >> 18) & 0x38);
 
+    if (arm_feature(env, ARM_FEATURE_V7) && cpn == 9) {
+        /* Performance monitor registers fall into three categories:
+         *  (a) always UNDEF in usermode
+         *  (b) UNDEF only if PMUSERENR.EN is 0
+         *  (c) always read OK and UNDEF on write (PMUSERENR only)
+         */
+        if ((cpm == 12 && (op < 6)) ||
+            (cpm == 13 && (op < 3))) {
+            return env->cp15.c9_pmuserenr;
+        } else if (cpm == 14 && op == 0 && (insn & ARM_CP_RW_BIT)) {
+            /* PMUSERENR, read only */
+            return 1;
+        }
+        return 0;
+    }
+
     if (cpn == 13 && cpm == 0) {
         /* TLS register.  */
         if (op == 2 || (op == 3 && (insn & ARM_CP_RW_BIT)))
@@ -2559,7 +2579,7 @@ static int disas_cp15_insn(CPUState *env, DisasContext *s, uint32_t insn)
         /* cdp */
         return 1;
     }
-    if (IS_USER(s) && !cp15_user_ok(insn)) {
+    if (IS_USER(s) && !cp15_user_ok(env, insn)) {
         return 1;
     }
 
@@ -3706,13 +3726,13 @@ static int gen_neon_unzip(int rd, int rm, int size, int q)
     if (q) {
         switch (size) {
         case 0:
-            gen_helper_neon_qunzip8(tmp, tmp2);
+            gen_helper_neon_qunzip8(cpu_env, tmp, tmp2);
             break;
         case 1:
-            gen_helper_neon_qunzip16(tmp, tmp2);
+            gen_helper_neon_qunzip16(cpu_env, tmp, tmp2);
             break;
         case 2:
-            gen_helper_neon_qunzip32(tmp, tmp2);
+            gen_helper_neon_qunzip32(cpu_env, tmp, tmp2);
             break;
         default:
             abort();
@@ -3720,10 +3740,10 @@ static int gen_neon_unzip(int rd, int rm, int size, int q)
     } else {
         switch (size) {
         case 0:
-            gen_helper_neon_unzip8(tmp, tmp2);
+            gen_helper_neon_unzip8(cpu_env, tmp, tmp2);
             break;
         case 1:
-            gen_helper_neon_unzip16(tmp, tmp2);
+            gen_helper_neon_unzip16(cpu_env, tmp, tmp2);
             break;
         default:
             abort();
@@ -3745,13 +3765,13 @@ static int gen_neon_zip(int rd, int rm, int size, int q)
     if (q) {
         switch (size) {
         case 0:
-            gen_helper_neon_qzip8(tmp, tmp2);
+            gen_helper_neon_qzip8(cpu_env, tmp, tmp2);
             break;
         case 1:
-            gen_helper_neon_qzip16(tmp, tmp2);
+            gen_helper_neon_qzip16(cpu_env, tmp, tmp2);
             break;
         case 2:
-            gen_helper_neon_qzip32(tmp, tmp2);
+            gen_helper_neon_qzip32(cpu_env, tmp, tmp2);
             break;
         default:
             abort();
@@ -3759,10 +3779,10 @@ static int gen_neon_zip(int rd, int rm, int size, int q)
     } else {
         switch (size) {
         case 0:
-            gen_helper_neon_zip8(tmp, tmp2);
+            gen_helper_neon_zip8(cpu_env, tmp, tmp2);
             break;
         case 1:
-            gen_helper_neon_zip16(tmp, tmp2);
+            gen_helper_neon_zip16(cpu_env, tmp, tmp2);
             break;
         default:
             abort();
@@ -4162,9 +4182,9 @@ static inline void gen_neon_narrow(int size, TCGv dest, TCGv_i64 src)
 static inline void gen_neon_narrow_sats(int size, TCGv dest, TCGv_i64 src)
 {
     switch (size) {
-    case 0: gen_helper_neon_narrow_sat_s8(dest, src); break;
-    case 1: gen_helper_neon_narrow_sat_s16(dest, src); break;
-    case 2: gen_helper_neon_narrow_sat_s32(dest, src); break;
+    case 0: gen_helper_neon_narrow_sat_s8(dest, cpu_env, src); break;
+    case 1: gen_helper_neon_narrow_sat_s16(dest, cpu_env, src); break;
+    case 2: gen_helper_neon_narrow_sat_s32(dest, cpu_env, src); break;
     default: abort();
     }
 }
@@ -4172,9 +4192,9 @@ static inline void gen_neon_narrow_sats(int size, TCGv dest, TCGv_i64 src)
 static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src)
 {
     switch (size) {
-    case 0: gen_helper_neon_narrow_sat_u8(dest, src); break;
-    case 1: gen_helper_neon_narrow_sat_u16(dest, src); break;
-    case 2: gen_helper_neon_narrow_sat_u32(dest, src); break;
+    case 0: gen_helper_neon_narrow_sat_u8(dest, cpu_env, src); break;
+    case 1: gen_helper_neon_narrow_sat_u16(dest, cpu_env, src); break;
+    case 2: gen_helper_neon_narrow_sat_u32(dest, cpu_env, src); break;
     default: abort();
     }
 }
@@ -4182,9 +4202,9 @@ static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src)
 static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src)
 {
     switch (size) {
-    case 0: gen_helper_neon_unarrow_sat8(dest, src); break;
-    case 1: gen_helper_neon_unarrow_sat16(dest, src); break;
-    case 2: gen_helper_neon_unarrow_sat32(dest, src); break;
+    case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break;
+    case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break;
+    case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break;
     default: abort();
     }
 }
@@ -4276,8 +4296,8 @@ static inline void gen_neon_negl(TCGv_i64 var, int size)
 static inline void gen_neon_addl_saturate(TCGv_i64 op0, TCGv_i64 op1, int size)
 {
     switch (size) {
-    case 1: gen_helper_neon_addl_saturate_s32(op0, op0, op1); break;
-    case 2: gen_helper_neon_addl_saturate_s64(op0, op0, op1); break;
+    case 1: gen_helper_neon_addl_saturate_s32(op0, cpu_env, op0, op1); break;
+    case 2: gen_helper_neon_addl_saturate_s64(op0, cpu_env, op0, op1); break;
     default: abort();
     }
 }
@@ -4553,16 +4573,20 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                 switch (op) {
                 case NEON_3R_VQADD:
                     if (u) {
-                        gen_helper_neon_qadd_u64(cpu_V0, cpu_V0, cpu_V1);
+                        gen_helper_neon_qadd_u64(cpu_V0, cpu_env,
+                                                 cpu_V0, cpu_V1);
                     } else {
-                        gen_helper_neon_qadd_s64(cpu_V0, cpu_V0, cpu_V1);
+                        gen_helper_neon_qadd_s64(cpu_V0, cpu_env,
+                                                 cpu_V0, cpu_V1);
                     }
                     break;
                 case NEON_3R_VQSUB:
                     if (u) {
-                        gen_helper_neon_qsub_u64(cpu_V0, cpu_V0, cpu_V1);
+                        gen_helper_neon_qsub_u64(cpu_V0, cpu_env,
+                                                 cpu_V0, cpu_V1);
                     } else {
-                        gen_helper_neon_qsub_s64(cpu_V0, cpu_V0, cpu_V1);
+                        gen_helper_neon_qsub_s64(cpu_V0, cpu_env,
+                                                 cpu_V0, cpu_V1);
                     }
                     break;
                 case NEON_3R_VSHL:
@@ -4574,9 +4598,11 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     break;
                 case NEON_3R_VQSHL:
                     if (u) {
-                        gen_helper_neon_qshl_u64(cpu_V0, cpu_V1, cpu_V0);
+                        gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
+                                                 cpu_V1, cpu_V0);
                     } else {
-                        gen_helper_neon_qshl_s64(cpu_V0, cpu_V1, cpu_V0);
+                        gen_helper_neon_qshl_s64(cpu_V0, cpu_env,
+                                                 cpu_V1, cpu_V0);
                     }
                     break;
                 case NEON_3R_VRSHL:
@@ -4588,9 +4614,11 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     break;
                 case NEON_3R_VQRSHL:
                     if (u) {
-                        gen_helper_neon_qrshl_u64(cpu_V0, cpu_V1, cpu_V0);
+                        gen_helper_neon_qrshl_u64(cpu_V0, cpu_env,
+                                                  cpu_V1, cpu_V0);
                     } else {
-                        gen_helper_neon_qrshl_s64(cpu_V0, cpu_V1, cpu_V0);
+                        gen_helper_neon_qrshl_s64(cpu_V0, cpu_env,
+                                                  cpu_V1, cpu_V0);
                     }
                     break;
                 case NEON_3R_VADD_VSUB:
@@ -4688,7 +4716,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             GEN_NEON_INTEGER_OP(hadd);
             break;
         case NEON_3R_VQADD:
-            GEN_NEON_INTEGER_OP(qadd);
+            GEN_NEON_INTEGER_OP_ENV(qadd);
             break;
         case NEON_3R_VRHADD:
             GEN_NEON_INTEGER_OP(rhadd);
@@ -4731,7 +4759,7 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             GEN_NEON_INTEGER_OP(hsub);
             break;
         case NEON_3R_VQSUB:
-            GEN_NEON_INTEGER_OP(qsub);
+            GEN_NEON_INTEGER_OP_ENV(qsub);
             break;
         case NEON_3R_VCGT:
             GEN_NEON_INTEGER_OP(cgt);
@@ -4743,13 +4771,13 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             GEN_NEON_INTEGER_OP(shl);
             break;
         case NEON_3R_VQSHL:
-            GEN_NEON_INTEGER_OP(qshl);
+            GEN_NEON_INTEGER_OP_ENV(qshl);
             break;
         case NEON_3R_VRSHL:
             GEN_NEON_INTEGER_OP(rshl);
             break;
         case NEON_3R_VQRSHL:
-            GEN_NEON_INTEGER_OP(qrshl);
+            GEN_NEON_INTEGER_OP_ENV(qrshl);
             break;
         case NEON_3R_VMAX:
             GEN_NEON_INTEGER_OP(max);
@@ -4831,14 +4859,22 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
         case NEON_3R_VQDMULH_VQRDMULH: /* Multiply high.  */
             if (!u) { /* VQDMULH */
                 switch (size) {
-                case 1: gen_helper_neon_qdmulh_s16(tmp, tmp, tmp2); break;
-                case 2: gen_helper_neon_qdmulh_s32(tmp, tmp, tmp2); break;
+                case 1:
+                    gen_helper_neon_qdmulh_s16(tmp, cpu_env, tmp, tmp2);
+                    break;
+                case 2:
+                    gen_helper_neon_qdmulh_s32(tmp, cpu_env, tmp, tmp2);
+                    break;
                 default: abort();
                 }
             } else { /* VQRDMULH */
                 switch (size) {
-                case 1: gen_helper_neon_qrdmulh_s16(tmp, tmp, tmp2); break;
-                case 2: gen_helper_neon_qrdmulh_s32(tmp, tmp, tmp2); break;
+                case 1:
+                    gen_helper_neon_qrdmulh_s16(tmp, cpu_env, tmp, tmp2);
+                    break;
+                case 2:
+                    gen_helper_neon_qrdmulh_s32(tmp, cpu_env, tmp, tmp2);
+                    break;
                 default: abort();
                 }
             }
@@ -4852,57 +4888,78 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             }
             break;
         case NEON_3R_FLOAT_ARITH: /* Floating point arithmetic. */
+        {
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
             switch ((u << 2) | size) {
             case 0: /* VADD */
-                gen_helper_neon_add_f32(tmp, tmp, tmp2);
+            case 4: /* VPADD */
+                gen_helper_vfp_adds(tmp, tmp, tmp2, fpstatus);
                 break;
             case 2: /* VSUB */
-                gen_helper_neon_sub_f32(tmp, tmp, tmp2);
-                break;
-            case 4: /* VPADD */
-                gen_helper_neon_add_f32(tmp, tmp, tmp2);
+                gen_helper_vfp_subs(tmp, tmp, tmp2, fpstatus);
                 break;
             case 6: /* VABD */
-                gen_helper_neon_abd_f32(tmp, tmp, tmp2);
+                gen_helper_neon_abd_f32(tmp, tmp, tmp2, fpstatus);
                 break;
             default:
                 abort();
             }
+            tcg_temp_free_ptr(fpstatus);
             break;
+        }
         case NEON_3R_FLOAT_MULTIPLY:
-            gen_helper_neon_mul_f32(tmp, tmp, tmp2);
+        {
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+            gen_helper_vfp_muls(tmp, tmp, tmp2, fpstatus);
             if (!u) {
                 tcg_temp_free_i32(tmp2);
                 tmp2 = neon_load_reg(rd, pass);
                 if (size == 0) {
-                    gen_helper_neon_add_f32(tmp, tmp, tmp2);
+                    gen_helper_vfp_adds(tmp, tmp, tmp2, fpstatus);
                 } else {
-                    gen_helper_neon_sub_f32(tmp, tmp2, tmp);
+                    gen_helper_vfp_subs(tmp, tmp2, tmp, fpstatus);
                 }
             }
+            tcg_temp_free_ptr(fpstatus);
             break;
+        }
         case NEON_3R_FLOAT_CMP:
+        {
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
             if (!u) {
-                gen_helper_neon_ceq_f32(tmp, tmp, tmp2);
+                gen_helper_neon_ceq_f32(tmp, tmp, tmp2, fpstatus);
             } else {
-                if (size == 0)
-                    gen_helper_neon_cge_f32(tmp, tmp, tmp2);
-                else
-                    gen_helper_neon_cgt_f32(tmp, tmp, tmp2);
+                if (size == 0) {
+                    gen_helper_neon_cge_f32(tmp, tmp, tmp2, fpstatus);
+                } else {
+                    gen_helper_neon_cgt_f32(tmp, tmp, tmp2, fpstatus);
+                }
             }
+            tcg_temp_free_ptr(fpstatus);
             break;
+        }
         case NEON_3R_FLOAT_ACMP:
-            if (size == 0)
-                gen_helper_neon_acge_f32(tmp, tmp, tmp2);
-            else
-                gen_helper_neon_acgt_f32(tmp, tmp, tmp2);
+        {
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+            if (size == 0) {
+                gen_helper_neon_acge_f32(tmp, tmp, tmp2, fpstatus);
+            } else {
+                gen_helper_neon_acgt_f32(tmp, tmp, tmp2, fpstatus);
+            }
+            tcg_temp_free_ptr(fpstatus);
             break;
+        }
         case NEON_3R_FLOAT_MINMAX:
-            if (size == 0)
-                gen_helper_neon_max_f32(tmp, tmp, tmp2);
-            else
-                gen_helper_neon_min_f32(tmp, tmp, tmp2);
+        {
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+            if (size == 0) {
+                gen_helper_neon_max_f32(tmp, tmp, tmp2, fpstatus);
+            } else {
+                gen_helper_neon_min_f32(tmp, tmp, tmp2, fpstatus);
+            }
+            tcg_temp_free_ptr(fpstatus);
             break;
+        }
         case NEON_3R_VRECPS_VRSQRTS:
             if (size == 0)
                 gen_helper_recps_f32(tmp, tmp, tmp2, cpu_env);
@@ -5009,14 +5066,15 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                             gen_helper_neon_shl_u64(cpu_V0, cpu_V0, cpu_V1);
                             break;
                         case 6: /* VQSHLU */
-                            gen_helper_neon_qshlu_s64(cpu_V0, cpu_V0, cpu_V1);
+                            gen_helper_neon_qshlu_s64(cpu_V0, cpu_env,
+                                                      cpu_V0, cpu_V1);
                             break;
                         case 7: /* VQSHL */
                             if (u) {
-                                gen_helper_neon_qshl_u64(cpu_V0,
+                                gen_helper_neon_qshl_u64(cpu_V0, cpu_env,
                                                          cpu_V0, cpu_V1);
                             } else {
-                                gen_helper_neon_qshl_s64(cpu_V0,
+                                gen_helper_neon_qshl_s64(cpu_V0, cpu_env,
                                                          cpu_V0, cpu_V1);
                             }
                             break;
@@ -5068,20 +5126,23 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                         case 6: /* VQSHLU */
                             switch (size) {
                             case 0:
-                                gen_helper_neon_qshlu_s8(tmp, tmp, tmp2);
+                                gen_helper_neon_qshlu_s8(tmp, cpu_env,
+                                                         tmp, tmp2);
                                 break;
                             case 1:
-                                gen_helper_neon_qshlu_s16(tmp, tmp, tmp2);
+                                gen_helper_neon_qshlu_s16(tmp, cpu_env,
+                                                          tmp, tmp2);
                                 break;
                             case 2:
-                                gen_helper_neon_qshlu_s32(tmp, tmp, tmp2);
+                                gen_helper_neon_qshlu_s32(tmp, cpu_env,
+                                                          tmp, tmp2);
                                 break;
                             default:
                                 abort();
                             }
                             break;
                         case 7: /* VQSHL */
-                            GEN_NEON_INTEGER_OP(qshl);
+                            GEN_NEON_INTEGER_OP_ENV(qshl);
                             break;
                         }
                         tcg_temp_free_i32(tmp2);
@@ -5590,18 +5651,20 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                         tmp2 = neon_load_reg(rn, pass);
                         if (op == 12) {
                             if (size == 1) {
-                                gen_helper_neon_qdmulh_s16(tmp, tmp, tmp2);
+                                gen_helper_neon_qdmulh_s16(tmp, cpu_env, tmp, tmp2);
                             } else {
-                                gen_helper_neon_qdmulh_s32(tmp, tmp, tmp2);
+                                gen_helper_neon_qdmulh_s32(tmp, cpu_env, tmp, tmp2);
                             }
                         } else if (op == 13) {
                             if (size == 1) {
-                                gen_helper_neon_qrdmulh_s16(tmp, tmp, tmp2);
+                                gen_helper_neon_qrdmulh_s16(tmp, cpu_env, tmp, tmp2);
                             } else {
-                                gen_helper_neon_qrdmulh_s32(tmp, tmp, tmp2);
+                                gen_helper_neon_qrdmulh_s32(tmp, cpu_env, tmp, tmp2);
                             }
                         } else if (op & 1) {
-                            gen_helper_neon_mul_f32(tmp, tmp, tmp2);
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                            gen_helper_vfp_muls(tmp, tmp, tmp2, fpstatus);
+                            tcg_temp_free_ptr(fpstatus);
                         } else {
                             switch (size) {
                             case 0: gen_helper_neon_mul_u8(tmp, tmp, tmp2); break;
@@ -5619,14 +5682,22 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                                 gen_neon_add(size, tmp, tmp2);
                                 break;
                             case 1:
-                                gen_helper_neon_add_f32(tmp, tmp, tmp2);
+                            {
+                                TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                                gen_helper_vfp_adds(tmp, tmp, tmp2, fpstatus);
+                                tcg_temp_free_ptr(fpstatus);
                                 break;
+                            }
                             case 4:
                                 gen_neon_rsb(size, tmp, tmp2);
                                 break;
                             case 5:
-                                gen_helper_neon_sub_f32(tmp, tmp2, tmp);
+                            {
+                                TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+                                gen_helper_vfp_subs(tmp, tmp2, tmp, fpstatus);
+                                tcg_temp_free_ptr(fpstatus);
                                 break;
+                            }
                             default:
                                 abort();
                             }
@@ -5960,17 +6031,29 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                             break;
                         case NEON_2RM_VQABS:
                             switch (size) {
-                            case 0: gen_helper_neon_qabs_s8(tmp, tmp); break;
-                            case 1: gen_helper_neon_qabs_s16(tmp, tmp); break;
-                            case 2: gen_helper_neon_qabs_s32(tmp, tmp); break;
+                            case 0:
+                                gen_helper_neon_qabs_s8(tmp, cpu_env, tmp);
+                                break;
+                            case 1:
+                                gen_helper_neon_qabs_s16(tmp, cpu_env, tmp);
+                                break;
+                            case 2:
+                                gen_helper_neon_qabs_s32(tmp, cpu_env, tmp);
+                                break;
                             default: abort();
                             }
                             break;
                         case NEON_2RM_VQNEG:
                             switch (size) {
-                            case 0: gen_helper_neon_qneg_s8(tmp, tmp); break;
-                            case 1: gen_helper_neon_qneg_s16(tmp, tmp); break;
-                            case 2: gen_helper_neon_qneg_s32(tmp, tmp); break;
+                            case 0:
+                                gen_helper_neon_qneg_s8(tmp, cpu_env, tmp);
+                                break;
+                            case 1:
+                                gen_helper_neon_qneg_s16(tmp, cpu_env, tmp);
+                                break;
+                            case 2:
+                                gen_helper_neon_qneg_s32(tmp, cpu_env, tmp);
+                                break;
                             default: abort();
                             }
                             break;
@@ -6024,30 +6107,50 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                             tcg_temp_free(tmp2);
                             break;
                         case NEON_2RM_VCGT0_F:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
-                            gen_helper_neon_cgt_f32(tmp, tmp, tmp2);
+                            gen_helper_neon_cgt_f32(tmp, tmp, tmp2, fpstatus);
                             tcg_temp_free(tmp2);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VCGE0_F:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
-                            gen_helper_neon_cge_f32(tmp, tmp, tmp2);
+                            gen_helper_neon_cge_f32(tmp, tmp, tmp2, fpstatus);
                             tcg_temp_free(tmp2);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VCEQ0_F:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
-                            gen_helper_neon_ceq_f32(tmp, tmp, tmp2);
+                            gen_helper_neon_ceq_f32(tmp, tmp, tmp2, fpstatus);
                             tcg_temp_free(tmp2);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VCLE0_F:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
-                            gen_helper_neon_cge_f32(tmp, tmp2, tmp);
+                            gen_helper_neon_cge_f32(tmp, tmp2, tmp, fpstatus);
                             tcg_temp_free(tmp2);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VCLT0_F:
+                        {
+                            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
                             tmp2 = tcg_const_i32(0);
-                            gen_helper_neon_cgt_f32(tmp, tmp2, tmp);
+                            gen_helper_neon_cgt_f32(tmp, tmp2, tmp, fpstatus);
                             tcg_temp_free(tmp2);
+                            tcg_temp_free_ptr(fpstatus);
                             break;
+                        }
                         case NEON_2RM_VABS_F:
                             gen_vfp_abs(0);
                             break;