summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS6
-rwxr-xr-xconfigure1
-rw-r--r--fpu/softfloat-macros.h48
-rw-r--r--fpu/softfloat-specialize.h109
-rw-r--r--fpu/softfloat.c4550
-rw-r--r--hw/arm/raspi.c23
-rw-r--r--hw/char/stm32f2xx_usart.c12
-rw-r--r--hw/display/virtio-gpu-3d.c15
-rw-r--r--hw/display/virtio-gpu.c7
-rw-r--r--hw/misc/aspeed_scu.c6
-rw-r--r--hw/misc/aspeed_sdmc.c8
-rw-r--r--hw/sd/milkymist-memcard.c81
-rw-r--r--hw/sd/sd.c467
-rw-r--r--hw/sd/sdmmc-internal.h15
-rw-r--r--hw/sd/ssi-sd.c32
-rw-r--r--hw/sd/trace-events20
-rw-r--r--hw/usb/dev-mtp.c471
-rw-r--r--include/elf.h35
-rw-r--r--include/fpu/softfloat-types.h179
-rw-r--r--include/fpu/softfloat.h202
-rw-r--r--include/hw/char/stm32f2xx_usart.h7
-rw-r--r--include/hw/sd/sd.h1
-rw-r--r--include/hw/virtio/virtio-gpu.h2
-rw-r--r--include/qemu/bswap.h2
-rw-r--r--include/standard-headers/linux/virtio_gpu.h1
-rw-r--r--include/ui/console.h13
-rw-r--r--linux-user/aarch64/target_elf.h14
-rw-r--r--linux-user/alpha/target_elf.h14
-rw-r--r--linux-user/arm/target_elf.h14
-rw-r--r--linux-user/cris/target_elf.h14
-rw-r--r--linux-user/elfload.c35
-rw-r--r--linux-user/hppa/target_elf.h14
-rw-r--r--linux-user/i386/target_elf.h14
-rw-r--r--linux-user/m68k/target_elf.h20
-rw-r--r--linux-user/main.c59
-rw-r--r--linux-user/microblaze/target_elf.h14
-rw-r--r--linux-user/mips/target_elf.h17
-rw-r--r--linux-user/mips64/target_elf.h17
-rw-r--r--linux-user/nios2/target_elf.h14
-rw-r--r--linux-user/openrisc/target_elf.h14
-rw-r--r--linux-user/ppc/target_elf.h18
-rw-r--r--linux-user/qemu.h1
-rw-r--r--linux-user/s390x/target_elf.h14
-rw-r--r--linux-user/sh4/target_elf.h14
-rw-r--r--linux-user/sparc/target_elf.h18
-rw-r--r--linux-user/sparc64/target_elf.h14
-rw-r--r--linux-user/tilegx/target_elf.h14
-rw-r--r--linux-user/unicore32/target_elf.h14
-rw-r--r--linux-user/x86_64/target_elf.h14
-rw-r--r--pc-bios/openbios-ppcbin754936 -> 754936 bytes
-rw-r--r--pc-bios/openbios-sparc32bin382048 -> 382048 bytes
-rw-r--r--pc-bios/openbios-sparc64bin1593408 -> 1593408 bytes
m---------roms/openbios0
-rwxr-xr-xscripts/decodetree.py1062
-rw-r--r--target/alpha/cpu.h2
-rw-r--r--target/arm/cpu.c1
-rw-r--r--target/arm/cpu.h2
-rw-r--r--target/arm/helper-a64.c1
-rw-r--r--target/arm/helper.c9
-rw-r--r--target/arm/neon_helper.c1
-rw-r--r--target/hppa/cpu.c1
-rw-r--r--target/hppa/cpu.h1
-rw-r--r--target/hppa/op_helper.c2
-rw-r--r--target/i386/cpu.h4
-rw-r--r--target/i386/fpu_helper.c1
-rw-r--r--target/m68k/cpu.c2
-rw-r--r--target/m68k/cpu.h1
-rw-r--r--target/m68k/fpu_helper.c1
-rw-r--r--target/m68k/helper.c1
-rw-r--r--target/m68k/translate.c2
-rw-r--r--target/microblaze/cpu.c1
-rw-r--r--target/microblaze/cpu.h2
-rw-r--r--target/microblaze/op_helper.c1
-rw-r--r--target/moxie/cpu.h1
-rw-r--r--target/nios2/cpu.h1
-rw-r--r--target/openrisc/cpu.h1
-rw-r--r--target/openrisc/fpu_helper.c1
-rw-r--r--target/ppc/cpu.h1
-rw-r--r--target/ppc/fpu_helper.c1
-rw-r--r--target/ppc/int_helper.c1
-rw-r--r--target/ppc/translate_init.c1
-rw-r--r--target/s390x/cpu.c1
-rw-r--r--target/s390x/cpu.h2
-rw-r--r--target/s390x/fpu_helper.c1
-rw-r--r--target/sh4/cpu.c1
-rw-r--r--target/sh4/cpu.h2
-rw-r--r--target/sh4/op_helper.c1
-rw-r--r--target/sparc/cpu.h2
-rw-r--r--target/sparc/fop_helper.c1
-rw-r--r--target/tricore/cpu.h1
-rw-r--r--target/tricore/fpu_helper.c1
-rw-r--r--target/tricore/helper.c1
-rw-r--r--target/unicore32/cpu.c1
-rw-r--r--target/unicore32/cpu.h1
-rw-r--r--target/unicore32/ucf64_helper.c1
-rw-r--r--target/xtensa/cpu.h1
-rw-r--r--target/xtensa/op_helper.c1
-rw-r--r--tests/Makefile.include9
-rwxr-xr-xtests/decode/check.sh18
-rw-r--r--tests/decode/err_argset1.decode5
-rw-r--r--tests/decode/err_argset2.decode5
-rw-r--r--tests/decode/err_field1.decode5
-rw-r--r--tests/decode/err_field2.decode5
-rw-r--r--tests/decode/err_field3.decode5
-rw-r--r--tests/decode/err_field4.decode6
-rw-r--r--tests/decode/err_field5.decode5
-rw-r--r--tests/decode/err_init1.decode6
-rw-r--r--tests/decode/err_init2.decode6
-rw-r--r--tests/decode/err_init3.decode7
-rw-r--r--tests/decode/err_init4.decode7
-rw-r--r--tests/decode/err_overlap1.decode6
-rw-r--r--tests/decode/err_overlap2.decode6
-rw-r--r--tests/decode/err_overlap3.decode6
-rw-r--r--tests/decode/err_overlap4.decode6
-rw-r--r--tests/decode/err_overlap5.decode5
-rw-r--r--tests/decode/err_overlap6.decode6
-rw-r--r--tests/decode/err_overlap7.decode6
-rw-r--r--tests/decode/err_overlap8.decode5
-rw-r--r--tests/decode/err_overlap9.decode6
-rw-r--r--ui/console.c18
-rw-r--r--ui/curses.c3
-rw-r--r--ui/egl-headless.c30
-rw-r--r--ui/egl-helpers.c2
-rw-r--r--ui/keymaps.c163
-rw-r--r--ui/keymaps.h30
-rw-r--r--ui/sdl.c6
-rw-r--r--ui/sdl2.c14
-rw-r--r--ui/trace-events2
-rw-r--r--ui/vnc.c9
129 files changed, 4853 insertions, 3398 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 8c1ae1d423..79fb830031 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -386,6 +386,12 @@ M: Kamil Rytarowski <kamil@netbsd.org>
 S: Maintained
 K: ^Subject:.*(?i)NetBSD
 
+OPENBSD
+L: qemu-devel@nongnu.org
+M: Brad Smith <brad@comstyle.com>
+S: Maintained
+K: ^Subject:.*(?i)OpenBSD
+
 W32, W64
 L: qemu-devel@nongnu.org
 M: Stefan Weil <sw@weilnetz.de>
diff --git a/configure b/configure
index ed45a3c4dd..39f3a43001 100755
--- a/configure
+++ b/configure
@@ -760,6 +760,7 @@ OpenBSD)
   audio_drv_list="sdl"
   audio_possible_drivers="sdl"
   HOST_VARIANT_DIR="openbsd"
+  supported_os="yes"
 ;;
 Darwin)
   bsd="yes"
diff --git a/fpu/softfloat-macros.h b/fpu/softfloat-macros.h
index 9cc6158cb4..c45a23193e 100644
--- a/fpu/softfloat-macros.h
+++ b/fpu/softfloat-macros.h
@@ -625,6 +625,54 @@ static uint64_t estimateDiv128To64( uint64_t a0, uint64_t a1, uint64_t b )
 
 }
 
+/* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd
+ * (https://gmplib.org/repo/gmp/file/tip/longlong.h)
+ *
+ * Licensed under the GPLv2/LGPLv3
+ */
+static uint64_t div128To64(uint64_t n0, uint64_t n1, uint64_t d)
+{
+    uint64_t d0, d1, q0, q1, r1, r0, m;
+
+    d0 = (uint32_t)d;
+    d1 = d >> 32;
+
+    r1 = n1 % d1;
+    q1 = n1 / d1;
+    m = q1 * d0;
+    r1 = (r1 << 32) | (n0 >> 32);
+    if (r1 < m) {
+        q1 -= 1;
+        r1 += d;
+        if (r1 >= d) {
+            if (r1 < m) {
+                q1 -= 1;
+                r1 += d;
+            }
+        }
+    }
+    r1 -= m;
+
+    r0 = r1 % d1;
+    q0 = r1 / d1;
+    m = q0 * d0;
+    r0 = (r0 << 32) | (uint32_t)n0;
+    if (r0 < m) {
+        q0 -= 1;
+        r0 += d;
+        if (r0 >= d) {
+            if (r0 < m) {
+                q0 -= 1;
+                r0 += d;
+            }
+        }
+    }
+    r0 -= m;
+
+    /* Return remainder in LSB */
+    return (q1 << 32) | q0 | (r0 != 0);
+}
+
 /*----------------------------------------------------------------------------
 | Returns an approximation to the square root of the 32-bit significand given
 | by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index de2c5d5702..e81ca001e1 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -445,9 +445,10 @@ static float32 commonNaNToFloat32(commonNaNT a, float_status *status)
 
 #if defined(TARGET_ARM)
 static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
-                    flag aIsLargerSignificand)
+                   flag aIsLargerSignificand)
 {
-    /* ARM mandated NaN propagation rules: take the first of:
+    /* ARM mandated NaN propagation rules (see FPProcessNaNs()), take
+     * the first of:
      *  1. A if it is signaling
      *  2. B if it is signaling
      *  3. A (quiet)
@@ -728,58 +729,6 @@ static float32 propagateFloat32NaN(float32 a, float32 b, float_status *status)
     }
 }
 
-/*----------------------------------------------------------------------------
-| Takes three single-precision floating-point values `a', `b' and `c', one of
-| which is a NaN, and returns the appropriate NaN result.  If any of  `a',
-| `b' or `c' is a signaling NaN, the invalid exception is raised.
-| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case
-| obviously c is a NaN, and whether to propagate c or some other NaN is
-| implementation defined).
-*----------------------------------------------------------------------------*/
-
-static float32 propagateFloat32MulAddNaN(float32 a, float32 b,
-                                         float32 c, flag infzero,
-                                         float_status *status)
-{
-    flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN,
-        cIsQuietNaN, cIsSignalingNaN;
-    int which;
-
-    aIsQuietNaN = float32_is_quiet_nan(a, status);
-    aIsSignalingNaN = float32_is_signaling_nan(a, status);
-    bIsQuietNaN = float32_is_quiet_nan(b, status);
-    bIsSignalingNaN = float32_is_signaling_nan(b, status);
-    cIsQuietNaN = float32_is_quiet_nan(c, status);
-    cIsSignalingNaN = float32_is_signaling_nan(c, status);
-
-    if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
-        float_raise(float_flag_invalid, status);
-    }
-
-    which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN,
-                          bIsQuietNaN, bIsSignalingNaN,
-                          cIsQuietNaN, cIsSignalingNaN, infzero, status);
-
-    if (status->default_nan_mode) {
-        /* Note that this check is after pickNaNMulAdd so that function
-         * has an opportunity to set the Invalid flag.
-         */
-        return float32_default_nan(status);
-    }
-
-    switch (which) {
-    case 0:
-        return float32_maybe_silence_nan(a, status);
-    case 1:
-        return float32_maybe_silence_nan(b, status);
-    case 2:
-        return float32_maybe_silence_nan(c, status);
-    case 3:
-    default:
-        return float32_default_nan(status);
-    }
-}
-
 #ifdef NO_SIGNALING_NANS
 int float64_is_quiet_nan(float64 a_, float_status *status)
 {
@@ -935,58 +884,6 @@ static float64 propagateFloat64NaN(float64 a, float64 b, float_status *status)
     }
 }
 
-/*----------------------------------------------------------------------------
-| Takes three double-precision floating-point values `a', `b' and `c', one of
-| which is a NaN, and returns the appropriate NaN result.  If any of  `a',
-| `b' or `c' is a signaling NaN, the invalid exception is raised.
-| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case
-| obviously c is a NaN, and whether to propagate c or some other NaN is
-| implementation defined).
-*----------------------------------------------------------------------------*/
-
-static float64 propagateFloat64MulAddNaN(float64 a, float64 b,
-                                         float64 c, flag infzero,
-                                         float_status *status)
-{
-    flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN,
-        cIsQuietNaN, cIsSignalingNaN;
-    int which;
-
-    aIsQuietNaN = float64_is_quiet_nan(a, status);
-    aIsSignalingNaN = float64_is_signaling_nan(a, status);
-    bIsQuietNaN = float64_is_quiet_nan(b, status);
-    bIsSignalingNaN = float64_is_signaling_nan(b, status);
-    cIsQuietNaN = float64_is_quiet_nan(c, status);
-    cIsSignalingNaN = float64_is_signaling_nan(c, status);
-
-    if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
-        float_raise(float_flag_invalid, status);
-    }
-
-    which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN,
-                          bIsQuietNaN, bIsSignalingNaN,
-                          cIsQuietNaN, cIsSignalingNaN, infzero, status);
-
-    if (status->default_nan_mode) {
-        /* Note that this check is after pickNaNMulAdd so that function
-         * has an opportunity to set the Invalid flag.
-         */
-        return float64_default_nan(status);
-    }
-
-    switch (which) {
-    case 0:
-        return float64_maybe_silence_nan(a, status);
-    case 1:
-        return float64_maybe_silence_nan(b, status);
-    case 2:
-        return float64_maybe_silence_nan(c, status);
-    case 3:
-    default:
-        return float64_default_nan(status);
-    }
-}
-
 #ifdef NO_SIGNALING_NANS
 int floatx80_is_quiet_nan(floatx80 a_, float_status *status)
 {
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 433c5dad2d..e7fb0d357a 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -83,7 +83,7 @@ this code that are retained.
  * target-dependent and needs the TARGET_* macros.
  */
 #include "qemu/osdep.h"
-
+#include "qemu/bitops.h"
 #include "fpu/softfloat.h"
 
 /* We only need stdlib for abort() */
@@ -133,6 +133,1866 @@ static inline flag extractFloat16Sign(float16 a)
 }
 
 /*----------------------------------------------------------------------------
+| Returns the fraction bits of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline uint32_t extractFloat32Frac(float32 a)
+{
+    return float32_val(a) & 0x007FFFFF;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline int extractFloat32Exp(float32 a)
+{
+    return (float32_val(a) >> 23) & 0xFF;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the single-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline flag extractFloat32Sign(float32 a)
+{
+    return float32_val(a) >> 31;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the fraction bits of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline uint64_t extractFloat64Frac(float64 a)
+{
+    return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the exponent bits of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline int extractFloat64Exp(float64 a)
+{
+    return (float64_val(a) >> 52) & 0x7FF;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the sign bit of the double-precision floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static inline flag extractFloat64Sign(float64 a)
+{
+    return float64_val(a) >> 63;
+}
+
+/*
+ * Classify a floating point number. Everything above float_class_qnan
+ * is a NaN so cls >= float_class_qnan is any NaN.
+ */
+
+typedef enum __attribute__ ((__packed__)) {
+    float_class_unclassified,
+    float_class_zero,
+    float_class_normal,
+    float_class_inf,
+    float_class_qnan,  /* all NaNs from here */
+    float_class_snan,
+    float_class_dnan,
+    float_class_msnan, /* maybe silenced */
+} FloatClass;
+
+/*
+ * Structure holding all of the decomposed parts of a float. The
+ * exponent is unbiased and the fraction is normalized. All
+ * calculations are done with a 64 bit fraction and then rounded as
+ * appropriate for the final format.
+ *
+ * Thanks to the packed FloatClass a decent compiler should be able to
+ * fit the whole structure into registers and avoid using the stack
+ * for parameter passing.
+ */
+
+typedef struct {
+    uint64_t frac;
+    int32_t  exp;
+    FloatClass cls;
+    bool sign;
+} FloatParts;
+
+#define DECOMPOSED_BINARY_POINT    (64 - 2)
+#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
+#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
+
+/* Structure holding all of the relevant parameters for a format.
+ *   exp_size: the size of the exponent field
+ *   exp_bias: the offset applied to the exponent field
+ *   exp_max: the maximum normalised exponent
+ *   frac_size: the size of the fraction field
+ *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
+ * The following are computed based the size of fraction
+ *   frac_lsb: least significant bit of fraction
+ *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
+ *   round_mask/roundeven_mask: masks used for rounding
+ */
+typedef struct {
+    int exp_size;
+    int exp_bias;
+    int exp_max;
+    int frac_size;
+    int frac_shift;
+    uint64_t frac_lsb;
+    uint64_t frac_lsbm1;
+    uint64_t round_mask;
+    uint64_t roundeven_mask;
+} FloatFmt;
+
+/* Expand fields based on the size of exponent and fraction */
+#define FLOAT_PARAMS(E, F)                                           \
+    .exp_size       = E,                                             \
+    .exp_bias       = ((1 << E) - 1) >> 1,                           \
+    .exp_max        = (1 << E) - 1,                                  \
+    .frac_size      = F,                                             \
+    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
+    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
+    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
+    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
+    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
+
+static const FloatFmt float16_params = {
+    FLOAT_PARAMS(5, 10)
+};
+
+static const FloatFmt float32_params = {
+    FLOAT_PARAMS(8, 23)
+};
+
+static const FloatFmt float64_params = {
+    FLOAT_PARAMS(11, 52)
+};
+
+/* Unpack a float to parts, but do not canonicalize.  */
+static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
+{
+    const int sign_pos = fmt.frac_size + fmt.exp_size;
+
+    return (FloatParts) {
+        .cls = float_class_unclassified,
+        .sign = extract64(raw, sign_pos, 1),
+        .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
+        .frac = extract64(raw, 0, fmt.frac_size),
+    };
+}
+
+static inline FloatParts float16_unpack_raw(float16 f)
+{
+    return unpack_raw(float16_params, f);
+}
+
+static inline FloatParts float32_unpack_raw(float32 f)
+{
+    return unpack_raw(float32_params, f);
+}
+
+static inline FloatParts float64_unpack_raw(float64 f)
+{
+    return unpack_raw(float64_params, f);
+}
+
+/* Pack a float from parts, but do not canonicalize.  */
+static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
+{
+    const int sign_pos = fmt.frac_size + fmt.exp_size;
+    uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
+    return deposit64(ret, sign_pos, 1, p.sign);
+}
+
+static inline float16 float16_pack_raw(FloatParts p)
+{
+    return make_float16(pack_raw(float16_params, p));
+}
+
+static inline float32 float32_pack_raw(FloatParts p)
+{
+    return make_float32(pack_raw(float32_params, p));
+}
+
+static inline float64 float64_pack_raw(FloatParts p)
+{
+    return make_float64(pack_raw(float64_params, p));
+}
+
+/* Canonicalize EXP and FRAC, setting CLS.  */
+static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
+                               float_status *status)
+{
+    if (part.exp == parm->exp_max) {
+        if (part.frac == 0) {
+            part.cls = float_class_inf;
+        } else {
+#ifdef NO_SIGNALING_NANS
+            part.cls = float_class_qnan;
+#else
+            int64_t msb = part.frac << (parm->frac_shift + 2);
+            if ((msb < 0) == status->snan_bit_is_one) {
+                part.cls = float_class_snan;
+            } else {
+                part.cls = float_class_qnan;
+            }
+#endif
+        }
+    } else if (part.exp == 0) {
+        if (likely(part.frac == 0)) {
+            part.cls = float_class_zero;
+        } else if (status->flush_inputs_to_zero) {
+            float_raise(float_flag_input_denormal, status);
+            part.cls = float_class_zero;
+            part.frac = 0;
+        } else {
+            int shift = clz64(part.frac) - 1;
+            part.cls = float_class_normal;
+            part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
+            part.frac <<= shift;
+        }
+    } else {
+        part.cls = float_class_normal;
+        part.exp -= parm->exp_bias;
+        part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
+    }
+    return part;
+}
+
+/* Round and uncanonicalize a floating-point number by parts. There
+ * are FRAC_SHIFT bits that may require rounding at the bottom of the
+ * fraction; these bits will be removed. The exponent will be biased
+ * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
+ */
+
+static FloatParts round_canonical(FloatParts p, float_status *s,
+                                  const FloatFmt *parm)
+{
+    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
+    const uint64_t round_mask = parm->round_mask;
+    const uint64_t roundeven_mask = parm->roundeven_mask;
+    const int exp_max = parm->exp_max;
+    const int frac_shift = parm->frac_shift;
+    uint64_t frac, inc;
+    int exp, flags = 0;
+    bool overflow_norm;
+
+    frac = p.frac;
+    exp = p.exp;
+
+    switch (p.cls) {
+    case float_class_normal:
+        switch (s->float_rounding_mode) {
+        case float_round_nearest_even:
+            overflow_norm = false;
+            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
+            break;
+        case float_round_ties_away:
+            overflow_norm = false;
+            inc = frac_lsbm1;
+            break;
+        case float_round_to_zero:
+            overflow_norm = true;
+            inc = 0;
+            break;
+        case float_round_up:
+            inc = p.sign ? 0 : round_mask;
+            overflow_norm = p.sign;
+            break;
+        case float_round_down:
+            inc = p.sign ? round_mask : 0;
+            overflow_norm = !p.sign;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        exp += parm->exp_bias;
+        if (likely(exp > 0)) {
+            if (frac & round_mask) {
+                flags |= float_flag_inexact;
+                frac += inc;
+                if (frac & DECOMPOSED_OVERFLOW_BIT) {
+                    frac >>= 1;
+                    exp++;
+                }
+            }
+            frac >>= frac_shift;
+
+            if (unlikely(exp >= exp_max)) {
+                flags |= float_flag_overflow | float_flag_inexact;
+                if (overflow_norm) {
+                    exp = exp_max - 1;
+                    frac = -1;
+                } else {
+                    p.cls = float_class_inf;
+                    goto do_inf;
+                }
+            }
+        } else if (s->flush_to_zero) {
+            flags |= float_flag_output_denormal;
+            p.cls = float_class_zero;
+            goto do_zero;
+        } else {
+            bool is_tiny = (s->float_detect_tininess
+                            == float_tininess_before_rounding)
+                        || (exp < 0)
+                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
+
+            shift64RightJamming(frac, 1 - exp, &frac);
+            if (frac & round_mask) {
+                /* Need to recompute round-to-even.  */
+                if (s->float_rounding_mode == float_round_nearest_even) {
+                    inc = ((frac & roundeven_mask) != frac_lsbm1
+                           ? frac_lsbm1 : 0);
+                }
+                flags |= float_flag_inexact;
+                frac += inc;
+            }
+
+            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
+            frac >>= frac_shift;
+
+            if (is_tiny && (flags & float_flag_inexact)) {
+                flags |= float_flag_underflow;
+            }
+            if (exp == 0 && frac == 0) {
+                p.cls = float_class_zero;
+            }
+        }
+        break;
+
+    case float_class_zero:
+    do_zero:
+        exp = 0;
+        frac = 0;
+        break;
+
+    case float_class_inf:
+    do_inf:
+        exp = exp_max;
+        frac = 0;
+        break;
+
+    case float_class_qnan:
+    case float_class_snan:
+        exp = exp_max;
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
+
+    float_raise(flags, s);
+    p.exp = exp;
+    p.frac = frac;
+    return p;
+}
+
+static FloatParts float16_unpack_canonical(float16 f, float_status *s)
+{
+    return canonicalize(float16_unpack_raw(f), &float16_params, s);
+}
+
+static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
+{
+    switch (p.cls) {
+    case float_class_dnan:
+        return float16_default_nan(s);
+    case float_class_msnan:
+        return float16_maybe_silence_nan(float16_pack_raw(p), s);
+    default:
+        p = round_canonical(p, s, &float16_params);
+        return float16_pack_raw(p);
+    }
+}
+
+static FloatParts float32_unpack_canonical(float32 f, float_status *s)
+{
+    return canonicalize(float32_unpack_raw(f), &float32_params, s);
+}
+
+static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
+{
+    switch (p.cls) {
+    case float_class_dnan:
+        return float32_default_nan(s);
+    case float_class_msnan:
+        return float32_maybe_silence_nan(float32_pack_raw(p), s);
+    default:
+        p = round_canonical(p, s, &float32_params);
+        return float32_pack_raw(p);
+    }
+}
+
+static FloatParts float64_unpack_canonical(float64 f, float_status *s)
+{
+    return canonicalize(float64_unpack_raw(f), &float64_params, s);
+}
+
+static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
+{
+    switch (p.cls) {
+    case float_class_dnan:
+        return float64_default_nan(s);
+    case float_class_msnan:
+        return float64_maybe_silence_nan(float64_pack_raw(p), s);
+    default:
+        p = round_canonical(p, s, &float64_params);
+        return float64_pack_raw(p);
+    }
+}
+
+/* Simple helpers for checking if what NaN we have */
+static bool is_nan(FloatClass c)
+{
+    return unlikely(c >= float_class_qnan);
+}
+static bool is_snan(FloatClass c)
+{
+    return c == float_class_snan;
+}
+static bool is_qnan(FloatClass c)
+{
+    return c == float_class_qnan;
+}
+
+static FloatParts return_nan(FloatParts a, float_status *s)
+{
+    switch (a.cls) {
+    case float_class_snan:
+        s->float_exception_flags |= float_flag_invalid;
+        a.cls = float_class_msnan;
+        /* fall through */
+    case float_class_qnan:
+        if (s->default_nan_mode) {
+            a.cls = float_class_dnan;
+        }
+        break;
+
+    default:
+        g_assert_not_reached();
+    }
+    return a;
+}
+
+static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
+{
+    if (is_snan(a.cls) || is_snan(b.cls)) {
+        s->float_exception_flags |= float_flag_invalid;
+    }
+
+    if (s->default_nan_mode) {
+        a.cls = float_class_dnan;
+    } else {
+        if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
+                    is_qnan(b.cls), is_snan(b.cls),
+                    a.frac > b.frac ||
+                    (a.frac == b.frac && a.sign < b.sign))) {
+            a = b;
+        }
+        a.cls = float_class_msnan;
+    }
+    return a;
+}
+
+static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
+                                  bool inf_zero, float_status *s)
+{
+    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
+        s->float_exception_flags |= float_flag_invalid;
+    }
+
+    if (s->default_nan_mode) {
+        a.cls = float_class_dnan;
+    } else {
+        switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
+                              is_qnan(b.cls), is_snan(b.cls),
+                              is_qnan(c.cls), is_snan(c.cls),
+                              inf_zero, s)) {
+        case 0:
+            break;
+        case 1:
+            a = b;
+            break;
+        case 2:
+            a = c;
+            break;
+        case 3:
+            a.cls = float_class_dnan;
+            return a;
+        default:
+            g_assert_not_reached();
+        }
+
+        a.cls = float_class_msnan;
+    }
+    return a;
+}
+
+/*
+ * Returns the result of adding or subtracting the values of the
+ * floating-point values `a' and `b'. The operation is performed
+ * according to the IEC/IEEE Standard for Binary Floating-Point
+ * Arithmetic.
+ */
+
+static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
+                                float_status *s)
+{
+    bool a_sign = a.sign;
+    bool b_sign = b.sign ^ subtract;
+
+    if (a_sign != b_sign) {
+        /* Subtraction */
+
+        if (a.cls == float_class_normal && b.cls == float_class_normal) {
+            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
+                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
+                a.frac = a.frac - b.frac;
+            } else {
+                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
+                a.frac = b.frac - a.frac;
+                a.exp = b.exp;
+                a_sign ^= 1;
+            }
+
+            if (a.frac == 0) {
+                a.cls = float_class_zero;
+                a.sign = s->float_rounding_mode == float_round_down;
+            } else {
+                int shift = clz64(a.frac) - 1;
+                a.frac = a.frac << shift;
+                a.exp = a.exp - shift;
+                a.sign = a_sign;
+            }
+            return a;
+        }
+        if (is_nan(a.cls) || is_nan(b.cls)) {
+            return pick_nan(a, b, s);
+        }
+        if (a.cls == float_class_inf) {
+            if (b.cls == float_class_inf) {
+                float_raise(float_flag_invalid, s);
+                a.cls = float_class_dnan;
+            }
+            return a;
+        }
+        if (a.cls == float_class_zero && b.cls == float_class_zero) {
+            a.sign = s->float_rounding_mode == float_round_down;
+            return a;
+        }
+        if (a.cls == float_class_zero || b.cls == float_class_inf) {
+            b.sign = a_sign ^ 1;
+            return b;
+        }
+        if (b.cls == float_class_zero) {
+            return a;
+        }
+    } else {
+        /* Addition */
+        if (a.cls == float_class_normal && b.cls == float_class_normal) {
+            if (a.exp > b.exp) {
+                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
+            } else if (a.exp < b.exp) {
+                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
+                a.exp = b.exp;
+            }
+            a.frac += b.frac;
+            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
+                a.frac >>= 1;
+                a.exp += 1;
+            }
+            return a;
+        }
+        if (is_nan(a.cls) || is_nan(b.cls)) {
+            return pick_nan(a, b, s);
+        }
+        if (a.cls == float_class_inf || b.cls == float_class_zero) {
+            return a;
+        }
+        if (b.cls == float_class_inf || a.cls == float_class_zero) {
+            b.sign = b_sign;
+            return b;
+        }
+    }
+    g_assert_not_reached();
+}
+
+/*
+ * Returns the result of adding or subtracting the floating-point
+ * values `a' and `b'. The operation is performed according to the
+ * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+ */
+
+float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
+                                              float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, false, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
+                                             float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pb = float32_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, false, status);
+
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
+                                             float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pb = float64_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, false, status);
+
+    return float64_round_pack_canonical(pr, status);
+}
+
+float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
+                                             float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, true, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
+                                             float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pb = float32_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, true, status);
+
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
+                                             float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pb = float64_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, true, status);
+
+    return float64_round_pack_canonical(pr, status);
+}
+
+/*
+ * Returns the result of multiplying the floating-point values `a' and
+ * `b'. The operation is performed according to the IEC/IEEE Standard
+ * for Binary Floating-Point Arithmetic.
+ */
+
+static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
+{
+    bool sign = a.sign ^ b.sign;
+
+    if (a.cls == float_class_normal && b.cls == float_class_normal) {
+        uint64_t hi, lo;
+        int exp = a.exp + b.exp;
+
+        mul64To128(a.frac, b.frac, &hi, &lo);
+        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
+        if (lo & DECOMPOSED_OVERFLOW_BIT) {
+            shift64RightJamming(lo, 1, &lo);
+            exp += 1;
+        }
+
+        /* Re-use a */
+        a.exp = exp;
+        a.sign = sign;
+        a.frac = lo;
+        return a;
+    }
+    /* handle all the NaN cases */
+    if (is_nan(a.cls) || is_nan(b.cls)) {
+        return pick_nan(a, b, s);
+    }
+    /* Inf * Zero == NaN */
+    if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
+        (a.cls == float_class_zero && b.cls == float_class_inf)) {
+        s->float_exception_flags |= float_flag_invalid;
+        a.cls = float_class_dnan;
+        a.sign = sign;
+        return a;
+    }
+    /* Multiply by 0 or Inf */
+    if (a.cls == float_class_inf || a.cls == float_class_zero) {
+        a.sign = sign;
+        return a;
+    }
+    if (b.cls == float_class_inf || b.cls == float_class_zero) {
+        b.sign = sign;
+        return b;
+    }
+    g_assert_not_reached();
+}
+
+float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
+                                             float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pr = mul_floats(pa, pb, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
+                                             float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pb = float32_unpack_canonical(b, status);
+    FloatParts pr = mul_floats(pa, pb, status);
+
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
+                                             float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pb = float64_unpack_canonical(b, status);
+    FloatParts pr = mul_floats(pa, pb, status);
+
+    return float64_round_pack_canonical(pr, status);
+}
+
+/*
+ * Returns the result of multiplying the floating-point values `a' and
+ * `b' then adding 'c', with no intermediate rounding step after the
+ * multiplication. The operation is performed according to the
+ * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
+ * The flags argument allows the caller to select negation of the
+ * addend, the intermediate product, or the final result. (The
+ * difference between this and having the caller do a separate
+ * negation is that negating externally will flip the sign bit on
+ * NaNs.)
+ */
+
+static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
+                                int flags, float_status *s)
+{
+    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
+                    ((1 << float_class_inf) | (1 << float_class_zero));
+    bool p_sign;
+    bool sign_flip = flags & float_muladd_negate_result;
+    FloatClass p_class;
+    uint64_t hi, lo;
+    int p_exp;
+
+    /* It is implementation-defined whether the cases of (0,inf,qnan)
+     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+     * they return if they do), so we have to hand this information
+     * off to the target-specific pick-a-NaN routine.
+     */
+    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
+        return pick_nan_muladd(a, b, c, inf_zero, s);
+    }
+
+    if (inf_zero) {
+        s->float_exception_flags |= float_flag_invalid;
+        a.cls = float_class_dnan;
+        return a;
+    }
+
+    if (flags & float_muladd_negate_c) {
+        c.sign ^= 1;
+    }
+
+    p_sign = a.sign ^ b.sign;
+
+    if (flags & float_muladd_negate_product) {
+        p_sign ^= 1;
+    }
+
+    if (a.cls == float_class_inf || b.cls == float_class_inf) {
+        p_class = float_class_inf;
+    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
+        p_class = float_class_zero;
+    } else {
+        p_class = float_class_normal;
+    }
+
+    if (c.cls == float_class_inf) {
+        if (p_class == float_class_inf && p_sign != c.sign) {
+            s->float_exception_flags |= float_flag_invalid;
+            a.cls = float_class_dnan;
+        } else {
+            a.cls = float_class_inf;
+            a.sign = c.sign ^ sign_flip;
+        }
+        return a;
+    }
+
+    if (p_class == float_class_inf) {
+        a.cls = float_class_inf;
+        a.sign = p_sign ^ sign_flip;
+        return a;
+    }
+
+    if (p_class == float_class_zero) {
+        if (c.cls == float_class_zero) {
+            if (p_sign != c.sign) {
+                p_sign = s->float_rounding_mode == float_round_down;
+            }
+            c.sign = p_sign;
+        } else if (flags & float_muladd_halve_result) {
+            c.exp -= 1;
+        }
+        c.sign ^= sign_flip;
+        return c;
+    }
+
+    /* a & b should be normals now... */
+    assert(a.cls == float_class_normal &&
+           b.cls == float_class_normal);
+
+    p_exp = a.exp + b.exp;
+
+    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
+     * result.
+     */
+    mul64To128(a.frac, b.frac, &hi, &lo);
+    /* binary point now at bit 124 */
+
+    /* check for overflow */
+    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
+        shift128RightJamming(hi, lo, 1, &hi, &lo);
+        p_exp += 1;
+    }
+
+    /* + add/sub */
+    if (c.cls == float_class_zero) {
+        /* move binary point back to 62 */
+        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
+    } else {
+        int exp_diff = p_exp - c.exp;
+        if (p_sign == c.sign) {
+            /* Addition */
+            if (exp_diff <= 0) {
+                shift128RightJamming(hi, lo,
+                                     DECOMPOSED_BINARY_POINT - exp_diff,
+                                     &hi, &lo);
+                lo += c.frac;
+                p_exp = c.exp;
+            } else {
+                uint64_t c_hi, c_lo;
+                /* shift c to the same binary point as the product (124) */
+                c_hi = c.frac >> 2;
+                c_lo = 0;
+                shift128RightJamming(c_hi, c_lo,
+                                     exp_diff,
+                                     &c_hi, &c_lo);
+                add128(hi, lo, c_hi, c_lo, &hi, &lo);
+                /* move binary point back to 62 */
+                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
+            }
+
+            if (lo & DECOMPOSED_OVERFLOW_BIT) {
+                shift64RightJamming(lo, 1, &lo);
+                p_exp += 1;
+            }
+
+        } else {
+            /* Subtraction */
+            uint64_t c_hi, c_lo;
+            /* make C binary point match product at bit 124 */
+            c_hi = c.frac >> 2;
+            c_lo = 0;
+
+            if (exp_diff <= 0) {
+                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
+                if (exp_diff == 0
+                    &&
+                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
+                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
+                } else {
+                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
+                    p_sign ^= 1;
+                    p_exp = c.exp;
+                }
+            } else {
+                shift128RightJamming(c_hi, c_lo,
+                                     exp_diff,
+                                     &c_hi, &c_lo);
+                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
+            }
+
+            if (hi == 0 && lo == 0) {
+                a.cls = float_class_zero;
+                a.sign = s->float_rounding_mode == float_round_down;
+                a.sign ^= sign_flip;
+                return a;
+            } else {
+                int shift;
+                if (hi != 0) {
+                    shift = clz64(hi);
+                } else {
+                    shift = clz64(lo) + 64;
+                }
+                /* Normalizing to a binary point of 124 is the
+                   correct adjust for the exponent.  However since we're
+                   shifting, we might as well put the binary point back
+                   at 62 where we really want it.  Therefore shift as
+                   if we're leaving 1 bit at the top of the word, but
+                   adjust the exponent as if we're leaving 3 bits.  */
+                shift -= 1;
+                if (shift >= 64) {
+                    lo = lo << (shift - 64);
+                } else {
+                    hi = (hi << shift) | (lo >> (64 - shift));
+                    lo = hi | ((lo << shift) != 0);
+                }
+                p_exp -= shift - 2;
+            }
+        }
+    }
+
+    if (flags & float_muladd_halve_result) {
+        p_exp -= 1;
+    }
+
+    /* finally prepare our result */
+    a.cls = float_class_normal;
+    a.sign = p_sign ^ sign_flip;
+    a.exp = p_exp;
+    a.frac = lo;
+
+    return a;
+}
+
+float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
+                                                int flags, float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pc = float16_unpack_canonical(c, status);
+    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
+                                                int flags, float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pb = float32_unpack_canonical(b, status);
+    FloatParts pc = float32_unpack_canonical(c, status);
+    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
+
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
+                                                int flags, float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pb = float64_unpack_canonical(b, status);
+    FloatParts pc = float64_unpack_canonical(c, status);
+    FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
+
+    return float64_round_pack_canonical(pr, status);
+}
+
+/*
+ * Returns the result of dividing the floating-point value `a' by the
+ * corresponding value `b'. The operation is performed according to
+ * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+ */
+
+static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
+{
+    bool sign = a.sign ^ b.sign;
+
+    if (a.cls == float_class_normal && b.cls == float_class_normal) {
+        uint64_t temp_lo, temp_hi;
+        int exp = a.exp - b.exp;
+        if (a.frac < b.frac) {
+            exp -= 1;
+            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
+                              &temp_hi, &temp_lo);
+        } else {
+            shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
+                              &temp_hi, &temp_lo);
+        }
+        /* LSB of quot is set if inexact which roundandpack will use
+         * to set flags. Yet again we re-use a for the result */
+        a.frac = div128To64(temp_lo, temp_hi, b.frac);
+        a.sign = sign;
+        a.exp = exp;
+        return a;
+    }
+    /* handle all the NaN cases */
+    if (is_nan(a.cls) || is_nan(b.cls)) {
+        return pick_nan(a, b, s);
+    }
+    /* 0/0 or Inf/Inf */
+    if (a.cls == b.cls
+        &&
+        (a.cls == float_class_inf || a.cls == float_class_zero)) {
+        s->float_exception_flags |= float_flag_invalid;
+        a.cls = float_class_dnan;
+        return a;
+    }
+    /* Div 0 => Inf */
+    if (b.cls == float_class_zero) {
+        s->float_exception_flags |= float_flag_divbyzero;
+        a.cls = float_class_inf;
+        a.sign = sign;
+        return a;
+    }
+    /* Inf / x or 0 / x */
+    if (a.cls == float_class_inf || a.cls == float_class_zero) {
+        a.sign = sign;
+        return a;
+    }
+    /* Div by Inf */
+    if (b.cls == float_class_inf) {
+        a.cls = float_class_zero;
+        a.sign = sign;
+        return a;
+    }
+    g_assert_not_reached();
+}
+
+float16 float16_div(float16 a, float16 b, float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pr = div_floats(pa, pb, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 float32_div(float32 a, float32 b, float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pb = float32_unpack_canonical(b, status);
+    FloatParts pr = div_floats(pa, pb, status);
+
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 float64_div(float64 a, float64 b, float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pb = float64_unpack_canonical(b, status);
+    FloatParts pr = div_floats(pa, pb, status);
+
+    return float64_round_pack_canonical(pr, status);
+}
+
+/*
+ * Rounds the floating-point value `a' to an integer, and returns the
+ * result as a floating-point value. The operation is performed
+ * according to the IEC/IEEE Standard for Binary Floating-Point
+ * Arithmetic.
+ */
+
+static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
+{
+    if (is_nan(a.cls)) {
+        return return_nan(a, s);
+    }
+
+    switch (a.cls) {
+    case float_class_zero:
+    case float_class_inf:
+    case float_class_qnan:
+        /* already "integral" */
+        break;
+    case float_class_normal:
+        if (a.exp >= DECOMPOSED_BINARY_POINT) {
+            /* already integral */
+            break;
+        }
+        if (a.exp < 0) {
+            bool one;
+            /* all fractional */
+            s->float_exception_flags |= float_flag_inexact;
+            switch (rounding_mode) {
+            case float_round_nearest_even:
+                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
+                break;
+            case float_round_ties_away:
+                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
+                break;
+            case float_round_to_zero:
+                one = false;
+                break;
+            case float_round_up:
+                one = !a.sign;
+                break;
+            case float_round_down:
+                one = a.sign;
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (one) {
+                a.frac = DECOMPOSED_IMPLICIT_BIT;
+                a.exp = 0;
+            } else {
+                a.cls = float_class_zero;
+            }
+        } else {
+            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
+            uint64_t frac_lsbm1 = frac_lsb >> 1;
+            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
+            uint64_t rnd_mask = rnd_even_mask >> 1;
+            uint64_t inc;
+
+            switch (rounding_mode) {
+            case float_round_nearest_even:
+                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
+                break;
+            case float_round_ties_away:
+                inc = frac_lsbm1;
+                break;
+            case float_round_to_zero:
+                inc = 0;
+                break;
+            case float_round_up:
+                inc = a.sign ? 0 : rnd_mask;
+                break;
+            case float_round_down:
+                inc = a.sign ? rnd_mask : 0;
+                break;
+            default:
+                g_assert_not_reached();
+            }
+
+            if (a.frac & rnd_mask) {
+                s->float_exception_flags |= float_flag_inexact;
+                a.frac += inc;
+                a.frac &= ~rnd_mask;
+                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
+                    a.frac >>= 1;
+                    a.exp++;
+                }
+            }
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return a;
+}
+
+float16 float16_round_to_int(float16 a, float_status *s)
+{
+    FloatParts pa = float16_unpack_canonical(a, s);
+    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
+    return float16_round_pack_canonical(pr, s);
+}
+
+float32 float32_round_to_int(float32 a, float_status *s)
+{
+    FloatParts pa = float32_unpack_canonical(a, s);
+    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
+    return float32_round_pack_canonical(pr, s);
+}
+
+float64 float64_round_to_int(float64 a, float_status *s)
+{
+    FloatParts pa = float64_unpack_canonical(a, s);
+    FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
+    return float64_round_pack_canonical(pr, s);
+}
+
+float64 float64_trunc_to_int(float64 a, float_status *s)
+{
+    FloatParts pa = float64_unpack_canonical(a, s);
+    FloatParts pr = round_to_int(pa, float_round_to_zero, s);
+    return float64_round_pack_canonical(pr, s);
+}
+
+/*
+ * Returns the result of converting the floating-point value `a' to
+ * the two's complement integer format. The conversion is performed
+ * according to the IEC/IEEE Standard for Binary Floating-Point
+ * Arithmetic---which means in particular that the conversion is
+ * rounded according to the current rounding mode. If `a' is a NaN,
+ * the largest positive integer is returned. Otherwise, if the
+ * conversion overflows, the largest integer with the same sign as `a'
+ * is returned.
+*/
+
+static int64_t round_to_int_and_pack(FloatParts in, int rmode,
+                                     int64_t min, int64_t max,
+                                     float_status *s)
+{
+    uint64_t r;
+    int orig_flags = get_float_exception_flags(s);
+    FloatParts p = round_to_int(in, rmode, s);
+
+    switch (p.cls) {
+    case float_class_snan:
+    case float_class_qnan:
+        return max;
+    case float_class_inf:
+        return p.sign ? min : max;
+    case float_class_zero:
+        return 0;
+    case float_class_normal:
+        if (p.exp < DECOMPOSED_BINARY_POINT) {
+            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
+        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
+            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
+        } else {
+            r = UINT64_MAX;
+        }
+        if (p.sign) {
+            if (r < -(uint64_t) min) {
+                return -r;
+            } else {
+                s->float_exception_flags = orig_flags | float_flag_invalid;
+                return min;
+            }
+        } else {
+            if (r < max) {
+                return r;
+            } else {
+                s->float_exception_flags = orig_flags | float_flag_invalid;
+                return max;
+            }
+        }
+    default:
+        g_assert_not_reached();
+    }
+}
+
+#define FLOAT_TO_INT(fsz, isz)                                          \
+int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
+                                                float_status *s)        \
+{                                                                       \
+    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
+    return round_to_int_and_pack(p, s->float_rounding_mode,             \
+                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
+                                 s);                                    \
+}                                                                       \
+                                                                        \
+int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
+ (float ## fsz a, float_status *s)                                      \
+{                                                                       \
+    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
+    return round_to_int_and_pack(p, float_round_to_zero,                \
+                                 INT ## isz ## _MIN, INT ## isz ## _MAX,\
+                                 s);                                    \
+}
+
+FLOAT_TO_INT(16, 16)
+FLOAT_TO_INT(16, 32)
+FLOAT_TO_INT(16, 64)
+
+FLOAT_TO_INT(32, 16)
+FLOAT_TO_INT(32, 32)
+FLOAT_TO_INT(32, 64)
+
+FLOAT_TO_INT(64, 16)
+FLOAT_TO_INT(64, 32)
+FLOAT_TO_INT(64, 64)
+
+#undef FLOAT_TO_INT
+
+/*
+ *  Returns the result of converting the floating-point value `a' to
+ *  the unsigned integer format. The conversion is performed according
+ *  to the IEC/IEEE Standard for Binary Floating-Point
+ *  Arithmetic---which means in particular that the conversion is
+ *  rounded according to the current rounding mode. If `a' is a NaN,
+ *  the largest unsigned integer is returned. Otherwise, if the
+ *  conversion overflows, the largest unsigned integer is returned. If
+ *  the 'a' is negative, the result is rounded and zero is returned;
+ *  values that do not round to zero will raise the inexact exception
+ *  flag.
+ */
+
+static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
+                                       float_status *s)
+{
+    int orig_flags = get_float_exception_flags(s);
+    FloatParts p = round_to_int(in, rmode, s);
+
+    switch (p.cls) {
+    case float_class_snan:
+    case float_class_qnan:
+        s->float_exception_flags = orig_flags | float_flag_invalid;
+        return max;
+    case float_class_inf:
+        return p.sign ? 0 : max;
+    case float_class_zero:
+        return 0;
+    case float_class_normal:
+    {
+        uint64_t r;
+        if (p.sign) {
+            s->float_exception_flags = orig_flags | float_flag_invalid;
+            return 0;
+        }
+
+        if (p.exp < DECOMPOSED_BINARY_POINT) {
+            r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
+        } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
+            r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
+        } else {
+            s->float_exception_flags = orig_flags | float_flag_invalid;
+            return max;
+        }
+
+        /* For uint64 this will never trip, but if p.exp is too large
+         * to shift a decomposed fraction we shall have exited via the
+         * 3rd leg above.
+         */
+        if (r > max) {
+            s->float_exception_flags = orig_flags | float_flag_invalid;
+            return max;
+        } else {
+            return r;
+        }
+    }
+    default:
+        g_assert_not_reached();
+    }
+}
+
+#define FLOAT_TO_UINT(fsz, isz) \
+uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
+                                                  float_status *s)      \
+{                                                                       \
+    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
+    return round_to_uint_and_pack(p, s->float_rounding_mode,            \
+                                 UINT ## isz ## _MAX, s);               \
+}                                                                       \
+                                                                        \
+uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
+ (float ## fsz a, float_status *s)                                      \
+{                                                                       \
+    FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
+    return round_to_uint_and_pack(p, s->float_rounding_mode,            \
+                                 UINT ## isz ## _MAX, s);               \
+}
+
+FLOAT_TO_UINT(16, 16)
+FLOAT_TO_UINT(16, 32)
+FLOAT_TO_UINT(16, 64)
+
+FLOAT_TO_UINT(32, 16)
+FLOAT_TO_UINT(32, 32)
+FLOAT_TO_UINT(32, 64)
+
+FLOAT_TO_UINT(64, 16)
+FLOAT_TO_UINT(64, 32)
+FLOAT_TO_UINT(64, 64)
+
+#undef FLOAT_TO_UINT
+
+/*
+ * Integer to float conversions
+ *
+ * Returns the result of converting the two's complement integer `a'
+ * to the floating-point format. The conversion is performed according
+ * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+ */
+
+static FloatParts int_to_float(int64_t a, float_status *status)
+{
+    FloatParts r;
+    if (a == 0) {
+        r.cls = float_class_zero;
+        r.sign = false;
+    } else if (a == (1ULL << 63)) {
+        r.cls = float_class_normal;
+        r.sign = true;
+        r.frac = DECOMPOSED_IMPLICIT_BIT;
+        r.exp = 63;
+    } else {
+        uint64_t f;
+        if (a < 0) {
+            f = -a;
+            r.sign = true;
+        } else {
+            f = a;
+            r.sign = false;
+        }
+        int shift = clz64(f) - 1;
+        r.cls = float_class_normal;
+        r.exp = (DECOMPOSED_BINARY_POINT - shift);
+        r.frac = f << shift;
+    }
+
+    return r;
+}
+
+float16 int64_to_float16(int64_t a, float_status *status)
+{
+    FloatParts pa = int_to_float(a, status);
+    return float16_round_pack_canonical(pa, status);
+}
+
+float16 int32_to_float16(int32_t a, float_status *status)
+{
+    return int64_to_float16(a, status);
+}
+
+float16 int16_to_float16(int16_t a, float_status *status)
+{
+    return int64_to_float16(a, status);
+}
+
+float32 int64_to_float32(int64_t a, float_status *status)
+{
+    FloatParts pa = int_to_float(a, status);
+    return float32_round_pack_canonical(pa, status);
+}
+
+float32 int32_to_float32(int32_t a, float_status *status)
+{
+    return int64_to_float32(a, status);
+}
+
+float32 int16_to_float32(int16_t a, float_status *status)
+{
+    return int64_to_float32(a, status);
+}
+
+float64 int64_to_float64(int64_t a, float_status *status)
+{
+    FloatParts pa = int_to_float(a, status);
+    return float64_round_pack_canonical(pa, status);
+}
+
+float64 int32_to_float64(int32_t a, float_status *status)
+{
+    return int64_to_float64(a, status);
+}
+
+float64 int16_to_float64(int16_t a, float_status *status)
+{
+    return int64_to_float64(a, status);
+}
+
+
+/*
+ * Unsigned Integer to float conversions
+ *
+ * Returns the result of converting the unsigned integer `a' to the
+ * floating-point format. The conversion is performed according to the
+ * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+ */
+
+static FloatParts uint_to_float(uint64_t a, float_status *status)
+{
+    FloatParts r = { .sign = false};
+
+    if (a == 0) {
+        r.cls = float_class_zero;
+    } else {
+        int spare_bits = clz64(a) - 1;
+        r.cls = float_class_normal;
+        r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
+        if (spare_bits < 0) {
+            shift64RightJamming(a, -spare_bits, &a);
+            r.frac = a;
+        } else {
+            r.frac = a << spare_bits;
+        }
+    }
+
+    return r;
+}
+
+float16 uint64_to_float16(uint64_t a, float_status *status)
+{
+    FloatParts pa = uint_to_float(a, status);
+    return float16_round_pack_canonical(pa, status);
+}
+
+float16 uint32_to_float16(uint32_t a, float_status *status)
+{
+    return uint64_to_float16(a, status);
+}
+
+float16 uint16_to_float16(uint16_t a, float_status *status)
+{
+    return uint64_to_float16(a, status);
+}
+
+float32 uint64_to_float32(uint64_t a, float_status *status)
+{
+    FloatParts pa = uint_to_float(a, status);
+    return float32_round_pack_canonical(pa, status);
+}
+
+float32 uint32_to_float32(uint32_t a, float_status *status)
+{
+    return uint64_to_float32(a, status);
+}
+
+float32 uint16_to_float32(uint16_t a, float_status *status)
+{
+    return uint64_to_float32(a, status);
+}
+
+float64 uint64_to_float64(uint64_t a, float_status *status)
+{
+    FloatParts pa = uint_to_float(a, status);
+    return float64_round_pack_canonical(pa, status);
+}
+
+float64 uint32_to_float64(uint32_t a, float_status *status)
+{
+    return uint64_to_float64(a, status);
+}
+
+float64 uint16_to_float64(uint16_t a, float_status *status)
+{
+    return uint64_to_float64(a, status);
+}
+
+/* Float Min/Max */
+/* min() and max() functions. These can't be implemented as
+ * 'compare and pick one input' because that would mishandle
+ * NaNs and +0 vs -0.
+ *
+ * minnum() and maxnum() functions. These are similar to the min()
+ * and max() functions but if one of the arguments is a QNaN and
+ * the other is numerical then the numerical argument is returned.
+ * SNaNs will get quietened before being returned.
+ * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
+ * and maxNum() operations. min() and max() are the typical min/max
+ * semantics provided by many CPUs which predate that specification.
+ *
+ * minnummag() and maxnummag() functions correspond to minNumMag()
+ * and minNumMag() from the IEEE-754 2008.
+ */
+static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
+                                bool ieee, bool ismag, float_status *s)
+{
+    if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
+        if (ieee) {
+            /* Takes two floating-point values `a' and `b', one of
+             * which is a NaN, and returns the appropriate NaN
+             * result. If either `a' or `b' is a signaling NaN,
+             * the invalid exception is raised.
+             */
+            if (is_snan(a.cls) || is_snan(b.cls)) {
+                return pick_nan(a, b, s);
+            } else if (is_nan(a.cls) && !is_nan(b.cls)) {
+                return b;
+            } else if (is_nan(b.cls) && !is_nan(a.cls)) {
+                return a;
+            }
+        }
+        return pick_nan(a, b, s);
+    } else {
+        int a_exp, b_exp;
+        bool a_sign, b_sign;
+
+        switch (a.cls) {
+        case float_class_normal:
+            a_exp = a.exp;
+            break;
+        case float_class_inf:
+            a_exp = INT_MAX;
+            break;
+        case float_class_zero:
+            a_exp = INT_MIN;
+            break;
+        default:
+            g_assert_not_reached();
+            break;
+        }
+        switch (b.cls) {
+        case float_class_normal:
+            b_exp = b.exp;
+            break;
+        case float_class_inf:
+            b_exp = INT_MAX;
+            break;
+        case float_class_zero:
+            b_exp = INT_MIN;
+            break;
+        default:
+            g_assert_not_reached();
+            break;
+        }
+
+        a_sign = a.sign;
+        b_sign = b.sign;
+        if (ismag) {
+            a_sign = b_sign = 0;
+        }
+
+        if (a_sign == b_sign) {
+            bool a_less = a_exp < b_exp;
+            if (a_exp == b_exp) {
+                a_less = a.frac < b.frac;
+            }
+            return a_sign ^ a_less ^ ismin ? b : a;
+        } else {
+            return a_sign ^ ismin ? b : a;
+        }
+    }
+}
+
+#define MINMAX(sz, name, ismin, isiee, ismag)                           \
+float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
+                                     float_status *s)                   \
+{                                                                       \
+    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
+    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
+    FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
+                                                                        \
+    return float ## sz ## _round_pack_canonical(pr, s);                 \
+}
+
+MINMAX(16, min, true, false, false)
+MINMAX(16, minnum, true, true, false)
+MINMAX(16, minnummag, true, true, true)
+MINMAX(16, max, false, false, false)
+MINMAX(16, maxnum, false, true, false)
+MINMAX(16, maxnummag, false, true, true)
+
+MINMAX(32, min, true, false, false)
+MINMAX(32, minnum, true, true, false)
+MINMAX(32, minnummag, true, true, true)
+MINMAX(32, max, false, false, false)
+MINMAX(32, maxnum, false, true, false)
+MINMAX(32, maxnummag, false, true, true)
+
+MINMAX(64, min, true, false, false)
+MINMAX(64, minnum, true, true, false)
+MINMAX(64, minnummag, true, true, true)
+MINMAX(64, max, false, false, false)
+MINMAX(64, maxnum, false, true, false)
+MINMAX(64, maxnummag, false, true, true)
+
+#undef MINMAX
+
+/* Floating point compare */
+static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
+                          float_status *s)
+{
+    if (is_nan(a.cls) || is_nan(b.cls)) {
+        if (!is_quiet ||
+            a.cls == float_class_snan ||
+            b.cls == float_class_snan) {
+            s->float_exception_flags |= float_flag_invalid;
+        }
+        return float_relation_unordered;
+    }
+
+    if (a.cls == float_class_zero) {
+        if (b.cls == float_class_zero) {
+            return float_relation_equal;
+        }
+        return b.sign ? float_relation_greater : float_relation_less;
+    } else if (b.cls == float_class_zero) {
+        return a.sign ? float_relation_less : float_relation_greater;
+    }
+
+    /* The only really important thing about infinity is its sign. If
+     * both are infinities the sign marks the smallest of the two.
+     */
+    if (a.cls == float_class_inf) {
+        if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
+            return float_relation_equal;
+        }
+        return a.sign ? float_relation_less : float_relation_greater;
+    } else if (b.cls == float_class_inf) {
+        return b.sign ? float_relation_greater : float_relation_less;
+    }
+
+    if (a.sign != b.sign) {
+        return a.sign ? float_relation_less : float_relation_greater;
+    }
+
+    if (a.exp == b.exp) {
+        if (a.frac == b.frac) {
+            return float_relation_equal;
+        }
+        if (a.sign) {
+            return a.frac > b.frac ?
+                float_relation_less : float_relation_greater;
+        } else {
+            return a.frac > b.frac ?
+                float_relation_greater : float_relation_less;
+        }
+    } else {
+        if (a.sign) {
+            return a.exp > b.exp ? float_relation_less : float_relation_greater;
+        } else {
+            return a.exp > b.exp ? float_relation_greater : float_relation_less;
+        }
+    }
+}
+
+#define COMPARE(sz)                                                     \
+int float ## sz ## _compare(float ## sz a, float ## sz b,               \
+                            float_status *s)                            \
+{                                                                       \
+    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
+    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
+    return compare_floats(pa, pb, false, s);                            \
+}                                                                       \
+int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
+                                  float_status *s)                      \
+{                                                                       \
+    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
+    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
+    return compare_floats(pa, pb, true, s);                             \
+}
+
+COMPARE(16)
+COMPARE(32)
+COMPARE(64)
+
+#undef COMPARE
+
+/* Multiply A by 2 raised to the power N.  */
+static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
+{
+    if (unlikely(is_nan(a.cls))) {
+        return return_nan(a, s);
+    }
+    if (a.cls == float_class_normal) {
+        a.exp += n;
+    }
+    return a;
+}
+
+float16 float16_scalbn(float16 a, int n, float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pr = scalbn_decomposed(pa, n, status);
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 float32_scalbn(float32 a, int n, float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pr = scalbn_decomposed(pa, n, status);
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 float64_scalbn(float64 a, int n, float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pr = scalbn_decomposed(pa, n, status);
+    return float64_round_pack_canonical(pr, status);
+}
+
+/*
+ * Square Root
+ *
+ * The old softfloat code did an approximation step before zeroing in
+ * on the final result. However for simpleness we just compute the
+ * square root by iterating down from the implicit bit to enough extra
+ * bits to ensure we get a correctly rounded result.
+ *
+ * This does mean however the calculation is slower than before,
+ * especially for 64 bit floats.
+ */
+
+static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
+{
+    uint64_t a_frac, r_frac, s_frac;
+    int bit, last_bit;
+
+    if (is_nan(a.cls)) {
+        return return_nan(a, s);
+    }
+    if (a.cls == float_class_zero) {
+        return a;  /* sqrt(+-0) = +-0 */
+    }
+    if (a.sign) {
+        s->float_exception_flags |= float_flag_invalid;
+        a.cls = float_class_dnan;
+        return a;
+    }
+    if (a.cls == float_class_inf) {
+        return a;  /* sqrt(+inf) = +inf */
+    }
+
+    assert(a.cls == float_class_normal);
+
+    /* We need two overflow bits at the top. Adding room for that is a
+     * right shift. If the exponent is odd, we can discard the low bit
+     * by multiplying the fraction by 2; that's a left shift. Combine
+     * those and we shift right if the exponent is even.
+     */
+    a_frac = a.frac;
+    if (!(a.exp & 1)) {
+        a_frac >>= 1;
+    }
+    a.exp >>= 1;
+
+    /* Bit-by-bit computation of sqrt.  */
+    r_frac = 0;
+    s_frac = 0;
+
+    /* Iterate from implicit bit down to the 3 extra bits to compute a
+     * properly rounded result. Remember we've inserted one more bit
+     * at the top, so these positions are one less.
+     */
+    bit = DECOMPOSED_BINARY_POINT - 1;
+    last_bit = MAX(p->frac_shift - 4, 0);
+    do {
+        uint64_t q = 1ULL << bit;
+        uint64_t t_frac = s_frac + q;
+        if (t_frac <= a_frac) {
+            s_frac = t_frac + q;
+            a_frac -= t_frac;
+            r_frac += q;
+        }
+        a_frac <<= 1;
+    } while (--bit >= last_bit);
+
+    /* Undo the right shift done above. If there is any remaining
+     * fraction, the result is inexact. Set the sticky bit.
+     */
+    a.frac = (r_frac << 1) + (a_frac != 0);
+
+    return a;
+}
+
+float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pr = sqrt_float(pa, status, &float16_params);
+    return float16_round_pack_canonical(pr, status);
+}
+
+float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
+{
+    FloatParts pa = float32_unpack_canonical(a, status);
+    FloatParts pr = sqrt_float(pa, status, &float32_params);
+    return float32_round_pack_canonical(pr, status);
+}
+
+float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
+{
+    FloatParts pa = float64_unpack_canonical(a, status);
+    FloatParts pr = sqrt_float(pa, status, &float64_params);
+    return float64_round_pack_canonical(pr, status);
+}
+
+
+/*----------------------------------------------------------------------------
 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 | input.  If `zSign' is 1, the input is negated before being converted to an
@@ -300,39 +2160,6 @@ static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
 }
 
 /*----------------------------------------------------------------------------
-| Returns the fraction bits of the single-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline uint32_t extractFloat32Frac( float32 a )
-{
-
-    return float32_val(a) & 0x007FFFFF;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the exponent bits of the single-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline int extractFloat32Exp(float32 a)
-{
-
-    return ( float32_val(a)>>23 ) & 0xFF;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the sign bit of the single-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline flag extractFloat32Sign( float32 a )
-{
-
-    return float32_val(a)>>31;
-
-}
-
-/*----------------------------------------------------------------------------
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
 *----------------------------------------------------------------------------*/
@@ -493,39 +2320,6 @@ static float32
 }
 
 /*----------------------------------------------------------------------------
-| Returns the fraction bits of the double-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline uint64_t extractFloat64Frac( float64 a )
-{
-
-    return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the exponent bits of the double-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline int extractFloat64Exp(float64 a)
-{
-
-    return ( float64_val(a)>>52 ) & 0x7FF;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the sign bit of the double-precision floating-point value `a'.
-*----------------------------------------------------------------------------*/
-
-static inline flag extractFloat64Sign( float64 a )
-{
-
-    return float64_val(a)>>63;
-
-}
-
-/*----------------------------------------------------------------------------
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
 *----------------------------------------------------------------------------*/
@@ -1289,43 +3083,6 @@ static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
 
 }
 
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 32-bit two's complement integer `a'
-| to the single-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 int32_to_float32(int32_t a, float_status *status)
-{
-    flag zSign;
-
-    if ( a == 0 ) return float32_zero;
-    if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
-    zSign = ( a < 0 );
-    return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 32-bit two's complement integer `a'
-| to the double-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 int32_to_float64(int32_t a, float_status *status)
-{
-    flag zSign;
-    uint32_t absA;
-    int8_t shiftCount;
-    uint64_t zSig;
-
-    if ( a == 0 ) return float64_zero;
-    zSign = ( a < 0 );
-    absA = zSign ? - a : a;
-    shiftCount = countLeadingZeros32( absA ) + 21;
-    zSig = absA;
-    return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
-
-}
 
 /*----------------------------------------------------------------------------
 | Returns the result of converting the 32-bit two's complement integer `a'
@@ -1374,56 +3131,6 @@ float128 int32_to_float128(int32_t a, float_status *status)
 
 /*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit two's complement integer `a'
-| to the single-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 int64_to_float32(int64_t a, float_status *status)
-{
-    flag zSign;
-    uint64_t absA;
-    int8_t shiftCount;
-
-    if ( a == 0 ) return float32_zero;
-    zSign = ( a < 0 );
-    absA = zSign ? - a : a;
-    shiftCount = countLeadingZeros64( absA ) - 40;
-    if ( 0 <= shiftCount ) {
-        return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
-    }
-    else {
-        shiftCount += 7;
-        if ( shiftCount < 0 ) {
-            shift64RightJamming( absA, - shiftCount, &absA );
-        }
-        else {
-            absA <<= shiftCount;
-        }
-        return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
-    }
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 64-bit two's complement integer `a'
-| to the double-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 int64_to_float64(int64_t a, float_status *status)
-{
-    flag zSign;
-
-    if ( a == 0 ) return float64_zero;
-    if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
-        return packFloat64( 1, 0x43E, 0 );
-    }
-    zSign = ( a < 0 );
-    return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 64-bit two's complement integer `a'
 | to the extended double-precision floating-point format.  The conversion
 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 | Arithmetic.
@@ -1478,65 +3185,6 @@ float128 int64_to_float128(int64_t a, float_status *status)
 
 /*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit unsigned integer `a'
-| to the single-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 uint64_to_float32(uint64_t a, float_status *status)
-{
-    int shiftcount;
-
-    if (a == 0) {
-        return float32_zero;
-    }
-
-    /* Determine (left) shift needed to put first set bit into bit posn 23
-     * (since packFloat32() expects the binary point between bits 23 and 22);
-     * this is the fast case for smallish numbers.
-     */
-    shiftcount = countLeadingZeros64(a) - 40;
-    if (shiftcount >= 0) {
-        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
-    }
-    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
-     * expects the binary point between bits 30 and 29, hence the + 7.
-     */
-    shiftcount += 7;
-    if (shiftcount < 0) {
-        shift64RightJamming(a, -shiftcount, &a);
-    } else {
-        a <<= shiftcount;
-    }
-
-    return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 64-bit unsigned integer `a'
-| to the double-precision floating-point format.  The conversion is performed
-| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 uint64_to_float64(uint64_t a, float_status *status)
-{
-    int exp = 0x43C;
-    int shiftcount;
-
-    if (a == 0) {
-        return float64_zero;
-    }
-
-    shiftcount = countLeadingZeros64(a) - 1;
-    if (shiftcount < 0) {
-        shift64RightJamming(a, -shiftcount, &a);
-    } else {
-        a <<= shiftcount;
-    }
-    return roundAndPackFloat64(0, exp - shiftcount, a, status);
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the 64-bit unsigned integer `a'
 | to the quadruple-precision floating-point format.  The conversion is performed
 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 *----------------------------------------------------------------------------*/
@@ -1549,288 +3197,8 @@ float128 uint64_to_float128(uint64_t a, float_status *status)
     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
 }
 
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 32-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| positive integer is returned.  Otherwise, if the conversion overflows, the
-| largest integer with the same sign as `a' is returned.
-*----------------------------------------------------------------------------*/
 
-int32_t float32_to_int32(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    uint64_t aSig64;
 
-    a = float32_squash_input_denormal(a, status);
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
-    if ( aExp ) aSig |= 0x00800000;
-    shiftCount = 0xAF - aExp;
-    aSig64 = aSig;
-    aSig64 <<= 32;
-    if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
-    return roundAndPackInt32(aSign, aSig64, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 32-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.
-| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
-| the conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    int32_t z;
-    a = float32_squash_input_denormal(a, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    shiftCount = aExp - 0x9E;
-    if ( 0 <= shiftCount ) {
-        if ( float32_val(a) != 0xCF000000 ) {
-            float_raise(float_flag_invalid, status);
-            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
-        }
-        return (int32_t) 0x80000000;
-    }
-    else if ( aExp <= 0x7E ) {
-        if (aExp | aSig) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-        return 0;
-    }
-    aSig = ( aSig | 0x00800000 )<<8;
-    z = aSig>>( - shiftCount );
-    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    if ( aSign ) z = - z;
-    return z;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 16-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.
-| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
-| the conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    int32_t z;
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    shiftCount = aExp - 0x8E;
-    if ( 0 <= shiftCount ) {
-        if ( float32_val(a) != 0xC7000000 ) {
-            float_raise(float_flag_invalid, status);
-            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
-                return 0x7FFF;
-            }
-        }
-        return (int32_t) 0xffff8000;
-    }
-    else if ( aExp <= 0x7E ) {
-        if ( aExp | aSig ) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-        return 0;
-    }
-    shiftCount -= 0x10;
-    aSig = ( aSig | 0x00800000 )<<8;
-    z = aSig>>( - shiftCount );
-    if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    if ( aSign ) {
-        z = - z;
-    }
-    return z;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 64-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| positive integer is returned.  Otherwise, if the conversion overflows, the
-| largest integer with the same sign as `a' is returned.
-*----------------------------------------------------------------------------*/
-
-int64_t float32_to_int64(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    uint64_t aSig64, aSigExtra;
-    a = float32_squash_input_denormal(a, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    shiftCount = 0xBE - aExp;
-    if ( shiftCount < 0 ) {
-        float_raise(float_flag_invalid, status);
-        if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
-            return LIT64( 0x7FFFFFFFFFFFFFFF );
-        }
-        return (int64_t) LIT64( 0x8000000000000000 );
-    }
-    if ( aExp ) aSig |= 0x00800000;
-    aSig64 = aSig;
-    aSig64 <<= 40;
-    shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
-    return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 64-bit unsigned integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| unsigned integer is returned.  Otherwise, if the conversion overflows, the
-| largest unsigned integer is returned.  If the 'a' is negative, the result
-| is rounded and zero is returned; values that do not round to zero will
-| raise the inexact exception flag.
-*----------------------------------------------------------------------------*/
-
-uint64_t float32_to_uint64(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    uint64_t aSig64, aSigExtra;
-    a = float32_squash_input_denormal(a, status);
-
-    aSig = extractFloat32Frac(a);
-    aExp = extractFloat32Exp(a);
-    aSign = extractFloat32Sign(a);
-    if ((aSign) && (aExp > 126)) {
-        float_raise(float_flag_invalid, status);
-        if (float32_is_any_nan(a)) {
-            return LIT64(0xFFFFFFFFFFFFFFFF);
-        } else {
-            return 0;
-        }
-    }
-    shiftCount = 0xBE - aExp;
-    if (aExp) {
-        aSig |= 0x00800000;
-    }
-    if (shiftCount < 0) {
-        float_raise(float_flag_invalid, status);
-        return LIT64(0xFFFFFFFFFFFFFFFF);
-    }
-
-    aSig64 = aSig;
-    aSig64 <<= 40;
-    shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
-    return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 64-bit unsigned integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.  If
-| `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
-| conversion overflows, the largest unsigned integer is returned.  If the
-| 'a' is negative, the result is rounded and zero is returned; values that do
-| not round to zero will raise the inexact flag.
-*----------------------------------------------------------------------------*/
-
-uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
-{
-    signed char current_rounding_mode = status->float_rounding_mode;
-    set_float_rounding_mode(float_round_to_zero, status);
-    int64_t v = float32_to_uint64(a, status);
-    set_float_rounding_mode(current_rounding_mode, status);
-    return v;
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the single-precision floating-point value
-| `a' to the 64-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.  If
-| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
-| conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint32_t aSig;
-    uint64_t aSig64;
-    int64_t z;
-    a = float32_squash_input_denormal(a, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    shiftCount = aExp - 0xBE;
-    if ( 0 <= shiftCount ) {
-        if ( float32_val(a) != 0xDF000000 ) {
-            float_raise(float_flag_invalid, status);
-            if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
-                return LIT64( 0x7FFFFFFFFFFFFFFF );
-            }
-        }
-        return (int64_t) LIT64( 0x8000000000000000 );
-    }
-    else if ( aExp <= 0x7E ) {
-        if (aExp | aSig) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-        return 0;
-    }
-    aSig64 = aSig | 0x00800000;
-    aSig64 <<= 40;
-    z = aSig64>>( - shiftCount );
-    if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    if ( aSign ) z = - z;
-    return z;
-
-}
 
 /*----------------------------------------------------------------------------
 | Returns the result of converting the single-precision floating-point value
@@ -1929,436 +3297,6 @@ float128 float32_to_float128(float32 a, float_status *status)
 }
 
 /*----------------------------------------------------------------------------
-| Rounds the single-precision floating-point value `a' to an integer, and
-| returns the result as a single-precision floating-point value.  The
-| operation is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_round_to_int(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    uint32_t lastBitMask, roundBitsMask;
-    uint32_t z;
-    a = float32_squash_input_denormal(a, status);
-
-    aExp = extractFloat32Exp( a );
-    if ( 0x96 <= aExp ) {
-        if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
-            return propagateFloat32NaN(a, a, status);
-        }
-        return a;
-    }
-    if ( aExp <= 0x7E ) {
-        if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
-        status->float_exception_flags |= float_flag_inexact;
-        aSign = extractFloat32Sign( a );
-        switch (status->float_rounding_mode) {
-         case float_round_nearest_even:
-            if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
-                return packFloat32( aSign, 0x7F, 0 );
-            }
-            break;
-        case float_round_ties_away:
-            if (aExp == 0x7E) {
-                return packFloat32(aSign, 0x7F, 0);
-            }
-            break;
-         case float_round_down:
-            return make_float32(aSign ? 0xBF800000 : 0);
-         case float_round_up:
-            return make_float32(aSign ? 0x80000000 : 0x3F800000);
-        }
-        return packFloat32( aSign, 0, 0 );
-    }
-    lastBitMask = 1;
-    lastBitMask <<= 0x96 - aExp;
-    roundBitsMask = lastBitMask - 1;
-    z = float32_val(a);
-    switch (status->float_rounding_mode) {
-    case float_round_nearest_even:
-        z += lastBitMask>>1;
-        if ((z & roundBitsMask) == 0) {
-            z &= ~lastBitMask;
-        }
-        break;
-    case float_round_ties_away:
-        z += lastBitMask >> 1;
-        break;
-    case float_round_to_zero:
-        break;
-    case float_round_up:
-        if (!extractFloat32Sign(make_float32(z))) {
-            z += roundBitsMask;
-        }
-        break;
-    case float_round_down:
-        if (extractFloat32Sign(make_float32(z))) {
-            z += roundBitsMask;
-        }
-        break;
-    default:
-        abort();
-    }
-    z &= ~ roundBitsMask;
-    if (z != float32_val(a)) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    return make_float32(z);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of adding the absolute values of the single-precision
-| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
-| before being returned.  `zSign' is ignored if the result is a NaN.
-| The addition is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
-                              float_status *status)
-{
-    int aExp, bExp, zExp;
-    uint32_t aSig, bSig, zSig;
-    int expDiff;
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    bSig = extractFloat32Frac( b );
-    bExp = extractFloat32Exp( b );
-    expDiff = aExp - bExp;
-    aSig <<= 6;
-    bSig <<= 6;
-    if ( 0 < expDiff ) {
-        if ( aExp == 0xFF ) {
-            if (aSig) {
-                return propagateFloat32NaN(a, b, status);
-            }
-            return a;
-        }
-        if ( bExp == 0 ) {
-            --expDiff;
-        }
-        else {
-            bSig |= 0x20000000;
-        }
-        shift32RightJamming( bSig, expDiff, &bSig );
-        zExp = aExp;
-    }
-    else if ( expDiff < 0 ) {
-        if ( bExp == 0xFF ) {
-            if (bSig) {
-                return propagateFloat32NaN(a, b, status);
-            }
-            return packFloat32( zSign, 0xFF, 0 );
-        }
-        if ( aExp == 0 ) {
-            ++expDiff;
-        }
-        else {
-            aSig |= 0x20000000;
-        }
-        shift32RightJamming( aSig, - expDiff, &aSig );
-        zExp = bExp;
-    }
-    else {
-        if ( aExp == 0xFF ) {
-            if (aSig | bSig) {
-                return propagateFloat32NaN(a, b, status);
-            }
-            return a;
-        }
-        if ( aExp == 0 ) {
-            if (status->flush_to_zero) {
-                if (aSig | bSig) {
-                    float_raise(float_flag_output_denormal, status);
-                }
-                return packFloat32(zSign, 0, 0);
-            }
-            return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
-        }
-        zSig = 0x40000000 + aSig + bSig;
-        zExp = aExp;
-        goto roundAndPack;
-    }
-    aSig |= 0x20000000;
-    zSig = ( aSig + bSig )<<1;
-    --zExp;
-    if ( (int32_t) zSig < 0 ) {
-        zSig = aSig + bSig;
-        ++zExp;
-    }
- roundAndPack:
-    return roundAndPackFloat32(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of subtracting the absolute values of the single-
-| precision floating-point values `a' and `b'.  If `zSign' is 1, the
-| difference is negated before being returned.  `zSign' is ignored if the
-| result is a NaN.  The subtraction is performed according to the IEC/IEEE
-| Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
-                              float_status *status)
-{
-    int aExp, bExp, zExp;
-    uint32_t aSig, bSig, zSig;
-    int expDiff;
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    bSig = extractFloat32Frac( b );
-    bExp = extractFloat32Exp( b );
-    expDiff = aExp - bExp;
-    aSig <<= 7;
-    bSig <<= 7;
-    if ( 0 < expDiff ) goto aExpBigger;
-    if ( expDiff < 0 ) goto bExpBigger;
-    if ( aExp == 0xFF ) {
-        if (aSig | bSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        float_raise(float_flag_invalid, status);
-        return float32_default_nan(status);
-    }
-    if ( aExp == 0 ) {
-        aExp = 1;
-        bExp = 1;
-    }
-    if ( bSig < aSig ) goto aBigger;
-    if ( aSig < bSig ) goto bBigger;
-    return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
- bExpBigger:
-    if ( bExp == 0xFF ) {
-        if (bSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        return packFloat32( zSign ^ 1, 0xFF, 0 );
-    }
-    if ( aExp == 0 ) {
-        ++expDiff;
-    }
-    else {
-        aSig |= 0x40000000;
-    }
-    shift32RightJamming( aSig, - expDiff, &aSig );
-    bSig |= 0x40000000;
- bBigger:
-    zSig = bSig - aSig;
-    zExp = bExp;
-    zSign ^= 1;
-    goto normalizeRoundAndPack;
- aExpBigger:
-    if ( aExp == 0xFF ) {
-        if (aSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        return a;
-    }
-    if ( bExp == 0 ) {
-        --expDiff;
-    }
-    else {
-        bSig |= 0x40000000;
-    }
-    shift32RightJamming( bSig, expDiff, &bSig );
-    aSig |= 0x40000000;
- aBigger:
-    zSig = aSig - bSig;
-    zExp = aExp;
- normalizeRoundAndPack:
-    --zExp;
-    return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of adding the single-precision floating-point values `a'
-| and `b'.  The operation is performed according to the IEC/IEEE Standard for
-| Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_add(float32 a, float32 b, float_status *status)
-{
-    flag aSign, bSign;
-    a = float32_squash_input_denormal(a, status);
-    b = float32_squash_input_denormal(b, status);
-
-    aSign = extractFloat32Sign( a );
-    bSign = extractFloat32Sign( b );
-    if ( aSign == bSign ) {
-        return addFloat32Sigs(a, b, aSign, status);
-    }
-    else {
-        return subFloat32Sigs(a, b, aSign, status);
-    }
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of subtracting the single-precision floating-point values
-| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
-| for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_sub(float32 a, float32 b, float_status *status)
-{
-    flag aSign, bSign;
-    a = float32_squash_input_denormal(a, status);
-    b = float32_squash_input_denormal(b, status);
-
-    aSign = extractFloat32Sign( a );
-    bSign = extractFloat32Sign( b );
-    if ( aSign == bSign ) {
-        return subFloat32Sigs(a, b, aSign, status);
-    }
-    else {
-        return addFloat32Sigs(a, b, aSign, status);
-    }
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of multiplying the single-precision floating-point values
-| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
-| for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_mul(float32 a, float32 b, float_status *status)
-{
-    flag aSign, bSign, zSign;
-    int aExp, bExp, zExp;
-    uint32_t aSig, bSig;
-    uint64_t zSig64;
-    uint32_t zSig;
-
-    a = float32_squash_input_denormal(a, status);
-    b = float32_squash_input_denormal(b, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    bSig = extractFloat32Frac( b );
-    bExp = extractFloat32Exp( b );
-    bSign = extractFloat32Sign( b );
-    zSign = aSign ^ bSign;
-    if ( aExp == 0xFF ) {
-        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        if ( ( bExp | bSig ) == 0 ) {
-            float_raise(float_flag_invalid, status);
-            return float32_default_nan(status);
-        }
-        return packFloat32( zSign, 0xFF, 0 );
-    }
-    if ( bExp == 0xFF ) {
-        if (bSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        if ( ( aExp | aSig ) == 0 ) {
-            float_raise(float_flag_invalid, status);
-            return float32_default_nan(status);
-        }
-        return packFloat32( zSign, 0xFF, 0 );
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
-        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
-    }
-    if ( bExp == 0 ) {
-        if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
-        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
-    }
-    zExp = aExp + bExp - 0x7F;
-    aSig = ( aSig | 0x00800000 )<<7;
-    bSig = ( bSig | 0x00800000 )<<8;
-    shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
-    zSig = zSig64;
-    if ( 0 <= (int32_t) ( zSig<<1 ) ) {
-        zSig <<= 1;
-        --zExp;
-    }
-    return roundAndPackFloat32(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of dividing the single-precision floating-point value `a'
-| by the corresponding value `b'.  The operation is performed according to the
-| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_div(float32 a, float32 b, float_status *status)
-{
-    flag aSign, bSign, zSign;
-    int aExp, bExp, zExp;
-    uint32_t aSig, bSig, zSig;
-    a = float32_squash_input_denormal(a, status);
-    b = float32_squash_input_denormal(b, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    bSig = extractFloat32Frac( b );
-    bExp = extractFloat32Exp( b );
-    bSign = extractFloat32Sign( b );
-    zSign = aSign ^ bSign;
-    if ( aExp == 0xFF ) {
-        if (aSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        if ( bExp == 0xFF ) {
-            if (bSig) {
-                return propagateFloat32NaN(a, b, status);
-            }
-            float_raise(float_flag_invalid, status);
-            return float32_default_nan(status);
-        }
-        return packFloat32( zSign, 0xFF, 0 );
-    }
-    if ( bExp == 0xFF ) {
-        if (bSig) {
-            return propagateFloat32NaN(a, b, status);
-        }
-        return packFloat32( zSign, 0, 0 );
-    }
-    if ( bExp == 0 ) {
-        if ( bSig == 0 ) {
-            if ( ( aExp | aSig ) == 0 ) {
-                float_raise(float_flag_invalid, status);
-                return float32_default_nan(status);
-            }
-            float_raise(float_flag_divbyzero, status);
-            return packFloat32( zSign, 0xFF, 0 );
-        }
-        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
-        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
-    }
-    zExp = aExp - bExp + 0x7D;
-    aSig = ( aSig | 0x00800000 )<<7;
-    bSig = ( bSig | 0x00800000 )<<8;
-    if ( bSig <= ( aSig + aSig ) ) {
-        aSig >>= 1;
-        ++zExp;
-    }
-    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
-    if ( ( zSig & 0x3F ) == 0 ) {
-        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
-    }
-    return roundAndPackFloat32(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
 | Returns the remainder of the single-precision floating-point value `a'
 | with respect to the corresponding value `b'.  The operation is performed
 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
@@ -2460,290 +3398,9 @@ float32 float32_rem(float32 a, float32 b, float_status *status)
     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
 }
 
-/*----------------------------------------------------------------------------
-| Returns the result of multiplying the single-precision floating-point values
-| `a' and `b' then adding 'c', with no intermediate rounding step after the
-| multiplication.  The operation is performed according to the IEC/IEEE
-| Standard for Binary Floating-Point Arithmetic 754-2008.
-| The flags argument allows the caller to select negation of the
-| addend, the intermediate product, or the final result. (The difference
-| between this and having the caller do a separate negation is that negating
-| externally will flip the sign bit on NaNs.)
-*----------------------------------------------------------------------------*/
-
-float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
-                       float_status *status)
-{
-    flag aSign, bSign, cSign, zSign;
-    int aExp, bExp, cExp, pExp, zExp, expDiff;
-    uint32_t aSig, bSig, cSig;
-    flag pInf, pZero, pSign;
-    uint64_t pSig64, cSig64, zSig64;
-    uint32_t pSig;
-    int shiftcount;
-    flag signflip, infzero;
-
-    a = float32_squash_input_denormal(a, status);
-    b = float32_squash_input_denormal(b, status);
-    c = float32_squash_input_denormal(c, status);
-    aSig = extractFloat32Frac(a);
-    aExp = extractFloat32Exp(a);
-    aSign = extractFloat32Sign(a);
-    bSig = extractFloat32Frac(b);
-    bExp = extractFloat32Exp(b);
-    bSign = extractFloat32Sign(b);
-    cSig = extractFloat32Frac(c);
-    cExp = extractFloat32Exp(c);
-    cSign = extractFloat32Sign(c);
-
-    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
-               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
-
-    /* It is implementation-defined whether the cases of (0,inf,qnan)
-     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
-     * they return if they do), so we have to hand this information
-     * off to the target-specific pick-a-NaN routine.
-     */
-    if (((aExp == 0xff) && aSig) ||
-        ((bExp == 0xff) && bSig) ||
-        ((cExp == 0xff) && cSig)) {
-        return propagateFloat32MulAddNaN(a, b, c, infzero, status);
-    }
-
-    if (infzero) {
-        float_raise(float_flag_invalid, status);
-        return float32_default_nan(status);
-    }
-
-    if (flags & float_muladd_negate_c) {
-        cSign ^= 1;
-    }
-
-    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
-
-    /* Work out the sign and type of the product */
-    pSign = aSign ^ bSign;
-    if (flags & float_muladd_negate_product) {
-        pSign ^= 1;
-    }
-    pInf = (aExp == 0xff) || (bExp == 0xff);
-    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
-
-    if (cExp == 0xff) {
-        if (pInf && (pSign ^ cSign)) {
-            /* addition of opposite-signed infinities => InvalidOperation */
-            float_raise(float_flag_invalid, status);
-            return float32_default_nan(status);
-        }
-        /* Otherwise generate an infinity of the same sign */
-        return packFloat32(cSign ^ signflip, 0xff, 0);
-    }
-
-    if (pInf) {
-        return packFloat32(pSign ^ signflip, 0xff, 0);
-    }
-
-    if (pZero) {
-        if (cExp == 0) {
-            if (cSig == 0) {
-                /* Adding two exact zeroes */
-                if (pSign == cSign) {
-                    zSign = pSign;
-                } else if (status->float_rounding_mode == float_round_down) {
-                    zSign = 1;
-                } else {
-                    zSign = 0;
-                }
-                return packFloat32(zSign ^ signflip, 0, 0);
-            }
-            /* Exact zero plus a denorm */
-            if (status->flush_to_zero) {
-                float_raise(float_flag_output_denormal, status);
-                return packFloat32(cSign ^ signflip, 0, 0);
-            }
-        }
-        /* Zero plus something non-zero : just return the something */
-        if (flags & float_muladd_halve_result) {
-            if (cExp == 0) {
-                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
-            }
-            /* Subtract one to halve, and one again because roundAndPackFloat32
-             * wants one less than the true exponent.
-             */
-            cExp -= 2;
-            cSig = (cSig | 0x00800000) << 7;
-            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
-        }
-        return packFloat32(cSign ^ signflip, cExp, cSig);
-    }
-
-    if (aExp == 0) {
-        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
-    }
-    if (bExp == 0) {
-        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
-    }
-
-    /* Calculate the actual result a * b + c */
-
-    /* Multiply first; this is easy. */
-    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
-     * because we want the true exponent, not the "one-less-than"
-     * flavour that roundAndPackFloat32() takes.
-     */
-    pExp = aExp + bExp - 0x7e;
-    aSig = (aSig | 0x00800000) << 7;
-    bSig = (bSig | 0x00800000) << 8;
-    pSig64 = (uint64_t)aSig * bSig;
-    if ((int64_t)(pSig64 << 1) >= 0) {
-        pSig64 <<= 1;
-        pExp--;
-    }
-
-    zSign = pSign ^ signflip;
-
-    /* Now pSig64 is the significand of the multiply, with the explicit bit in
-     * position 62.
-     */
-    if (cExp == 0) {
-        if (!cSig) {
-            /* Throw out the special case of c being an exact zero now */
-            shift64RightJamming(pSig64, 32, &pSig64);
-            pSig = pSig64;
-            if (flags & float_muladd_halve_result) {
-                pExp--;
-            }
-            return roundAndPackFloat32(zSign, pExp - 1,
-                                       pSig, status);
-        }
-        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
-    }
-
-    cSig64 = (uint64_t)cSig << (62 - 23);
-    cSig64 |= LIT64(0x4000000000000000);
-    expDiff = pExp - cExp;
-
-    if (pSign == cSign) {
-        /* Addition */
-        if (expDiff > 0) {
-            /* scale c to match p */
-            shift64RightJamming(cSig64, expDiff, &cSig64);
-            zExp = pExp;
-        } else if (expDiff < 0) {
-            /* scale p to match c */
-            shift64RightJamming(pSig64, -expDiff, &pSig64);
-            zExp = cExp;
-        } else {
-            /* no scaling needed */
-            zExp = cExp;
-        }
-        /* Add significands and make sure explicit bit ends up in posn 62 */
-        zSig64 = pSig64 + cSig64;
-        if ((int64_t)zSig64 < 0) {
-            shift64RightJamming(zSig64, 1, &zSig64);
-        } else {
-            zExp--;
-        }
-    } else {
-        /* Subtraction */
-        if (expDiff > 0) {
-            shift64RightJamming(cSig64, expDiff, &cSig64);
-            zSig64 = pSig64 - cSig64;
-            zExp = pExp;
-        } else if (expDiff < 0) {
-            shift64RightJamming(pSig64, -expDiff, &pSig64);
-            zSig64 = cSig64 - pSig64;
-            zExp = cExp;
-            zSign ^= 1;
-        } else {
-            zExp = pExp;
-            if (cSig64 < pSig64) {
-                zSig64 = pSig64 - cSig64;
-            } else if (pSig64 < cSig64) {
-                zSig64 = cSig64 - pSig64;
-                zSign ^= 1;
-            } else {
-                /* Exact zero */
-                zSign = signflip;
-                if (status->float_rounding_mode == float_round_down) {
-                    zSign ^= 1;
-                }
-                return packFloat32(zSign, 0, 0);
-            }
-        }
-        --zExp;
-        /* Normalize to put the explicit bit back into bit 62. */
-        shiftcount = countLeadingZeros64(zSig64) - 1;
-        zSig64 <<= shiftcount;
-        zExp -= shiftcount;
-    }
-    if (flags & float_muladd_halve_result) {
-        zExp--;
-    }
-
-    shift64RightJamming(zSig64, 32, &zSig64);
-    return roundAndPackFloat32(zSign, zExp, zSig64, status);
-}
 
 
 /*----------------------------------------------------------------------------
-| Returns the square root of the single-precision floating-point value `a'.
-| The operation is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float32 float32_sqrt(float32 a, float_status *status)
-{
-    flag aSign;
-    int aExp, zExp;
-    uint32_t aSig, zSig;
-    uint64_t rem, term;
-    a = float32_squash_input_denormal(a, status);
-
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-    if ( aExp == 0xFF ) {
-        if (aSig) {
-            return propagateFloat32NaN(a, float32_zero, status);
-        }
-        if ( ! aSign ) return a;
-        float_raise(float_flag_invalid, status);
-        return float32_default_nan(status);
-    }
-    if ( aSign ) {
-        if ( ( aExp | aSig ) == 0 ) return a;
-        float_raise(float_flag_invalid, status);
-        return float32_default_nan(status);
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return float32_zero;
-        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
-    }
-    zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
-    aSig = ( aSig | 0x00800000 )<<8;
-    zSig = estimateSqrt32( aExp, aSig ) + 2;
-    if ( ( zSig & 0x7F ) <= 5 ) {
-        if ( zSig < 2 ) {
-            zSig = 0x7FFFFFFF;
-            goto roundAndPack;
-        }
-        aSig >>= aExp & 1;
-        term = ( (uint64_t) zSig ) * zSig;
-        rem = ( ( (uint64_t) aSig )<<32 ) - term;
-        while ( (int64_t) rem < 0 ) {
-            --zSig;
-            rem += ( ( (uint64_t) zSig )<<1 ) | 1;
-        }
-        zSig |= ( rem != 0 );
-    }
-    shift32RightJamming( zSig, 1, &zSig );
- roundAndPack:
-    return roundAndPackFloat32(0, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
 | Returns the binary exponential of the single-precision floating-point value
 | `a'. The operation is performed according to the IEC/IEEE Standard for
 | Binary Floating-Point Arithmetic.
@@ -3091,236 +3748,6 @@ int float32_unordered_quiet(float32 a, float32 b, float_status *status)
     return 0;
 }
 
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 32-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| positive integer is returned.  Otherwise, if the conversion overflows, the
-| largest integer with the same sign as `a' is returned.
-*----------------------------------------------------------------------------*/
-
-int32_t float64_to_int32(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
-    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
-    shiftCount = 0x42C - aExp;
-    if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
-    return roundAndPackInt32(aSign, aSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 32-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.
-| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
-| the conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig, savedASig;
-    int32_t z;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( 0x41E < aExp ) {
-        if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
-        goto invalid;
-    }
-    else if ( aExp < 0x3FF ) {
-        if (aExp || aSig) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-        return 0;
-    }
-    aSig |= LIT64( 0x0010000000000000 );
-    shiftCount = 0x433 - aExp;
-    savedASig = aSig;
-    aSig >>= shiftCount;
-    z = aSig;
-    if ( aSign ) z = - z;
-    if ( ( z < 0 ) ^ aSign ) {
- invalid:
-        float_raise(float_flag_invalid, status);
-        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
-    }
-    if ( ( aSig<<shiftCount ) != savedASig ) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    return z;
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 16-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.
-| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
-| the conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig, savedASig;
-    int32_t z;
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( 0x40E < aExp ) {
-        if ( ( aExp == 0x7FF ) && aSig ) {
-            aSign = 0;
-        }
-        goto invalid;
-    }
-    else if ( aExp < 0x3FF ) {
-        if ( aExp || aSig ) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-        return 0;
-    }
-    aSig |= LIT64( 0x0010000000000000 );
-    shiftCount = 0x433 - aExp;
-    savedASig = aSig;
-    aSig >>= shiftCount;
-    z = aSig;
-    if ( aSign ) {
-        z = - z;
-    }
-    if ( ( (int16_t)z < 0 ) ^ aSign ) {
- invalid:
-        float_raise(float_flag_invalid, status);
-        return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
-    }
-    if ( ( aSig<<shiftCount ) != savedASig ) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    return z;
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 64-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| positive integer is returned.  Otherwise, if the conversion overflows, the
-| largest integer with the same sign as `a' is returned.
-*----------------------------------------------------------------------------*/
-
-int64_t float64_to_int64(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig, aSigExtra;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
-    shiftCount = 0x433 - aExp;
-    if ( shiftCount <= 0 ) {
-        if ( 0x43E < aExp ) {
-            float_raise(float_flag_invalid, status);
-            if (    ! aSign
-                 || (    ( aExp == 0x7FF )
-                      && ( aSig != LIT64( 0x0010000000000000 ) ) )
-               ) {
-                return LIT64( 0x7FFFFFFFFFFFFFFF );
-            }
-            return (int64_t) LIT64( 0x8000000000000000 );
-        }
-        aSigExtra = 0;
-        aSig <<= - shiftCount;
-    }
-    else {
-        shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
-    }
-    return roundAndPackInt64(aSign, aSig, aSigExtra, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 64-bit two's complement integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic, except that the conversion is always rounded toward zero.
-| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
-| the conversion overflows, the largest integer with the same sign as `a' is
-| returned.
-*----------------------------------------------------------------------------*/
-
-int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig;
-    int64_t z;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
-    shiftCount = aExp - 0x433;
-    if ( 0 <= shiftCount ) {
-        if ( 0x43E <= aExp ) {
-            if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
-                float_raise(float_flag_invalid, status);
-                if (    ! aSign
-                     || (    ( aExp == 0x7FF )
-                          && ( aSig != LIT64( 0x0010000000000000 ) ) )
-                   ) {
-                    return LIT64( 0x7FFFFFFFFFFFFFFF );
-                }
-            }
-            return (int64_t) LIT64( 0x8000000000000000 );
-        }
-        z = aSig<<shiftCount;
-    }
-    else {
-        if ( aExp < 0x3FE ) {
-            if (aExp | aSig) {
-                status->float_exception_flags |= float_flag_inexact;
-            }
-            return 0;
-        }
-        z = aSig>>( - shiftCount );
-        if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
-            status->float_exception_flags |= float_flag_inexact;
-        }
-    }
-    if ( aSign ) z = - z;
-    return z;
-
-}
 
 /*----------------------------------------------------------------------------
 | Returns the result of converting the double-precision floating-point value
@@ -3488,6 +3915,21 @@ static float16 roundAndPackFloat16(flag zSign, int zExp,
     return packFloat16(zSign, zExp, zSig >> 13);
 }
 
+/*----------------------------------------------------------------------------
+| If `a' is denormal and we are in flush-to-zero mode then set the
+| input-denormal exception and return zero. Otherwise just return the value.
+*----------------------------------------------------------------------------*/
+float16 float16_squash_input_denormal(float16 a, float_status *status)
+{
+    if (status->flush_inputs_to_zero) {
+        if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
+            float_raise(float_flag_input_denormal, status);
+            return make_float16(float16_val(a) & 0x8000);
+        }
+    }
+    return a;
+}
+
 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                       uint32_t *zSigPtr)
 {
@@ -3711,453 +4153,6 @@ float128 float64_to_float128(float64 a, float_status *status)
 
 }
 
-/*----------------------------------------------------------------------------
-| Rounds the double-precision floating-point value `a' to an integer, and
-| returns the result as a double-precision floating-point value.  The
-| operation is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_round_to_int(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    uint64_t lastBitMask, roundBitsMask;
-    uint64_t z;
-    a = float64_squash_input_denormal(a, status);
-
-    aExp = extractFloat64Exp( a );
-    if ( 0x433 <= aExp ) {
-        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
-            return propagateFloat64NaN(a, a, status);
-        }
-        return a;
-    }
-    if ( aExp < 0x3FF ) {
-        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
-        status->float_exception_flags |= float_flag_inexact;
-        aSign = extractFloat64Sign( a );
-        switch (status->float_rounding_mode) {
-         case float_round_nearest_even:
-            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
-                return packFloat64( aSign, 0x3FF, 0 );
-            }
-            break;
-        case float_round_ties_away:
-            if (aExp == 0x3FE) {
-                return packFloat64(aSign, 0x3ff, 0);
-            }
-            break;
-         case float_round_down:
-            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
-         case float_round_up:
-            return make_float64(
-            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
-        }
-        return packFloat64( aSign, 0, 0 );
-    }
-    lastBitMask = 1;
-    lastBitMask <<= 0x433 - aExp;
-    roundBitsMask = lastBitMask - 1;
-    z = float64_val(a);
-    switch (status->float_rounding_mode) {
-    case float_round_nearest_even:
-        z += lastBitMask >> 1;
-        if ((z & roundBitsMask) == 0) {
-            z &= ~lastBitMask;
-        }
-        break;
-    case float_round_ties_away:
-        z += lastBitMask >> 1;
-        break;
-    case float_round_to_zero:
-        break;
-    case float_round_up:
-        if (!extractFloat64Sign(make_float64(z))) {
-            z += roundBitsMask;
-        }
-        break;
-    case float_round_down:
-        if (extractFloat64Sign(make_float64(z))) {
-            z += roundBitsMask;
-        }
-        break;
-    default:
-        abort();
-    }
-    z &= ~ roundBitsMask;
-    if (z != float64_val(a)) {
-        status->float_exception_flags |= float_flag_inexact;
-    }
-    return make_float64(z);
-
-}
-
-float64 float64_trunc_to_int(float64 a, float_status *status)
-{
-    int oldmode;
-    float64 res;
-    oldmode = status->float_rounding_mode;
-    status->float_rounding_mode = float_round_to_zero;
-    res = float64_round_to_int(a, status);
-    status->float_rounding_mode = oldmode;
-    return res;
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of adding the absolute values of the double-precision
-| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
-| before being returned.  `zSign' is ignored if the result is a NaN.
-| The addition is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
-                              float_status *status)
-{
-    int aExp, bExp, zExp;
-    uint64_t aSig, bSig, zSig;
-    int expDiff;
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    bSig = extractFloat64Frac( b );
-    bExp = extractFloat64Exp( b );
-    expDiff = aExp - bExp;
-    aSig <<= 9;
-    bSig <<= 9;
-    if ( 0 < expDiff ) {
-        if ( aExp == 0x7FF ) {
-            if (aSig) {
-                return propagateFloat64NaN(a, b, status);
-            }
-            return a;
-        }
-        if ( bExp == 0 ) {
-            --expDiff;
-        }
-        else {
-            bSig |= LIT64( 0x2000000000000000 );
-        }
-        shift64RightJamming( bSig, expDiff, &bSig );
-        zExp = aExp;
-    }
-    else if ( expDiff < 0 ) {
-        if ( bExp == 0x7FF ) {
-            if (bSig) {
-                return propagateFloat64NaN(a, b, status);
-            }
-            return packFloat64( zSign, 0x7FF, 0 );
-        }
-        if ( aExp == 0 ) {
-            ++expDiff;
-        }
-        else {
-            aSig |= LIT64( 0x2000000000000000 );
-        }
-        shift64RightJamming( aSig, - expDiff, &aSig );
-        zExp = bExp;
-    }
-    else {
-        if ( aExp == 0x7FF ) {
-            if (aSig | bSig) {
-                return propagateFloat64NaN(a, b, status);
-            }
-            return a;
-        }
-        if ( aExp == 0 ) {
-            if (status->flush_to_zero) {
-                if (aSig | bSig) {
-                    float_raise(float_flag_output_denormal, status);
-                }
-                return packFloat64(zSign, 0, 0);
-            }
-            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
-        }
-        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
-        zExp = aExp;
-        goto roundAndPack;
-    }
-    aSig |= LIT64( 0x2000000000000000 );
-    zSig = ( aSig + bSig )<<1;
-    --zExp;
-    if ( (int64_t) zSig < 0 ) {
-        zSig = aSig + bSig;
-        ++zExp;
-    }
- roundAndPack:
-    return roundAndPackFloat64(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of subtracting the absolute values of the double-
-| precision floating-point values `a' and `b'.  If `zSign' is 1, the
-| difference is negated before being returned.  `zSign' is ignored if the
-| result is a NaN.  The subtraction is performed according to the IEC/IEEE
-| Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
-                              float_status *status)
-{
-    int aExp, bExp, zExp;
-    uint64_t aSig, bSig, zSig;
-    int expDiff;
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    bSig = extractFloat64Frac( b );
-    bExp = extractFloat64Exp( b );
-    expDiff = aExp - bExp;
-    aSig <<= 10;
-    bSig <<= 10;
-    if ( 0 < expDiff ) goto aExpBigger;
-    if ( expDiff < 0 ) goto bExpBigger;
-    if ( aExp == 0x7FF ) {
-        if (aSig | bSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        float_raise(float_flag_invalid, status);
-        return float64_default_nan(status);
-    }
-    if ( aExp == 0 ) {
-        aExp = 1;
-        bExp = 1;
-    }
-    if ( bSig < aSig ) goto aBigger;
-    if ( aSig < bSig ) goto bBigger;
-    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
- bExpBigger:
-    if ( bExp == 0x7FF ) {
-        if (bSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        return packFloat64( zSign ^ 1, 0x7FF, 0 );
-    }
-    if ( aExp == 0 ) {
-        ++expDiff;
-    }
-    else {
-        aSig |= LIT64( 0x4000000000000000 );
-    }
-    shift64RightJamming( aSig, - expDiff, &aSig );
-    bSig |= LIT64( 0x4000000000000000 );
- bBigger:
-    zSig = bSig - aSig;
-    zExp = bExp;
-    zSign ^= 1;
-    goto normalizeRoundAndPack;
- aExpBigger:
-    if ( aExp == 0x7FF ) {
-        if (aSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        return a;
-    }
-    if ( bExp == 0 ) {
-        --expDiff;
-    }
-    else {
-        bSig |= LIT64( 0x4000000000000000 );
-    }
-    shift64RightJamming( bSig, expDiff, &bSig );
-    aSig |= LIT64( 0x4000000000000000 );
- aBigger:
-    zSig = aSig - bSig;
-    zExp = aExp;
- normalizeRoundAndPack:
-    --zExp;
-    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of adding the double-precision floating-point values `a'
-| and `b'.  The operation is performed according to the IEC/IEEE Standard for
-| Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_add(float64 a, float64 b, float_status *status)
-{
-    flag aSign, bSign;
-    a = float64_squash_input_denormal(a, status);
-    b = float64_squash_input_denormal(b, status);
-
-    aSign = extractFloat64Sign( a );
-    bSign = extractFloat64Sign( b );
-    if ( aSign == bSign ) {
-        return addFloat64Sigs(a, b, aSign, status);
-    }
-    else {
-        return subFloat64Sigs(a, b, aSign, status);
-    }
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of subtracting the double-precision floating-point values
-| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
-| for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_sub(float64 a, float64 b, float_status *status)
-{
-    flag aSign, bSign;
-    a = float64_squash_input_denormal(a, status);
-    b = float64_squash_input_denormal(b, status);
-
-    aSign = extractFloat64Sign( a );
-    bSign = extractFloat64Sign( b );
-    if ( aSign == bSign ) {
-        return subFloat64Sigs(a, b, aSign, status);
-    }
-    else {
-        return addFloat64Sigs(a, b, aSign, status);
-    }
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of multiplying the double-precision floating-point values
-| `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
-| for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_mul(float64 a, float64 b, float_status *status)
-{
-    flag aSign, bSign, zSign;
-    int aExp, bExp, zExp;
-    uint64_t aSig, bSig, zSig0, zSig1;
-
-    a = float64_squash_input_denormal(a, status);
-    b = float64_squash_input_denormal(b, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    bSig = extractFloat64Frac( b );
-    bExp = extractFloat64Exp( b );
-    bSign = extractFloat64Sign( b );
-    zSign = aSign ^ bSign;
-    if ( aExp == 0x7FF ) {
-        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        if ( ( bExp | bSig ) == 0 ) {
-            float_raise(float_flag_invalid, status);
-            return float64_default_nan(status);
-        }
-        return packFloat64( zSign, 0x7FF, 0 );
-    }
-    if ( bExp == 0x7FF ) {
-        if (bSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        if ( ( aExp | aSig ) == 0 ) {
-            float_raise(float_flag_invalid, status);
-            return float64_default_nan(status);
-        }
-        return packFloat64( zSign, 0x7FF, 0 );
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
-        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
-    }
-    if ( bExp == 0 ) {
-        if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
-        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
-    }
-    zExp = aExp + bExp - 0x3FF;
-    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
-    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
-    mul64To128( aSig, bSig, &zSig0, &zSig1 );
-    zSig0 |= ( zSig1 != 0 );
-    if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
-        zSig0 <<= 1;
-        --zExp;
-    }
-    return roundAndPackFloat64(zSign, zExp, zSig0, status);
-
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of dividing the double-precision floating-point value `a'
-| by the corresponding value `b'.  The operation is performed according to
-| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_div(float64 a, float64 b, float_status *status)
-{
-    flag aSign, bSign, zSign;
-    int aExp, bExp, zExp;
-    uint64_t aSig, bSig, zSig;
-    uint64_t rem0, rem1;
-    uint64_t term0, term1;
-    a = float64_squash_input_denormal(a, status);
-    b = float64_squash_input_denormal(b, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    bSig = extractFloat64Frac( b );
-    bExp = extractFloat64Exp( b );
-    bSign = extractFloat64Sign( b );
-    zSign = aSign ^ bSign;
-    if ( aExp == 0x7FF ) {
-        if (aSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        if ( bExp == 0x7FF ) {
-            if (bSig) {
-                return propagateFloat64NaN(a, b, status);
-            }
-            float_raise(float_flag_invalid, status);
-            return float64_default_nan(status);
-        }
-        return packFloat64( zSign, 0x7FF, 0 );
-    }
-    if ( bExp == 0x7FF ) {
-        if (bSig) {
-            return propagateFloat64NaN(a, b, status);
-        }
-        return packFloat64( zSign, 0, 0 );
-    }
-    if ( bExp == 0 ) {
-        if ( bSig == 0 ) {
-            if ( ( aExp | aSig ) == 0 ) {
-                float_raise(float_flag_invalid, status);
-                return float64_default_nan(status);
-            }
-            float_raise(float_flag_divbyzero, status);
-            return packFloat64( zSign, 0x7FF, 0 );
-        }
-        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
-        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
-    }
-    zExp = aExp - bExp + 0x3FD;
-    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
-    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
-    if ( bSig <= ( aSig + aSig ) ) {
-        aSig >>= 1;
-        ++zExp;
-    }
-    zSig = estimateDiv128To64( aSig, 0, bSig );
-    if ( ( zSig & 0x1FF ) <= 2 ) {
-        mul64To128( bSig, zSig, &term0, &term1 );
-        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
-        while ( (int64_t) rem0 < 0 ) {
-            --zSig;
-            add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
-        }
-        zSig |= ( rem1 != 0 );
-    }
-    return roundAndPackFloat64(zSign, zExp, zSig, status);
-
-}
 
 /*----------------------------------------------------------------------------
 | Returns the remainder of the double-precision floating-point value `a'
@@ -4248,307 +4243,6 @@ float64 float64_rem(float64 a, float64 b, float_status *status)
 }
 
 /*----------------------------------------------------------------------------
-| Returns the result of multiplying the double-precision floating-point values
-| `a' and `b' then adding 'c', with no intermediate rounding step after the
-| multiplication.  The operation is performed according to the IEC/IEEE
-| Standard for Binary Floating-Point Arithmetic 754-2008.
-| The flags argument allows the caller to select negation of the
-| addend, the intermediate product, or the final result. (The difference
-| between this and having the caller do a separate negation is that negating
-| externally will flip the sign bit on NaNs.)
-*----------------------------------------------------------------------------*/
-
-float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
-                       float_status *status)
-{
-    flag aSign, bSign, cSign, zSign;
-    int aExp, bExp, cExp, pExp, zExp, expDiff;
-    uint64_t aSig, bSig, cSig;
-    flag pInf, pZero, pSign;
-    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
-    int shiftcount;
-    flag signflip, infzero;
-
-    a = float64_squash_input_denormal(a, status);
-    b = float64_squash_input_denormal(b, status);
-    c = float64_squash_input_denormal(c, status);
-    aSig = extractFloat64Frac(a);
-    aExp = extractFloat64Exp(a);
-    aSign = extractFloat64Sign(a);
-    bSig = extractFloat64Frac(b);
-    bExp = extractFloat64Exp(b);
-    bSign = extractFloat64Sign(b);
-    cSig = extractFloat64Frac(c);
-    cExp = extractFloat64Exp(c);
-    cSign = extractFloat64Sign(c);
-
-    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
-               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
-
-    /* It is implementation-defined whether the cases of (0,inf,qnan)
-     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
-     * they return if they do), so we have to hand this information
-     * off to the target-specific pick-a-NaN routine.
-     */
-    if (((aExp == 0x7ff) && aSig) ||
-        ((bExp == 0x7ff) && bSig) ||
-        ((cExp == 0x7ff) && cSig)) {
-        return propagateFloat64MulAddNaN(a, b, c, infzero, status);
-    }
-
-    if (infzero) {
-        float_raise(float_flag_invalid, status);
-        return float64_default_nan(status);
-    }
-
-    if (flags & float_muladd_negate_c) {
-        cSign ^= 1;
-    }
-
-    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
-
-    /* Work out the sign and type of the product */
-    pSign = aSign ^ bSign;
-    if (flags & float_muladd_negate_product) {
-        pSign ^= 1;
-    }
-    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
-    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
-
-    if (cExp == 0x7ff) {
-        if (pInf && (pSign ^ cSign)) {
-            /* addition of opposite-signed infinities => InvalidOperation */
-            float_raise(float_flag_invalid, status);
-            return float64_default_nan(status);
-        }
-        /* Otherwise generate an infinity of the same sign */
-        return packFloat64(cSign ^ signflip, 0x7ff, 0);
-    }
-
-    if (pInf) {
-        return packFloat64(pSign ^ signflip, 0x7ff, 0);
-    }
-
-    if (pZero) {
-        if (cExp == 0) {
-            if (cSig == 0) {
-                /* Adding two exact zeroes */
-                if (pSign == cSign) {
-                    zSign = pSign;
-                } else if (status->float_rounding_mode == float_round_down) {
-                    zSign = 1;
-                } else {
-                    zSign = 0;
-                }
-                return packFloat64(zSign ^ signflip, 0, 0);
-            }
-            /* Exact zero plus a denorm */
-            if (status->flush_to_zero) {
-                float_raise(float_flag_output_denormal, status);
-                return packFloat64(cSign ^ signflip, 0, 0);
-            }
-        }
-        /* Zero plus something non-zero : just return the something */
-        if (flags & float_muladd_halve_result) {
-            if (cExp == 0) {
-                normalizeFloat64Subnormal(cSig, &cExp, &cSig);
-            }
-            /* Subtract one to halve, and one again because roundAndPackFloat64
-             * wants one less than the true exponent.
-             */
-            cExp -= 2;
-            cSig = (cSig | 0x0010000000000000ULL) << 10;
-            return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
-        }
-        return packFloat64(cSign ^ signflip, cExp, cSig);
-    }
-
-    if (aExp == 0) {
-        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
-    }
-    if (bExp == 0) {
-        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
-    }
-
-    /* Calculate the actual result a * b + c */
-
-    /* Multiply first; this is easy. */
-    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
-     * because we want the true exponent, not the "one-less-than"
-     * flavour that roundAndPackFloat64() takes.
-     */
-    pExp = aExp + bExp - 0x3fe;
-    aSig = (aSig | LIT64(0x0010000000000000))<<10;
-    bSig = (bSig | LIT64(0x0010000000000000))<<11;
-    mul64To128(aSig, bSig, &pSig0, &pSig1);
-    if ((int64_t)(pSig0 << 1) >= 0) {
-        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
-        pExp--;
-    }
-
-    zSign = pSign ^ signflip;
-
-    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
-     * bit in position 126.
-     */
-    if (cExp == 0) {
-        if (!cSig) {
-            /* Throw out the special case of c being an exact zero now */
-            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
-            if (flags & float_muladd_halve_result) {
-                pExp--;
-            }
-            return roundAndPackFloat64(zSign, pExp - 1,
-                                       pSig1, status);
-        }
-        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
-    }
-
-    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
-     * significand of the addend, with the explicit bit in position 126.
-     */
-    cSig0 = cSig << (126 - 64 - 52);
-    cSig1 = 0;
-    cSig0 |= LIT64(0x4000000000000000);
-    expDiff = pExp - cExp;
-
-    if (pSign == cSign) {
-        /* Addition */
-        if (expDiff > 0) {
-            /* scale c to match p */
-            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
-            zExp = pExp;
-        } else if (expDiff < 0) {
-            /* scale p to match c */
-            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
-            zExp = cExp;
-        } else {
-            /* no scaling needed */
-            zExp = cExp;
-        }
-        /* Add significands and make sure explicit bit ends up in posn 126 */
-        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
-        if ((int64_t)zSig0 < 0) {
-            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
-        } else {
-            zExp--;
-        }
-        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
-        if (flags & float_muladd_halve_result) {
-            zExp--;
-        }
-        return roundAndPackFloat64(zSign, zExp, zSig1, status);
-    } else {
-        /* Subtraction */
-        if (expDiff > 0) {
-            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
-            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
-            zExp = pExp;
-        } else if (expDiff < 0) {
-            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
-            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
-            zExp = cExp;
-            zSign ^= 1;
-        } else {
-            zExp = pExp;
-            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
-                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
-            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
-                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
-                zSign ^= 1;
-            } else {
-                /* Exact zero */
-                zSign = signflip;
-                if (status->float_rounding_mode == float_round_down) {
-                    zSign ^= 1;
-                }
-                return packFloat64(zSign, 0, 0);
-            }
-        }
-        --zExp;
-        /* Do the equivalent of normalizeRoundAndPackFloat64() but
-         * starting with the significand in a pair of uint64_t.
-         */
-        if (zSig0) {
-            shiftcount = countLeadingZeros64(zSig0) - 1;
-            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
-            if (zSig1) {
-                zSig0 |= 1;
-            }
-            zExp -= shiftcount;
-        } else {
-            shiftcount = countLeadingZeros64(zSig1);
-            if (shiftcount == 0) {
-                zSig0 = (zSig1 >> 1) | (zSig1 & 1);
-                zExp -= 63;
-            } else {
-                shiftcount--;
-                zSig0 = zSig1 << shiftcount;
-                zExp -= (shiftcount + 64);
-            }
-        }
-        if (flags & float_muladd_halve_result) {
-            zExp--;
-        }
-        return roundAndPackFloat64(zSign, zExp, zSig0, status);
-    }
-}
-
-/*----------------------------------------------------------------------------
-| Returns the square root of the double-precision floating-point value `a'.
-| The operation is performed according to the IEC/IEEE Standard for Binary
-| Floating-Point Arithmetic.
-*----------------------------------------------------------------------------*/
-
-float64 float64_sqrt(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp, zExp;
-    uint64_t aSig, zSig, doubleZSig;
-    uint64_t rem0, rem1, term0, term1;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-    if ( aExp == 0x7FF ) {
-        if (aSig) {
-            return propagateFloat64NaN(a, a, status);
-        }
-        if ( ! aSign ) return a;
-        float_raise(float_flag_invalid, status);
-        return float64_default_nan(status);
-    }
-    if ( aSign ) {
-        if ( ( aExp | aSig ) == 0 ) return a;
-        float_raise(float_flag_invalid, status);
-        return float64_default_nan(status);
-    }
-    if ( aExp == 0 ) {
-        if ( aSig == 0 ) return float64_zero;
-        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
-    }
-    zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
-    aSig |= LIT64( 0x0010000000000000 );
-    zSig = estimateSqrt32( aExp, aSig>>21 );
-    aSig <<= 9 - ( aExp & 1 );
-    zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
-    if ( ( zSig & 0x1FF ) <= 5 ) {
-        doubleZSig = zSig<<1;
-        mul64To128( zSig, zSig, &term0, &term1 );
-        sub128( aSig, 0, term0, term1, &rem0, &rem1 );
-        while ( (int64_t) rem0 < 0 ) {
-            --zSig;
-            doubleZSig -= 2;
-            add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
-        }
-        zSig |= ( ( rem0 | rem1 ) != 0 );
-    }
-    return roundAndPackFloat64(0, zExp, zSig, status);
-
-}
-
-/*----------------------------------------------------------------------------
 | Returns the binary log of the double-precision floating-point value `a'.
 | The operation is performed according to the IEC/IEEE Standard for Binary
 | Floating-Point Arithmetic.
@@ -7255,318 +6949,6 @@ int float128_unordered_quiet(float128 a, float128 b, float_status *status)
     return 0;
 }
 
-/* misc functions */
-float32 uint32_to_float32(uint32_t a, float_status *status)
-{
-    return int64_to_float32(a, status);
-}
-
-float64 uint32_to_float64(uint32_t a, float_status *status)
-{
-    return int64_to_float64(a, status);
-}
-
-uint32_t float32_to_uint32(float32 a, float_status *status)
-{
-    int64_t v;
-    uint32_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float32_to_int64(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffffffff) {
-        res = 0xffffffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
-{
-    int64_t v;
-    uint32_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float32_to_int64_round_to_zero(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffffffff) {
-        res = 0xffffffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-int16_t float32_to_int16(float32 a, float_status *status)
-{
-    int32_t v;
-    int16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float32_to_int32(a, status);
-    if (v < -0x8000) {
-        res = -0x8000;
-    } else if (v > 0x7fff) {
-        res = 0x7fff;
-    } else {
-        return v;
-    }
-
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint16_t float32_to_uint16(float32 a, float_status *status)
-{
-    int32_t v;
-    uint16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float32_to_int32(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffff) {
-        res = 0xffff;
-    } else {
-        return v;
-    }
-
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
-{
-    int64_t v;
-    uint16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float32_to_int64_round_to_zero(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffff) {
-        res = 0xffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint32_t float64_to_uint32(float64 a, float_status *status)
-{
-    uint64_t v;
-    uint32_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float64_to_uint64(a, status);
-    if (v > 0xffffffff) {
-        res = 0xffffffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
-{
-    uint64_t v;
-    uint32_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float64_to_uint64_round_to_zero(a, status);
-    if (v > 0xffffffff) {
-        res = 0xffffffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-int16_t float64_to_int16(float64 a, float_status *status)
-{
-    int64_t v;
-    int16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float64_to_int32(a, status);
-    if (v < -0x8000) {
-        res = -0x8000;
-    } else if (v > 0x7fff) {
-        res = 0x7fff;
-    } else {
-        return v;
-    }
-
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint16_t float64_to_uint16(float64 a, float_status *status)
-{
-    int64_t v;
-    uint16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float64_to_int32(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffff) {
-        res = 0xffff;
-    } else {
-        return v;
-    }
-
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
-{
-    int64_t v;
-    uint16_t res;
-    int old_exc_flags = get_float_exception_flags(status);
-
-    v = float64_to_int64_round_to_zero(a, status);
-    if (v < 0) {
-        res = 0;
-    } else if (v > 0xffff) {
-        res = 0xffff;
-    } else {
-        return v;
-    }
-    set_float_exception_flags(old_exc_flags, status);
-    float_raise(float_flag_invalid, status);
-    return res;
-}
-
-/*----------------------------------------------------------------------------
-| Returns the result of converting the double-precision floating-point value
-| `a' to the 64-bit unsigned integer format.  The conversion is
-| performed according to the IEC/IEEE Standard for Binary Floating-Point
-| Arithmetic---which means in particular that the conversion is rounded
-| according to the current rounding mode.  If `a' is a NaN, the largest
-| positive integer is returned.  If the conversion overflows, the
-| largest unsigned integer is returned.  If 'a' is negative, the value is
-| rounded and zero is returned; negative values that do not round to zero
-| will raise the inexact exception.
-*----------------------------------------------------------------------------*/
-
-uint64_t float64_to_uint64(float64 a, float_status *status)
-{
-    flag aSign;
-    int aExp;
-    int shiftCount;
-    uint64_t aSig, aSigExtra;
-    a = float64_squash_input_denormal(a, status);
-
-    aSig = extractFloat64Frac(a);
-    aExp = extractFloat64Exp(a);
-    aSign = extractFloat64Sign(a);
-    if (aSign && (aExp > 1022)) {
-        float_raise(float_flag_invalid, status);
-        if (float64_is_any_nan(a)) {
-            return LIT64(0xFFFFFFFFFFFFFFFF);
-        } else {
-            return 0;
-        }
-    }
-    if (aExp) {
-        aSig |= LIT64(0x0010000000000000);
-    }
-    shiftCount = 0x433 - aExp;
-    if (shiftCount <= 0) {
-        if (0x43E < aExp) {
-            float_raise(float_flag_invalid, status);
-            return LIT64(0xFFFFFFFFFFFFFFFF);
-        }
-        aSigExtra = 0;
-        aSig <<= -shiftCount;
-    } else {
-        shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
-    }
-    return roundAndPackUint64(aSign, aSig, aSigExtra, status);
-}
-
-uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
-{
-    signed char current_rounding_mode = status->float_rounding_mode;
-    set_float_rounding_mode(float_round_to_zero, status);
-    uint64_t v = float64_to_uint64(a, status);
-    set_float_rounding_mode(current_rounding_mode, status);
-    return v;
-}
-
-#define COMPARE(s, nan_exp)                                                  \
-static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
-                                      int is_quiet, float_status *status)    \
-{                                                                            \
-    flag aSign, bSign;                                                       \
-    uint ## s ## _t av, bv;                                                  \
-    a = float ## s ## _squash_input_denormal(a, status);                     \
-    b = float ## s ## _squash_input_denormal(b, status);                     \
-                                                                             \
-    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
-         extractFloat ## s ## Frac( a ) ) ||                                 \
-        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
-          extractFloat ## s ## Frac( b ) )) {                                \
-        if (!is_quiet ||                                                     \
-            float ## s ## _is_signaling_nan(a, status) ||                  \
-            float ## s ## _is_signaling_nan(b, status)) {                 \
-            float_raise(float_flag_invalid, status);                         \
-        }                                                                    \
-        return float_relation_unordered;                                     \
-    }                                                                        \
-    aSign = extractFloat ## s ## Sign( a );                                  \
-    bSign = extractFloat ## s ## Sign( b );                                  \
-    av = float ## s ## _val(a);                                              \
-    bv = float ## s ## _val(b);                                              \
-    if ( aSign != bSign ) {                                                  \
-        if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
-            /* zero case */                                                  \
-            return float_relation_equal;                                     \
-        } else {                                                             \
-            return 1 - (2 * aSign);                                          \
-        }                                                                    \
-    } else {                                                                 \
-        if (av == bv) {                                                      \
-            return float_relation_equal;                                     \
-        } else {                                                             \
-            return 1 - 2 * (aSign ^ ( av < bv ));                            \
-        }                                                                    \
-    }                                                                        \
-}                                                                            \
-                                                                             \
-int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
-{                                                                            \
-    return float ## s ## _compare_internal(a, b, 0, status);                 \
-}                                                                            \
-                                                                             \
-int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
-                                 float_status *status)                       \
-{                                                                            \
-    return float ## s ## _compare_internal(a, b, 1, status);                 \
-}
-
-COMPARE(32, 0xff)
-COMPARE(64, 0x7ff)
-
 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
                                             int is_quiet, float_status *status)
 {
@@ -7661,186 +7043,6 @@ int float128_compare_quiet(float128 a, float128 b, float_status *status)
     return float128_compare_internal(a, b, 1, status);
 }
 
-/* min() and max() functions. These can't be implemented as
- * 'compare and pick one input' because that would mishandle
- * NaNs and +0 vs -0.
- *
- * minnum() and maxnum() functions. These are similar to the min()
- * and max() functions but if one of the arguments is a QNaN and
- * the other is numerical then the numerical argument is returned.
- * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
- * and maxNum() operations. min() and max() are the typical min/max
- * semantics provided by many CPUs which predate that specification.
- *
- * minnummag() and maxnummag() functions correspond to minNumMag()
- * and minNumMag() from the IEEE-754 2008.
- */
-#define MINMAX(s)                                                       \
-static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
-                                               int ismin, int isieee,   \
-                                               int ismag,               \
-                                               float_status *status)    \
-{                                                                       \
-    flag aSign, bSign;                                                  \
-    uint ## s ## _t av, bv, aav, abv;                                   \
-    a = float ## s ## _squash_input_denormal(a, status);                \
-    b = float ## s ## _squash_input_denormal(b, status);                \
-    if (float ## s ## _is_any_nan(a) ||                                 \
-        float ## s ## _is_any_nan(b)) {                                 \
-        if (isieee) {                                                   \
-            if (float ## s ## _is_quiet_nan(a, status) &&               \
-                !float ## s ##_is_any_nan(b)) {                         \
-                return b;                                               \
-            } else if (float ## s ## _is_quiet_nan(b, status) &&        \
-                       !float ## s ## _is_any_nan(a)) {                \
-                return a;                                               \
-            }                                                           \
-        }                                                               \
-        return propagateFloat ## s ## NaN(a, b, status);                \
-    }                                                                   \
-    aSign = extractFloat ## s ## Sign(a);                               \
-    bSign = extractFloat ## s ## Sign(b);                               \
-    av = float ## s ## _val(a);                                         \
-    bv = float ## s ## _val(b);                                         \
-    if (ismag) {                                                        \
-        aav = float ## s ## _abs(av);                                   \
-        abv = float ## s ## _abs(bv);                                   \
-        if (aav != abv) {                                               \
-            if (ismin) {                                                \
-                return (aav < abv) ? a : b;                             \
-            } else {                                                    \
-                return (aav < abv) ? b : a;                             \
-            }                                                           \
-        }                                                               \
-    }                                                                   \
-    if (aSign != bSign) {                                               \
-        if (ismin) {                                                    \
-            return aSign ? a : b;                                       \
-        } else {                                                        \
-            return aSign ? b : a;                                       \
-        }                                                               \
-    } else {                                                            \
-        if (ismin) {                                                    \
-            return (aSign ^ (av < bv)) ? a : b;                         \
-        } else {                                                        \
-            return (aSign ^ (av < bv)) ? b : a;                         \
-        }                                                               \
-    }                                                                   \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _min(float ## s a, float ## s b,               \
-                              float_status *status)                     \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _max(float ## s a, float ## s b,               \
-                              float_status *status)                     \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
-                                 float_status *status)                  \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
-                                 float_status *status)                  \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
-                                    float_status *status)               \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
-}                                                                       \
-                                                                        \
-float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
-                                    float_status *status)               \
-{                                                                       \
-    return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
-}
-
-MINMAX(32)
-MINMAX(64)
-
-
-/* Multiply A by 2 raised to the power N.  */
-float32 float32_scalbn(float32 a, int n, float_status *status)
-{
-    flag aSign;
-    int16_t aExp;
-    uint32_t aSig;
-
-    a = float32_squash_input_denormal(a, status);
-    aSig = extractFloat32Frac( a );
-    aExp = extractFloat32Exp( a );
-    aSign = extractFloat32Sign( a );
-
-    if ( aExp == 0xFF ) {
-        if ( aSig ) {
-            return propagateFloat32NaN(a, a, status);
-        }
-        return a;
-    }
-    if (aExp != 0) {
-        aSig |= 0x00800000;
-    } else if (aSig == 0) {
-        return a;
-    } else {
-        aExp++;
-    }
-
-    if (n > 0x200) {
-        n = 0x200;
-    } else if (n < -0x200) {
-        n = -0x200;
-    }
-
-    aExp += n - 1;
-    aSig <<= 7;
-    return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
-}
-
-float64 float64_scalbn(float64 a, int n, float_status *status)
-{
-    flag aSign;
-    int16_t aExp;
-    uint64_t aSig;
-
-    a = float64_squash_input_denormal(a, status);
-    aSig = extractFloat64Frac( a );
-    aExp = extractFloat64Exp( a );
-    aSign = extractFloat64Sign( a );
-
-    if ( aExp == 0x7FF ) {
-        if ( aSig ) {
-            return propagateFloat64NaN(a, a, status);
-        }
-        return a;
-    }
-    if (aExp != 0) {
-        aSig |= LIT64( 0x0010000000000000 );
-    } else if (aSig == 0) {
-        return a;
-    } else {
-        aExp++;
-    }
-
-    if (n > 0x1000) {
-        n = 0x1000;
-    } else if (n < -0x1000) {
-        n = -0x1000;
-    }
-
-    aExp += n - 1;
-    aSig <<= 10;
-    return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
-}
-
 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
 {
     flag aSign;
diff --git a/hw/arm/raspi.c b/hw/arm/raspi.c
index 93121c56bf..a37881433c 100644
--- a/hw/arm/raspi.c
+++ b/hw/arm/raspi.c
@@ -187,3 +187,26 @@ static void raspi2_machine_init(MachineClass *mc)
     mc->ignore_memory_transaction_failures = true;
 };
 DEFINE_MACHINE("raspi2", raspi2_machine_init)
+
+#ifdef TARGET_AARCH64
+static void raspi3_init(MachineState *machine)
+{
+    raspi_init(machine, 3);
+}
+
+static void raspi3_machine_init(MachineClass *mc)
+{
+    mc->desc = "Raspberry Pi 3";
+    mc->init = raspi3_init;
+    mc->block_default_type = IF_SD;
+    mc->no_parallel = 1;
+    mc->no_floppy = 1;
+    mc->no_cdrom = 1;
+    mc->default_cpu_type = ARM_CPU_TYPE_NAME("cortex-a53");
+    mc->max_cpus = BCM2836_NCPUS;
+    mc->min_cpus = BCM2836_NCPUS;
+    mc->default_cpus = BCM2836_NCPUS;
+    mc->default_ram_size = 1024 * 1024 * 1024;
+}
+DEFINE_MACHINE("raspi3", raspi3_machine_init)
+#endif
diff --git a/hw/char/stm32f2xx_usart.c b/hw/char/stm32f2xx_usart.c
index 07b462d4b6..032b5fda13 100644
--- a/hw/char/stm32f2xx_usart.c
+++ b/hw/char/stm32f2xx_usart.c
@@ -96,12 +96,10 @@ static uint64_t stm32f2xx_usart_read(void *opaque, hwaddr addr,
     switch (addr) {
     case USART_SR:
         retvalue = s->usart_sr;
-        s->usart_sr &= ~USART_SR_TC;
         qemu_chr_fe_accept_input(&s->chr);
         return retvalue;
     case USART_DR:
         DB_PRINT("Value: 0x%" PRIx32 ", %c\n", s->usart_dr, (char) s->usart_dr);
-        s->usart_sr |= USART_SR_TXE;
         s->usart_sr &= ~USART_SR_RXNE;
         qemu_chr_fe_accept_input(&s->chr);
         qemu_set_irq(s->irq, 0);
@@ -137,7 +135,9 @@ static void stm32f2xx_usart_write(void *opaque, hwaddr addr,
     switch (addr) {
     case USART_SR:
         if (value <= 0x3FF) {
-            s->usart_sr = value;
+            /* I/O being synchronous, TXE is always set. In addition, it may
+               only be set by hardware, so keep it set here. */
+            s->usart_sr = value | USART_SR_TXE;
         } else {
             s->usart_sr &= value;
         }
@@ -151,8 +151,12 @@ static void stm32f2xx_usart_write(void *opaque, hwaddr addr,
             /* XXX this blocks entire thread. Rewrite to use
              * qemu_chr_fe_write and background I/O callbacks */
             qemu_chr_fe_write_all(&s->chr, &ch, 1);
+            /* XXX I/O are currently synchronous, making it impossible for
+               software to observe transient states where TXE or TC aren't
+               set. Unlike TXE however, which is read-only, software may
+               clear TC by writing 0 to the SR register, so set it again
+               on each write. */
             s->usart_sr |= USART_SR_TC;
-            s->usart_sr &= ~USART_SR_TXE;
         }
         return;
     case USART_BRR:
diff --git a/hw/display/virtio-gpu-3d.c b/hw/display/virtio-gpu-3d.c
index 7db84efe89..3558f38fe8 100644
--- a/hw/display/virtio-gpu-3d.c
+++ b/hw/display/virtio-gpu-3d.c
@@ -362,6 +362,11 @@ static void virgl_cmd_get_capset_info(VirtIOGPU *g,
         virgl_renderer_get_cap_set(resp.capset_id,
                                    &resp.capset_max_version,
                                    &resp.capset_max_size);
+    } else if (info.capset_index == 1) {
+        resp.capset_id = VIRTIO_GPU_CAPSET_VIRGL2;
+        virgl_renderer_get_cap_set(resp.capset_id,
+                                   &resp.capset_max_version,
+                                   &resp.capset_max_size);
     } else {
         resp.capset_max_version = 0;
         resp.capset_max_size = 0;
@@ -635,4 +640,14 @@ int virtio_gpu_virgl_init(VirtIOGPU *g)
     return 0;
 }
 
+int virtio_gpu_virgl_get_num_capsets(VirtIOGPU *g)
+{
+    uint32_t capset2_max_ver, capset2_max_size;
+    virgl_renderer_get_cap_set(VIRTIO_GPU_CAPSET_VIRGL2,
+                              &capset2_max_ver,
+                              &capset2_max_size);
+
+    return capset2_max_ver ? 2 : 1;
+}
+
 #endif /* CONFIG_VIRGL */
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 6658f6c6a6..2dd3c3481a 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -1215,7 +1215,12 @@ static void virtio_gpu_device_realize(DeviceState *qdev, Error **errp)
         /* use larger control queue in 3d mode */
         g->ctrl_vq   = virtio_add_queue(vdev, 256, virtio_gpu_handle_ctrl_cb);
         g->cursor_vq = virtio_add_queue(vdev, 16, virtio_gpu_handle_cursor_cb);
-        g->virtio_config.num_capsets = 1;
+
+#if defined(CONFIG_VIRGL)
+        g->virtio_config.num_capsets = virtio_gpu_virgl_get_num_capsets(g);
+#else
+        g->virtio_config.num_capsets = 0;
+#endif
     } else {
         g->ctrl_vq   = virtio_add_queue(vdev, 64, virtio_gpu_handle_ctrl_cb);
         g->cursor_vq = virtio_add_queue(vdev, 16, virtio_gpu_handle_cursor_cb);
diff --git a/hw/misc/aspeed_scu.c b/hw/misc/aspeed_scu.c
index 74537ce975..5e6d5744ee 100644
--- a/hw/misc/aspeed_scu.c
+++ b/hw/misc/aspeed_scu.c
@@ -191,7 +191,7 @@ static void aspeed_scu_write(void *opaque, hwaddr offset, uint64_t data,
     }
 
     if (reg > PROT_KEY && reg < CPU2_BASE_SEG1 &&
-            s->regs[PROT_KEY] != ASPEED_SCU_PROT_KEY) {
+            !s->regs[PROT_KEY]) {
         qemu_log_mask(LOG_GUEST_ERROR, "%s: SCU is locked!\n", __func__);
         return;
     }
@@ -199,6 +199,10 @@ static void aspeed_scu_write(void *opaque, hwaddr offset, uint64_t data,
     trace_aspeed_scu_write(offset, size, data);
 
     switch (reg) {
+    case PROT_KEY:
+        s->regs[reg] = (data == ASPEED_SCU_PROT_KEY) ? 1 : 0;
+        return;
+
     case FREQ_CNTR_EVAL:
     case VGA_SCRATCH1 ... VGA_SCRATCH8:
     case RNG_DATA:
diff --git a/hw/misc/aspeed_sdmc.c b/hw/misc/aspeed_sdmc.c
index f0b3053fae..0df008e52a 100644
--- a/hw/misc/aspeed_sdmc.c
+++ b/hw/misc/aspeed_sdmc.c
@@ -110,7 +110,12 @@ static void aspeed_sdmc_write(void *opaque, hwaddr addr, uint64_t data,
         return;
     }
 
-    if (addr != R_PROT && s->regs[R_PROT] != PROT_KEY_UNLOCK) {
+    if (addr == R_PROT) {
+        s->regs[addr] = (data == PROT_KEY_UNLOCK) ? 1 : 0;
+        return;
+    }
+
+    if (!s->regs[R_PROT]) {
         qemu_log_mask(LOG_GUEST_ERROR, "%s: SDMC is locked!\n", __func__);
         return;
     }
@@ -123,6 +128,7 @@ static void aspeed_sdmc_write(void *opaque, hwaddr addr, uint64_t data,
             data &= ~ASPEED_SDMC_READONLY_MASK;
             break;
         case AST2500_A0_SILICON_REV:
+        case AST2500_A1_SILICON_REV:
             data &= ~ASPEED_SDMC_AST2500_READONLY_MASK;
             break;
         default:
diff --git a/hw/sd/milkymist-memcard.c b/hw/sd/milkymist-memcard.c
index 341da88552..5570c1e9a0 100644
--- a/hw/sd/milkymist-memcard.c
+++ b/hw/sd/milkymist-memcard.c
@@ -22,11 +22,12 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "hw/hw.h"
 #include "hw/sysbus.h"
 #include "sysemu/sysemu.h"
 #include "trace.h"
-#include "qemu/error-report.h"
+#include "include/qapi/error.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/blockdev.h"
 #include "hw/sd/sd.h"
@@ -68,7 +69,7 @@ struct MilkymistMemcardState {
     SysBusDevice parent_obj;
 
     MemoryRegion regs_region;
-    SDState *card;
+    SDBus sdbus;
 
     int command_write_ptr;
     int response_read_ptr;
@@ -104,7 +105,7 @@ static void memcard_sd_command(MilkymistMemcardState *s)
     req.crc = s->command[5];
 
     s->response[0] = req.cmd;
-    s->response_len = sd_do_command(s->card, &req, s->response+1);
+    s->response_len = sdbus_do_command(&s->sdbus, &req, s->response + 1);
     s->response_read_ptr = 0;
 
     if (s->response_len == 16) {
@@ -138,8 +139,8 @@ static uint64_t memcard_read(void *opaque, hwaddr addr,
         } else {
             r = s->response[s->response_read_ptr++];
             if (s->response_read_ptr > s->response_len) {
-                error_report("milkymist_memcard: "
-                        "read more cmd bytes than available. Clipping.");
+                qemu_log_mask(LOG_GUEST_ERROR, "milkymist_memcard: "
+                              "read more cmd bytes than available. Clipping.");
                 s->response_read_ptr = 0;
             }
         }
@@ -149,10 +150,10 @@ static uint64_t memcard_read(void *opaque, hwaddr addr,
             r = 0xffffffff;
         } else {
             r = 0;
-            r |= sd_read_data(s->card) << 24;
-            r |= sd_read_data(s->card) << 16;
-            r |= sd_read_data(s->card) << 8;
-            r |= sd_read_data(s->card);
+            r |= sdbus_read_data(&s->sdbus) << 24;
+            r |= sdbus_read_data(&s->sdbus) << 16;
+            r |= sdbus_read_data(&s->sdbus) << 8;
+            r |= sdbus_read_data(&s->sdbus);
         }
         break;
     case R_CLK2XDIV:
@@ -163,8 +164,9 @@ static uint64_t memcard_read(void *opaque, hwaddr addr,
         break;
 
     default:
-        error_report("milkymist_memcard: read access to unknown register 0x"
-                TARGET_FMT_plx, addr << 2);
+        qemu_log_mask(LOG_UNIMP, "milkymist_memcard: "
+                      "read access to unknown register 0x%" HWADDR_PRIx "\n",
+                      addr << 2);
         break;
     }
 
@@ -205,10 +207,10 @@ static void memcard_write(void *opaque, hwaddr addr, uint64_t value,
         if (!s->enabled) {
             break;
         }
-        sd_write_data(s->card, (value >> 24) & 0xff);
-        sd_write_data(s->card, (value >> 16) & 0xff);
-        sd_write_data(s->card, (value >> 8) & 0xff);
-        sd_write_data(s->card, value & 0xff);
+        sdbus_write_data(&s->sdbus, (value >> 24) & 0xff);
+        sdbus_write_data(&s->sdbus, (value >> 16) & 0xff);
+        sdbus_write_data(&s->sdbus, (value >> 8) & 0xff);
+        sdbus_write_data(&s->sdbus, value & 0xff);
         break;
     case R_ENABLE:
         s->regs[addr] = value;
@@ -220,8 +222,9 @@ static void memcard_write(void *opaque, hwaddr addr, uint64_t value,
         break;
 
     default:
-        error_report("milkymist_memcard: write access to unknown register 0x"
-                TARGET_FMT_plx, addr << 2);
+        qemu_log_mask(LOG_UNIMP, "milkymist_memcard: "
+                      "write access to unknown register 0x%" HWADDR_PRIx " "
+                      "(value 0x%" PRIx64 ")\n", addr << 2, value);
         break;
     }
 }
@@ -248,33 +251,41 @@ static void milkymist_memcard_reset(DeviceState *d)
     for (i = 0; i < R_MAX; i++) {
         s->regs[i] = 0;
     }
-    /* Since we're still using the legacy SD API the card is not plugged
-     * into any bus, and we must reset it manually.
-     */
-    device_reset(DEVICE(s->card));
 }
 
-static int milkymist_memcard_init(SysBusDevice *dev)
+static void milkymist_memcard_init(Object *obj)
+{
+    MilkymistMemcardState *s = MILKYMIST_MEMCARD(obj);
+    SysBusDevice *dev = SYS_BUS_DEVICE(obj);
+
+    memory_region_init_io(&s->regs_region, OBJECT(s), &memcard_mmio_ops, s,
+            "milkymist-memcard", R_MAX * 4);
+    sysbus_init_mmio(dev, &s->regs_region);
+}
+
+static void milkymist_memcard_realize(DeviceState *dev, Error **errp)
 {
     MilkymistMemcardState *s = MILKYMIST_MEMCARD(dev);
-    DriveInfo *dinfo;
+    DeviceState *carddev;
     BlockBackend *blk;
+    DriveInfo *dinfo;
+    Error *err = NULL;
+
+    qbus_create_inplace(&s->sdbus, sizeof(s->sdbus), TYPE_SD_BUS,
+                        dev, "sd-bus");
 
+    /* Create and plug in the sd card */
     /* FIXME use a qdev drive property instead of drive_get_next() */
     dinfo = drive_get_next(IF_SD);
     blk = dinfo ? blk_by_legacy_dinfo(dinfo) : NULL;
-    s->card = sd_init(blk, false);
-    if (s->card == NULL) {
-        return -1;
+    carddev = qdev_create(&s->sdbus.qbus, TYPE_SD_CARD);
+    qdev_prop_set_drive(carddev, "drive", blk, &err);
+    object_property_set_bool(OBJECT(carddev), true, "realized", &err);
+    if (err) {
+        error_setg(errp, "failed to init SD card: %s", error_get_pretty(err));
+        return;
     }
-
     s->enabled = blk && blk_is_inserted(blk);
-
-    memory_region_init_io(&s->regs_region, OBJECT(s), &memcard_mmio_ops, s,
-            "milkymist-memcard", R_MAX * 4);
-    sysbus_init_mmio(dev, &s->regs_region);
-
-    return 0;
 }
 
 static const VMStateDescription vmstate_milkymist_memcard = {
@@ -297,9 +308,8 @@ static const VMStateDescription vmstate_milkymist_memcard = {
 static void milkymist_memcard_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = milkymist_memcard_init;
+    dc->realize = milkymist_memcard_realize;
     dc->reset = milkymist_memcard_reset;
     dc->vmsd = &vmstate_milkymist_memcard;
     /* Reason: init() method uses drive_get_next() */
@@ -310,6 +320,7 @@ static const TypeInfo milkymist_memcard_info = {
     .name          = TYPE_MILKYMIST_MEMCARD,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(MilkymistMemcardState),
+    .instance_init = milkymist_memcard_init,
     .class_init    = milkymist_memcard_class_init,
 };
 
diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 9ac9b63ff8..933890e86f 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -32,28 +32,21 @@
 #include "qemu/osdep.h"
 #include "hw/qdev.h"
 #include "hw/hw.h"
+#include "hw/registerfields.h"
 #include "sysemu/block-backend.h"
 #include "hw/sd/sd.h"
 #include "qapi/error.h"
 #include "qemu/bitmap.h"
+#include "qemu/cutils.h"
 #include "hw/qdev-properties.h"
 #include "qemu/error-report.h"
 #include "qemu/timer.h"
 #include "qemu/log.h"
+#include "sdmmc-internal.h"
+#include "trace.h"
 
 //#define DEBUG_SD 1
 
-#ifdef DEBUG_SD
-#define DPRINTF(fmt, ...) \
-do { fprintf(stderr, "SD: " fmt , ## __VA_ARGS__); } while (0)
-#else
-#define DPRINTF(fmt, ...) do {} while(0)
-#endif
-
-#define ACMD41_ENQUIRY_MASK     0x00ffffff
-#define OCR_POWER_UP            0x80000000
-#define OCR_POWER_DELAY_NS      500000 /* 0.5ms */
-
 typedef enum {
     sd_r0 = 0,    /* no response */
     sd_r1,        /* normal response command */
@@ -88,16 +81,21 @@ enum SDCardStates {
 struct SDState {
     DeviceState parent_obj;
 
-    uint32_t mode;    /* current card mode, one of SDCardModes */
-    int32_t state;    /* current card state, one of SDCardStates */
+    /* SD Memory Card Registers */
     uint32_t ocr;
-    QEMUTimer *ocr_power_timer;
     uint8_t scr[8];
     uint8_t cid[16];
     uint8_t csd[16];
     uint16_t rca;
     uint32_t card_status;
     uint8_t sd_status[64];
+
+    /* Configurable properties */
+    BlockBackend *blk;
+    bool spi;
+
+    uint32_t mode;    /* current card mode, one of SDCardModes */
+    int32_t state;    /* current card state, one of SDCardStates */
     uint32_t vhs;
     bool wp_switch;
     unsigned long *wp_groups;
@@ -110,8 +108,6 @@ struct SDState {
     uint8_t pwd[16];
     uint32_t pwd_len;
     uint8_t function_group[6];
-
-    bool spi;
     uint8_t current_cmd;
     /* True if we will handle the next command as an ACMD. Note that this does
      * *not* track the APP_CMD status bit!
@@ -123,13 +119,53 @@ struct SDState {
     uint8_t data[512];
     qemu_irq readonly_cb;
     qemu_irq inserted_cb;
-    BlockBackend *blk;
-
+    QEMUTimer *ocr_power_timer;
     bool enable;
     uint8_t dat_lines;
     bool cmd_line;
 };
 
+static const char *sd_state_name(enum SDCardStates state)
+{
+    static const char *state_name[] = {
+        [sd_idle_state]             = "idle",
+        [sd_ready_state]            = "ready",
+        [sd_identification_state]   = "identification",
+        [sd_standby_state]          = "standby",
+        [sd_transfer_state]         = "transfer",
+        [sd_sendingdata_state]      = "sendingdata",
+        [sd_receivingdata_state]    = "receivingdata",
+        [sd_programming_state]      = "programming",
+        [sd_disconnect_state]       = "disconnect",
+    };
+    if (state == sd_inactive_state) {
+        return "inactive";
+    }
+    assert(state <= ARRAY_SIZE(state_name));
+    return state_name[state];
+}
+
+static const char *sd_response_name(sd_rsp_type_t rsp)
+{
+    static const char *response_name[] = {
+        [sd_r0]     = "RESP#0 (no response)",
+        [sd_r1]     = "RESP#1 (normal cmd)",
+        [sd_r2_i]   = "RESP#2 (CID reg)",
+        [sd_r2_s]   = "RESP#2 (CSD reg)",
+        [sd_r3]     = "RESP#3 (OCR reg)",
+        [sd_r6]     = "RESP#6 (RCA)",
+        [sd_r7]     = "RESP#7 (operating voltage)",
+    };
+    if (rsp == sd_illegal) {
+        return "ILLEGAL RESP";
+    }
+    if (rsp == sd_r1b) {
+        rsp = sd_r1;
+    }
+    assert(rsp <= ARRAY_SIZE(response_name));
+    return response_name[rsp];
+}
+
 static uint8_t sd_get_dat_lines(SDState *sd)
 {
     return sd->enable ? sd->dat_lines : 0;
@@ -142,6 +178,8 @@ static bool sd_get_cmd_line(SDState *sd)
 
 static void sd_set_voltage(SDState *sd, uint16_t millivolts)
 {
+    trace_sdcard_set_voltage(millivolts);
+
     switch (millivolts) {
     case 3001 ... 3600: /* SD_VOLTAGE_3_3V */
     case 2001 ... 3000: /* SD_VOLTAGE_3_0V */
@@ -176,18 +214,21 @@ static void sd_set_mode(SDState *sd)
     }
 }
 
-static const sd_cmd_type_t sd_cmd_type[64] = {
+static const sd_cmd_type_t sd_cmd_type[SDMMC_CMD_MAX] = {
     sd_bc,   sd_none, sd_bcr,  sd_bcr,  sd_none, sd_none, sd_none, sd_ac,
     sd_bcr,  sd_ac,   sd_ac,   sd_adtc, sd_ac,   sd_ac,   sd_none, sd_ac,
+    /* 16 */
     sd_ac,   sd_adtc, sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none,
     sd_adtc, sd_adtc, sd_adtc, sd_adtc, sd_ac,   sd_ac,   sd_adtc, sd_none,
+    /* 32 */
     sd_ac,   sd_ac,   sd_none, sd_none, sd_none, sd_none, sd_ac,   sd_none,
     sd_none, sd_none, sd_bc,   sd_none, sd_none, sd_none, sd_none, sd_none,
+    /* 48 */
     sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_ac,
     sd_adtc, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none, sd_none,
 };
 
-static const int sd_cmd_class[64] = {
+static const int sd_cmd_class[SDMMC_CMD_MAX] = {
     0,  0,  0,  0,  0,  9, 10,  0,  0,  0,  0,  1,  0,  0,  0,  0,
     2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  6,  6,  6,  6,
     5,  5, 10, 10, 10, 10,  5,  9,  9,  9,  7,  7,  7,  7,  7,  7,
@@ -227,27 +268,54 @@ static uint16_t sd_crc16(void *message, size_t width)
     return shift_reg;
 }
 
+#define OCR_POWER_DELAY_NS      500000 /* 0.5ms */
+
+FIELD(OCR, VDD_VOLTAGE_WINDOW,          0, 24)
+FIELD(OCR, VDD_VOLTAGE_WIN_LO,          0,  8)
+FIELD(OCR, DUAL_VOLTAGE_CARD,           7,  1)
+FIELD(OCR, VDD_VOLTAGE_WIN_HI,          8, 16)
+FIELD(OCR, ACCEPT_SWITCH_1V8,          24,  1) /* Only UHS-I */
+FIELD(OCR, UHS_II_CARD,                29,  1) /* Only UHS-II */
+FIELD(OCR, CARD_CAPACITY,              30,  1) /* 0:SDSC, 1:SDHC/SDXC */
+FIELD(OCR, CARD_POWER_UP,              31,  1)
+
+#define ACMD41_ENQUIRY_MASK     0x00ffffff
+#define ACMD41_R3_MASK          (R_OCR_VDD_VOLTAGE_WIN_HI_MASK \
+                               | R_OCR_ACCEPT_SWITCH_1V8_MASK \
+                               | R_OCR_UHS_II_CARD_MASK \
+                               | R_OCR_CARD_CAPACITY_MASK \
+                               | R_OCR_CARD_POWER_UP_MASK)
+
 static void sd_set_ocr(SDState *sd)
 {
-    /* All voltages OK, Standard Capacity SD Memory Card, not yet powered up */
-    sd->ocr = 0x00ffff00;
+    /* All voltages OK */
+    sd->ocr = R_OCR_VDD_VOLTAGE_WIN_HI_MASK;
 }
 
 static void sd_ocr_powerup(void *opaque)
 {
     SDState *sd = opaque;
 
-    /* Set powered up bit in OCR */
-    assert(!(sd->ocr & OCR_POWER_UP));
-    sd->ocr |= OCR_POWER_UP;
+    trace_sdcard_powerup();
+    assert(!FIELD_EX32(sd->ocr, OCR, CARD_POWER_UP));
+
+    /* card power-up OK */
+    sd->ocr = FIELD_DP32(sd->ocr, OCR, CARD_POWER_UP, 1);
+
+    if (sd->size > 1 * G_BYTE) {
+        sd->ocr = FIELD_DP32(sd->ocr, OCR, CARD_CAPACITY, 1);
+    }
 }
 
 static void sd_set_scr(SDState *sd)
 {
-    sd->scr[0] = 0x00;		/* SCR Structure */
-    sd->scr[1] = 0x2f;		/* SD Security Support */
-    sd->scr[2] = 0x00;
+    sd->scr[0] = (0 << 4)       /* SCR version 1.0 */
+                 | 0;           /* Spec Versions 1.0 and 1.01 */
+    sd->scr[1] = (2 << 4)       /* SDSC Card (Security Version 1.01) */
+                 | 0b0101;      /* 1-bit or 4-bit width bus modes */
+    sd->scr[2] = 0x00;          /* Extended Security is not supported. */
     sd->scr[3] = 0x00;
+    /* reserved for manufacturer usage */
     sd->scr[4] = 0x00;
     sd->scr[5] = 0x00;
     sd->scr[6] = 0x00;
@@ -299,11 +367,11 @@ static void sd_set_csd(SDState *sd, uint64_t size)
     uint32_t sectsize = (1 << (SECTOR_SHIFT + 1)) - 1;
     uint32_t wpsize = (1 << (WPGROUP_SHIFT + 1)) - 1;
 
-    if (size <= 0x40000000) {	/* Standard Capacity SD */
+    if (size <= 1 * G_BYTE) { /* Standard Capacity SD */
         sd->csd[0] = 0x00;	/* CSD structure */
         sd->csd[1] = 0x26;	/* Data read access-time-1 */
         sd->csd[2] = 0x00;	/* Data read access-time-2 */
-        sd->csd[3] = 0x5a;	/* Max. data transfer rate */
+        sd->csd[3] = 0x32;      /* Max. data transfer rate: 25 MHz */
         sd->csd[4] = 0x5f;	/* Card Command Classes */
         sd->csd[5] = 0x50 |	/* Max. read data block length */
             HWBLOCK_SHIFT;
@@ -324,7 +392,6 @@ static void sd_set_csd(SDState *sd, uint64_t size)
         sd->csd[13] = 0x20 |	/* Max. write data block length */
             ((HWBLOCK_SHIFT << 6) & 0xc0);
         sd->csd[14] = 0x00;	/* File format group */
-        sd->csd[15] = (sd_crc7(sd->csd, 15) << 1) | 1;
     } else {			/* SDHC */
         size /= 512 * 1024;
         size -= 1;
@@ -343,9 +410,8 @@ static void sd_set_csd(SDState *sd, uint64_t size)
         sd->csd[12] = 0x0a;
         sd->csd[13] = 0x40;
         sd->csd[14] = 0x00;
-        sd->csd[15] = 0x00;
-        sd->ocr |= 1 << 30;     /* High Capacity SD Memory Card */
     }
+    sd->csd[15] = (sd_crc7(sd->csd, 15) << 1) | 1;
 }
 
 static void sd_set_rca(SDState *sd)
@@ -353,14 +419,56 @@ static void sd_set_rca(SDState *sd)
     sd->rca += 0x4567;
 }
 
+FIELD(CSR, AKE_SEQ_ERROR,               3,  1)
+FIELD(CSR, APP_CMD,                     5,  1)
+FIELD(CSR, FX_EVENT,                    6,  1)
+FIELD(CSR, READY_FOR_DATA,              8,  1)
+FIELD(CSR, CURRENT_STATE,               9,  4)
+FIELD(CSR, ERASE_RESET,                13,  1)
+FIELD(CSR, CARD_ECC_DISABLED,          14,  1)
+FIELD(CSR, WP_ERASE_SKIP,              15,  1)
+FIELD(CSR, CSD_OVERWRITE,              16,  1)
+FIELD(CSR, DEFERRED_RESPONSE,          17,  1)
+FIELD(CSR, ERROR,                      19,  1)
+FIELD(CSR, CC_ERROR,                   20,  1)
+FIELD(CSR, CARD_ECC_FAILED,            21,  1)
+FIELD(CSR, ILLEGAL_COMMAND,            22,  1)
+FIELD(CSR, COM_CRC_ERROR,              23,  1)
+FIELD(CSR, LOCK_UNLOCK_FAILED,         24,  1)
+FIELD(CSR, CARD_IS_LOCKED,             25,  1)
+FIELD(CSR, WP_VIOLATION,               26,  1)
+FIELD(CSR, ERASE_PARAM,                27,  1)
+FIELD(CSR, ERASE_SEQ_ERROR,            28,  1)
+FIELD(CSR, BLOCK_LEN_ERROR,            29,  1)
+FIELD(CSR, ADDRESS_ERROR,              30,  1)
+FIELD(CSR, OUT_OF_RANGE,               31,  1)
+
 /* Card status bits, split by clear condition:
  * A : According to the card current state
  * B : Always related to the previous command
  * C : Cleared by read
  */
-#define CARD_STATUS_A	0x02004100
-#define CARD_STATUS_B	0x00c01e00
-#define CARD_STATUS_C	0xfd39a028
+#define CARD_STATUS_A           (R_CSR_READY_FOR_DATA_MASK \
+                               | R_CSR_CARD_ECC_DISABLED_MASK \
+                               | R_CSR_CARD_IS_LOCKED_MASK)
+#define CARD_STATUS_B           (R_CSR_CURRENT_STATE_MASK \
+                               | R_CSR_ILLEGAL_COMMAND_MASK \
+                               | R_CSR_COM_CRC_ERROR_MASK)
+#define CARD_STATUS_C           (R_CSR_AKE_SEQ_ERROR_MASK \
+                               | R_CSR_APP_CMD_MASK \
+                               | R_CSR_ERASE_RESET_MASK \
+                               | R_CSR_WP_ERASE_SKIP_MASK \
+                               | R_CSR_CSD_OVERWRITE_MASK \
+                               | R_CSR_ERROR_MASK \
+                               | R_CSR_CC_ERROR_MASK \
+                               | R_CSR_CARD_ECC_FAILED_MASK \
+                               | R_CSR_LOCK_UNLOCK_FAILED_MASK \
+                               | R_CSR_WP_VIOLATION_MASK \
+                               | R_CSR_ERASE_PARAM_MASK \
+                               | R_CSR_ERASE_SEQ_ERROR_MASK \
+                               | R_CSR_BLOCK_LEN_ERROR_MASK \
+                               | R_CSR_ADDRESS_ERROR_MASK \
+                               | R_CSR_OUT_OF_RANGE_MASK)
 
 static void sd_set_cardstatus(SDState *sd)
 {
@@ -376,57 +484,39 @@ static int sd_req_crc_validate(SDRequest *req)
 {
     uint8_t buffer[5];
     buffer[0] = 0x40 | req->cmd;
-    buffer[1] = (req->arg >> 24) & 0xff;
-    buffer[2] = (req->arg >> 16) & 0xff;
-    buffer[3] = (req->arg >> 8) & 0xff;
-    buffer[4] = (req->arg >> 0) & 0xff;
+    stl_be_p(&buffer[1], req->arg);
     return 0;
     return sd_crc7(buffer, 5) != req->crc;	/* TODO */
 }
 
 static void sd_response_r1_make(SDState *sd, uint8_t *response)
 {
-    uint32_t status = sd->card_status;
+    stl_be_p(response, sd->card_status);
+
     /* Clear the "clear on read" status bits */
     sd->card_status &= ~CARD_STATUS_C;
-
-    response[0] = (status >> 24) & 0xff;
-    response[1] = (status >> 16) & 0xff;
-    response[2] = (status >> 8) & 0xff;
-    response[3] = (status >> 0) & 0xff;
 }
 
 static void sd_response_r3_make(SDState *sd, uint8_t *response)
 {
-    response[0] = (sd->ocr >> 24) & 0xff;
-    response[1] = (sd->ocr >> 16) & 0xff;
-    response[2] = (sd->ocr >> 8) & 0xff;
-    response[3] = (sd->ocr >> 0) & 0xff;
+    stl_be_p(response, sd->ocr & ACMD41_R3_MASK);
 }
 
 static void sd_response_r6_make(SDState *sd, uint8_t *response)
 {
-    uint16_t arg;
     uint16_t status;
 
-    arg = sd->rca;
     status = ((sd->card_status >> 8) & 0xc000) |
              ((sd->card_status >> 6) & 0x2000) |
               (sd->card_status & 0x1fff);
     sd->card_status &= ~(CARD_STATUS_C & 0xc81fff);
-
-    response[0] = (arg >> 8) & 0xff;
-    response[1] = arg & 0xff;
-    response[2] = (status >> 8) & 0xff;
-    response[3] = status & 0xff;
+    stw_be_p(response + 0, sd->rca);
+    stw_be_p(response + 2, status);
 }
 
 static void sd_response_r7_make(SDState *sd, uint8_t *response)
 {
-    response[0] = (sd->vhs >> 24) & 0xff;
-    response[1] = (sd->vhs >> 16) & 0xff;
-    response[2] = (sd->vhs >>  8) & 0xff;
-    response[3] = (sd->vhs >>  0) & 0xff;
+    stl_be_p(response, sd->vhs);
 }
 
 static inline uint64_t sd_addr_to_wpnum(uint64_t addr)
@@ -440,6 +530,7 @@ static void sd_reset(DeviceState *dev)
     uint64_t size;
     uint64_t sect;
 
+    trace_sdcard_reset();
     if (sd->blk) {
         blk_get_geometry(sd->blk, &sect);
     } else {
@@ -493,7 +584,10 @@ static void sd_cardchange(void *opaque, bool load, Error **errp)
     bool readonly = sd_get_readonly(sd);
 
     if (inserted) {
+        trace_sdcard_inserted(readonly);
         sd_reset(dev);
+    } else {
+        trace_sdcard_ejected();
     }
 
     /* The IRQ notification is for legacy non-QOM SD controller devices;
@@ -521,7 +615,7 @@ static bool sd_ocr_vmstate_needed(void *opaque)
     SDState *sd = opaque;
 
     /* Include the OCR state (and timer) if it is not yet powered up */
-    return !(sd->ocr & OCR_POWER_UP);
+    return !FIELD_EX32(sd->ocr, OCR, CARD_POWER_UP);
 }
 
 static const VMStateDescription sd_ocr_vmstate = {
@@ -625,12 +719,13 @@ static void sd_erase(SDState *sd)
     uint64_t erase_start = sd->erase_start;
     uint64_t erase_end = sd->erase_end;
 
+    trace_sdcard_erase();
     if (!sd->erase_start || !sd->erase_end) {
         sd->card_status |= ERASE_SEQ_ERROR;
         return;
     }
 
-    if (extract32(sd->ocr, OCR_CCS_BITN, 1)) {
+    if (FIELD_EX32(sd->ocr, OCR, CARD_CAPACITY)) {
         /* High capacity memory card: erase units are 512 byte blocks */
         erase_start *= 512;
         erase_end *= 512;
@@ -667,7 +762,7 @@ static uint32_t sd_wpbits(SDState *sd, uint64_t addr)
 
 static void sd_function_switch(SDState *sd, uint32_t arg)
 {
-    int i, mode, new_func, crc;
+    int i, mode, new_func;
     mode = !!(arg & 0x80000000);
 
     sd->data[0] = 0x00;		/* Maximum current consumption */
@@ -691,9 +786,7 @@ static void sd_function_switch(SDState *sd, uint32_t arg)
         sd->data[14 + (i >> 1)] = new_func << ((i * 4) & 4);
     }
     memset(&sd->data[17], 0, 47);
-    crc = sd_crc16(sd->data, 64);
-    sd->data[65] = crc >> 8;
-    sd->data[66] = crc & 0xff;
+    stw_be_p(sd->data + 65, sd_crc16(sd->data, 64));
 }
 
 static inline bool sd_wp_addr(SDState *sd, uint64_t addr)
@@ -714,6 +807,11 @@ static void sd_lock_command(SDState *sd)
     else
         pwd_len = 0;
 
+    if (lock) {
+        trace_sdcard_lock();
+    } else {
+        trace_sdcard_unlock();
+    }
     if (erase) {
         if (!(sd->card_status & CARD_IS_LOCKED) || sd->blk_len > 1 ||
                         set_pwd || clr_pwd || lock || sd->wp_switch ||
@@ -774,11 +872,13 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
     uint32_t rca = 0x0000;
     uint64_t addr = (sd->ocr & (1 << 30)) ? (uint64_t) req.arg << 9 : req.arg;
 
+    trace_sdcard_normal_command(req.cmd, req.arg, sd_state_name(sd->state));
+
     /* Not interpreting this as an app command */
     sd->card_status &= ~APP_CMD;
 
-    if (sd_cmd_type[req.cmd & 0x3F] == sd_ac
-        || sd_cmd_type[req.cmd & 0x3F] == sd_adtc) {
+    if (sd_cmd_type[req.cmd] == sd_ac
+        || sd_cmd_type[req.cmd] == sd_adtc) {
         rca = req.arg >> 16;
     }
 
@@ -788,7 +888,6 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         sd->multi_blk_cnt = 0;
     }
 
-    DPRINTF("CMD%d 0x%08x state %d\n", req.cmd, req.arg, sd->state);
     switch (req.cmd) {
     /* Basic commands (Class 0 and Class 1) */
     case 0:	/* CMD0:   GO_IDLE_STATE */
@@ -909,23 +1008,19 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
 
     case 8:	/* CMD8:   SEND_IF_COND */
         /* Physical Layer Specification Version 2.00 command */
-        switch (sd->state) {
-        case sd_idle_state:
-            sd->vhs = 0;
-
-            /* No response if not exactly one VHS bit is set.  */
-            if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) {
-                return sd->spi ? sd_r7 : sd_r0;
-            }
-
-            /* Accept.  */
-            sd->vhs = req.arg;
-            return sd_r7;
-
-        default:
+        if (sd->state != sd_idle_state) {
             break;
         }
-        break;
+        sd->vhs = 0;
+
+        /* No response if not exactly one VHS bit is set.  */
+        if (!(req.arg >> 8) || (req.arg >> (ctz32(req.arg & ~0xff) + 1))) {
+            return sd->spi ? sd_r7 : sd_r0;
+        }
+
+        /* Accept.  */
+        sd->vhs = req.arg;
+        return sd_r7;
 
     case 9:	/* CMD9:   SEND_CSD */
         switch (sd->state) {
@@ -971,24 +1066,6 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         }
         break;
 
-    case 11:	/* CMD11:  READ_DAT_UNTIL_STOP */
-        if (sd->spi)
-            goto bad_cmd;
-        switch (sd->state) {
-        case sd_transfer_state:
-            sd->state = sd_sendingdata_state;
-            sd->data_start = req.arg;
-            sd->data_offset = 0;
-
-            if (sd->data_start + sd->blk_len > sd->size)
-                sd->card_status |= ADDRESS_ERROR;
-            return sd_r0;
-
-        default:
-            break;
-        }
-        break;
-
     case 12:	/* CMD12:  STOP_TRANSMISSION */
         switch (sd->state) {
         case sd_sendingdata_state:
@@ -1039,10 +1116,12 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
     case 16:	/* CMD16:  SET_BLOCKLEN */
         switch (sd->state) {
         case sd_transfer_state:
-            if (req.arg > (1 << HWBLOCK_SHIFT))
+            if (req.arg > (1 << HWBLOCK_SHIFT)) {
                 sd->card_status |= BLOCK_LEN_ERROR;
-            else
+            } else {
+                trace_sdcard_set_blocklen(req.arg);
                 sd->blk_len = req.arg;
+            }
 
             return sd_r1;
 
@@ -1096,8 +1175,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
 
     /* Block write commands (Class 4) */
     case 24:	/* CMD24:  WRITE_SINGLE_BLOCK */
-        if (sd->spi)
-            goto unimplemented_cmd;
+        if (sd->spi) {
+            goto unimplemented_spi_cmd;
+        }
         switch (sd->state) {
         case sd_transfer_state:
             /* Writing in SPI mode not implemented.  */
@@ -1122,8 +1202,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         break;
 
     case 25:	/* CMD25:  WRITE_MULTIPLE_BLOCK */
-        if (sd->spi)
-            goto unimplemented_cmd;
+        if (sd->spi) {
+            goto unimplemented_spi_cmd;
+        }
         switch (sd->state) {
         case sd_transfer_state:
             /* Writing in SPI mode not implemented.  */
@@ -1163,8 +1244,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         break;
 
     case 27:	/* CMD27:  PROGRAM_CSD */
-        if (sd->spi)
-            goto unimplemented_cmd;
+        if (sd->spi) {
+            goto unimplemented_spi_cmd;
+        }
         switch (sd->state) {
         case sd_transfer_state:
             sd->state = sd_receivingdata_state;
@@ -1274,8 +1356,9 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
 
     /* Lock card commands (Class 7) */
     case 42:	/* CMD42:  LOCK_UNLOCK */
-        if (sd->spi)
-            goto unimplemented_cmd;
+        if (sd->spi) {
+            goto unimplemented_spi_cmd;
+        }
         switch (sd->state) {
         case sd_transfer_state:
             sd->state = sd_receivingdata_state;
@@ -1288,9 +1371,8 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         }
         break;
 
-    case 52:
-    case 53:
-        /* CMD52, CMD53: reserved for SDIO cards
+    case 52 ... 54:
+        /* CMD52, CMD53, CMD54: reserved for SDIO cards
          * (see the SDIO Simplified Specification V2.0)
          * Handle as illegal command but do not complain
          * on stderr, as some OSes may use these in their
@@ -1300,16 +1382,29 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
 
     /* Application specific commands (Class 8) */
     case 55:	/* CMD55:  APP_CMD */
-        if (sd->rca != rca)
-            return sd_r0;
-
+        switch (sd->state) {
+        case sd_ready_state:
+        case sd_identification_state:
+        case sd_inactive_state:
+            return sd_illegal;
+        case sd_idle_state:
+            if (rca) {
+                qemu_log_mask(LOG_GUEST_ERROR,
+                              "SD: illegal RCA 0x%04x for APP_CMD\n", req.cmd);
+            }
+        default:
+            break;
+        }
+        if (!sd->spi) {
+            if (sd->rca != rca) {
+                return sd_r0;
+            }
+        }
         sd->expecting_acmd = true;
         sd->card_status |= APP_CMD;
         return sd_r1;
 
     case 56:	/* CMD56:  GEN_CMD */
-        fprintf(stderr, "SD: GEN_CMD 0x%08x\n", req.arg);
-
         switch (sd->state) {
         case sd_transfer_state:
             sd->data_offset = 0;
@@ -1324,12 +1419,24 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
         }
         break;
 
+    case 58:    /* CMD58:   READ_OCR (SPI) */
+        if (!sd->spi) {
+            goto bad_cmd;
+        }
+        return sd_r3;
+
+    case 59:    /* CMD59:   CRC_ON_OFF (SPI) */
+        if (!sd->spi) {
+            goto bad_cmd;
+        }
+        goto unimplemented_spi_cmd;
+
     default:
     bad_cmd:
         qemu_log_mask(LOG_GUEST_ERROR, "SD: Unknown CMD%i\n", req.cmd);
         return sd_illegal;
 
-    unimplemented_cmd:
+    unimplemented_spi_cmd:
         /* Commands that are recognised but not yet implemented in SPI mode.  */
         qemu_log_mask(LOG_UNIMP, "SD: CMD%i not implemented in SPI mode\n",
                       req.cmd);
@@ -1343,10 +1450,13 @@ static sd_rsp_type_t sd_normal_command(SDState *sd,
 static sd_rsp_type_t sd_app_command(SDState *sd,
                                     SDRequest req)
 {
-    DPRINTF("ACMD%d 0x%08x\n", req.cmd, req.arg);
+    trace_sdcard_app_command(req.cmd, req.arg);
     sd->card_status |= APP_CMD;
     switch (req.cmd) {
     case 6:	/* ACMD6:  SET_BUS_WIDTH */
+        if (sd->spi) {
+            goto unimplemented_spi_cmd;
+        }
         switch (sd->state) {
         case sd_transfer_state:
             sd->sd_status[0] &= 0x3f;
@@ -1402,42 +1512,41 @@ static sd_rsp_type_t sd_app_command(SDState *sd,
             sd->state = sd_transfer_state;
             return sd_r1;
         }
-        switch (sd->state) {
-        case sd_idle_state:
-            /* If it's the first ACMD41 since reset, we need to decide
-             * whether to power up. If this is not an enquiry ACMD41,
-             * we immediately report power on and proceed below to the
-             * ready state, but if it is, we set a timer to model a
-             * delay for power up. This works around a bug in EDK2
-             * UEFI, which sends an initial enquiry ACMD41, but
-             * assumes that the card is in ready state as soon as it
-             * sees the power up bit set. */
-            if (!(sd->ocr & OCR_POWER_UP)) {
-                if ((req.arg & ACMD41_ENQUIRY_MASK) != 0) {
-                    timer_del(sd->ocr_power_timer);
-                    sd_ocr_powerup(sd);
-                } else if (!timer_pending(sd->ocr_power_timer)) {
+        if (sd->state != sd_idle_state) {
+            break;
+        }
+        /* If it's the first ACMD41 since reset, we need to decide
+         * whether to power up. If this is not an enquiry ACMD41,
+         * we immediately report power on and proceed below to the
+         * ready state, but if it is, we set a timer to model a
+         * delay for power up. This works around a bug in EDK2
+         * UEFI, which sends an initial enquiry ACMD41, but
+         * assumes that the card is in ready state as soon as it
+         * sees the power up bit set. */
+        if (!FIELD_EX32(sd->ocr, OCR, CARD_POWER_UP)) {
+            if ((req.arg & ACMD41_ENQUIRY_MASK) != 0) {
+                timer_del(sd->ocr_power_timer);
+                sd_ocr_powerup(sd);
+            } else {
+                trace_sdcard_inquiry_cmd41();
+                if (!timer_pending(sd->ocr_power_timer)) {
                     timer_mod_ns(sd->ocr_power_timer,
                                  (qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL)
                                   + OCR_POWER_DELAY_NS));
                 }
             }
+        }
 
+        if (FIELD_EX32(sd->ocr & req.arg, OCR, VDD_VOLTAGE_WINDOW)) {
             /* We accept any voltage.  10000 V is nothing.
              *
              * Once we're powered up, we advance straight to ready state
              * unless it's an enquiry ACMD41 (bits 23:0 == 0).
              */
-            if (req.arg & ACMD41_ENQUIRY_MASK) {
-                sd->state = sd_ready_state;
-            }
-
-            return sd_r3;
-
-        default:
-            break;
+            sd->state = sd_ready_state;
         }
-        break;
+
+        return sd_r3;
 
     case 42:	/* ACMD42: SET_CLR_CARD_DETECT */
         switch (sd->state) {
@@ -1463,9 +1572,27 @@ static sd_rsp_type_t sd_app_command(SDState *sd,
         }
         break;
 
+    case 18:    /* Reserved for SD security applications */
+    case 25:
+    case 26:
+    case 38:
+    case 43 ... 49:
+        /* Refer to the "SD Specifications Part3 Security Specification" for
+         * information about the SD Security Features.
+         */
+        qemu_log_mask(LOG_UNIMP, "SD: CMD%i Security not implemented\n",
+                      req.cmd);
+        return sd_illegal;
+
     default:
         /* Fall back to standard commands.  */
         return sd_normal_command(sd, req);
+
+    unimplemented_spi_cmd:
+        /* Commands that are recognised but not yet implemented in SPI mode.  */
+        qemu_log_mask(LOG_UNIMP, "SD: CMD%i not implemented in SPI mode\n",
+                      req.cmd);
+        return sd_illegal;
     }
 
     qemu_log_mask(LOG_GUEST_ERROR, "SD: ACMD%i in a wrong state\n", req.cmd);
@@ -1488,8 +1615,8 @@ static int cmd_valid_while_locked(SDState *sd, SDRequest *req)
     if (req->cmd == 16 || req->cmd == 55) {
         return 1;
     }
-    return sd_cmd_class[req->cmd & 0x3F] == 0
-            || sd_cmd_class[req->cmd & 0x3F] == 7;
+    return sd_cmd_class[req->cmd] == 0
+            || sd_cmd_class[req->cmd] == 7;
 }
 
 int sd_do_command(SDState *sd, SDRequest *req,
@@ -1508,6 +1635,12 @@ int sd_do_command(SDState *sd, SDRequest *req,
         goto send_response;
     }
 
+    if (req->cmd >= SDMMC_CMD_MAX) {
+        qemu_log_mask(LOG_GUEST_ERROR, "SD: incorrect command 0x%02x\n",
+                      req->cmd);
+        req->cmd &= 0x3f;
+    }
+
     if (sd->card_status & CARD_IS_LOCKED) {
         if (!cmd_valid_while_locked(sd, req)) {
             sd->card_status |= ILLEGAL_COMMAND;
@@ -1574,10 +1707,12 @@ send_response:
 
     case sd_r0:
     case sd_illegal:
-    default:
         rsplen = 0;
         break;
+    default:
+        g_assert_not_reached();
     }
+    trace_sdcard_response(sd_response_name(rtype), rsplen);
 
     if (rtype != sd_illegal) {
         /* Clear the "clear on valid command" status bits now we've
@@ -1587,16 +1722,7 @@ send_response:
     }
 
 #ifdef DEBUG_SD
-    if (rsplen) {
-        int i;
-        DPRINTF("Response:");
-        for (i = 0; i < rsplen; i++) {
-            DPRINTF(" %02x", response[i]);
-        }
-        DPRINTF(" state %d\n", sd->state);
-    } else {
-        DPRINTF("No response %d\n", sd->state);
-    }
+    qemu_hexdump((const char *)response, stderr, "Response", rsplen);
 #endif
 
     return rsplen;
@@ -1604,8 +1730,7 @@ send_response:
 
 static void sd_blk_read(SDState *sd, uint64_t addr, uint32_t len)
 {
-    DPRINTF("sd_blk_read: addr = 0x%08llx, len = %d\n",
-            (unsigned long long) addr, len);
+    trace_sdcard_read_block(addr, len);
     if (!sd->blk || blk_pread(sd->blk, addr, sd->data, len) < 0) {
         fprintf(stderr, "sd_blk_read: read error on host side\n");
     }
@@ -1613,6 +1738,7 @@ static void sd_blk_read(SDState *sd, uint64_t addr, uint32_t len)
 
 static void sd_blk_write(SDState *sd, uint64_t addr, uint32_t len)
 {
+    trace_sdcard_write_block(addr, len);
     if (!sd->blk || blk_pwrite(sd->blk, addr, sd->data, len, 0) < 0) {
         fprintf(stderr, "sd_blk_write: write error on host side\n");
     }
@@ -1639,6 +1765,7 @@ void sd_write_data(SDState *sd, uint8_t value)
     if (sd->card_status & (ADDRESS_ERROR | WP_VIOLATION))
         return;
 
+    trace_sdcard_write_data(sd->current_cmd, value);
     switch (sd->current_cmd) {
     case 24:	/* CMD24:  WRITE_SINGLE_BLOCK */
         sd->data[sd->data_offset ++] = value;
@@ -1776,6 +1903,7 @@ uint8_t sd_read_data(SDState *sd)
 
     io_len = (sd->ocr & (1 << 30)) ? 512 : sd->blk_len;
 
+    trace_sdcard_read_data(sd->current_cmd, io_len);
     switch (sd->current_cmd) {
     case 6:	/* CMD6:   SWITCH_FUNCTION */
         ret = sd->data[sd->data_offset ++];
@@ -1792,21 +1920,6 @@ uint8_t sd_read_data(SDState *sd)
             sd->state = sd_transfer_state;
         break;
 
-    case 11:	/* CMD11:  READ_DAT_UNTIL_STOP */
-        if (sd->data_offset == 0)
-            BLK_READ_BLOCK(sd->data_start, io_len);
-        ret = sd->data[sd->data_offset ++];
-
-        if (sd->data_offset >= io_len) {
-            sd->data_start += io_len;
-            sd->data_offset = 0;
-            if (sd->data_start + io_len > sd->size) {
-                sd->card_status |= ADDRESS_ERROR;
-                break;
-            }
-        }
-        break;
-
     case 13:	/* ACMD13: SD_STATUS */
         ret = sd->sd_status[sd->data_offset ++];
 
diff --git a/hw/sd/sdmmc-internal.h b/hw/sd/sdmmc-internal.h
new file mode 100644
index 0000000000..0e96cb0081
--- /dev/null
+++ b/hw/sd/sdmmc-internal.h
@@ -0,0 +1,15 @@
+/*
+ * SD/MMC cards common
+ *
+ * Copyright (c) 2018  Philippe Mathieu-Daudé <f4bug@amsat.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef SD_INTERNAL_H
+#define SD_INTERNAL_H
+
+#define SDMMC_CMD_MAX 64
+
+#endif
diff --git a/hw/sd/ssi-sd.c b/hw/sd/ssi-sd.c
index f88f509e0a..ae04b6641b 100644
--- a/hw/sd/ssi-sd.c
+++ b/hw/sd/ssi-sd.c
@@ -47,7 +47,7 @@ typedef struct {
     int32_t arglen;
     int32_t response_pos;
     int32_t stopping;
-    SDState *sd;
+    SDBus sdbus;
 } ssi_sd_state;
 
 #define TYPE_SSI_SD "ssi-sd"
@@ -100,7 +100,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val)
             request.arg = (s->cmdarg[0] << 24) | (s->cmdarg[1] << 16)
                            | (s->cmdarg[2] << 8) | s->cmdarg[3];
             DPRINTF("CMD%d arg 0x%08x\n", s->cmd, request.arg);
-            s->arglen = sd_do_command(s->sd, &request, longresp);
+            s->arglen = sdbus_do_command(&s->sdbus, &request, longresp);
             if (s->arglen <= 0) {
                 s->arglen = 1;
                 s->response[0] = 4;
@@ -177,7 +177,7 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val)
             DPRINTF("Response 0x%02x\n", s->response[s->response_pos]);
             return s->response[s->response_pos++];
         }
-        if (sd_data_ready(s->sd)) {
+        if (sdbus_data_ready(&s->sdbus)) {
             DPRINTF("Data read\n");
             s->mode = SSI_SD_DATA_START;
         } else {
@@ -190,8 +190,8 @@ static uint32_t ssi_sd_transfer(SSISlave *dev, uint32_t val)
         s->mode = SSI_SD_DATA_READ;
         return 0xfe;
     case SSI_SD_DATA_READ:
-        val = sd_read_data(s->sd);
-        if (!sd_data_ready(s->sd)) {
+        val = sdbus_read_data(&s->sdbus);
+        if (!sdbus_data_ready(&s->sdbus)) {
             DPRINTF("Data read end\n");
             s->mode = SSI_SD_CMD;
         }
@@ -242,13 +242,24 @@ static const VMStateDescription vmstate_ssi_sd = {
 static void ssi_sd_realize(SSISlave *d, Error **errp)
 {
     ssi_sd_state *s = FROM_SSI_SLAVE(ssi_sd_state, d);
+    DeviceState *carddev;
     DriveInfo *dinfo;
+    Error *err = NULL;
 
+    qbus_create_inplace(&s->sdbus, sizeof(s->sdbus), TYPE_SD_BUS,
+                        DEVICE(d), "sd-bus");
+
+    /* Create and plug in the sd card */
     /* FIXME use a qdev drive property instead of drive_get_next() */
     dinfo = drive_get_next(IF_SD);
-    s->sd = sd_init(dinfo ? blk_by_legacy_dinfo(dinfo) : NULL, true);
-    if (s->sd == NULL) {
-        error_setg(errp, "Device initialization failed.");
+    carddev = qdev_create(&s->sdbus.qbus, TYPE_SD_CARD);
+    if (dinfo) {
+        qdev_prop_set_drive(carddev, "drive", blk_by_legacy_dinfo(dinfo), &err);
+    }
+    object_property_set_bool(OBJECT(carddev), true, "spi", &err);
+    object_property_set_bool(OBJECT(carddev), true, "realized", &err);
+    if (err) {
+        error_setg(errp, "failed to init SD card: %s", error_get_pretty(err));
         return;
     }
 }
@@ -264,11 +275,6 @@ static void ssi_sd_reset(DeviceState *dev)
     s->arglen = 0;
     s->response_pos = 0;
     s->stopping = 0;
-
-    /* Since we're still using the legacy SD API the card is not plugged
-     * into any bus, and we must reset it manually.
-     */
-    device_reset(DEVICE(s->sd));
 }
 
 static void ssi_sd_class_init(ObjectClass *klass, void *data)
diff --git a/hw/sd/trace-events b/hw/sd/trace-events
index 0f8536db32..3040d32560 100644
--- a/hw/sd/trace-events
+++ b/hw/sd/trace-events
@@ -23,6 +23,26 @@ sdhci_read_dataport(uint16_t data_count) "all %u bytes of data have been read fr
 sdhci_write_dataport(uint16_t data_count) "write buffer filled with %u bytes of data"
 sdhci_capareg(const char *desc, uint16_t val) "%s: %u"
 
+# hw/sd/sd.c
+sdcard_normal_command(uint8_t cmd, uint32_t arg, const char *state) "CMD%d arg 0x%08x (state %s)"
+sdcard_app_command(uint8_t acmd, uint32_t arg) "ACMD%d arg 0x%08x"
+sdcard_response(const char *rspdesc, int rsplen) "%s (sz:%d)"
+sdcard_powerup(void) ""
+sdcard_inquiry_cmd41(void) ""
+sdcard_set_enable(bool current_state, bool new_state) "%u -> %u"
+sdcard_reset(void) ""
+sdcard_set_blocklen(uint16_t length) "0x%04x"
+sdcard_inserted(bool readonly) "read_only: %u"
+sdcard_ejected(void) ""
+sdcard_erase(void) ""
+sdcard_lock(void) ""
+sdcard_unlock(void) ""
+sdcard_read_block(uint64_t addr, uint32_t len) "addr 0x%" PRIx64 " size 0x%x"
+sdcard_write_block(uint64_t addr, uint32_t len) "addr 0x%" PRIx64 " size 0x%x"
+sdcard_write_data(uint8_t cmd, uint8_t value) "CMD%02d value 0x%02x"
+sdcard_read_data(uint8_t cmd, int length) "CMD%02d len %d"
+sdcard_set_voltage(uint16_t millivolts) "%u mV"
+
 # hw/sd/milkymist-memcard.c
 milkymist_memcard_memory_read(uint32_t addr, uint32_t value) "addr 0x%08x value 0x%08x"
 milkymist_memcard_memory_write(uint32_t addr, uint32_t value) "addr 0x%08x value 0x%08x"
diff --git a/hw/usb/dev-mtp.c b/hw/usb/dev-mtp.c
index 94c2e94f10..6ecf70a79b 100644
--- a/hw/usb/dev-mtp.c
+++ b/hw/usb/dev-mtp.c
@@ -46,6 +46,9 @@ enum mtp_code {
     CMD_GET_OBJECT_HANDLES         = 0x1007,
     CMD_GET_OBJECT_INFO            = 0x1008,
     CMD_GET_OBJECT                 = 0x1009,
+    CMD_DELETE_OBJECT              = 0x100b,
+    CMD_SEND_OBJECT_INFO           = 0x100c,
+    CMD_SEND_OBJECT                = 0x100d,
     CMD_GET_PARTIAL_OBJECT         = 0x101b,
     CMD_GET_OBJECT_PROPS_SUPPORTED = 0x9801,
     CMD_GET_OBJECT_PROP_DESC       = 0x9802,
@@ -62,7 +65,13 @@ enum mtp_code {
     RES_INVALID_STORAGE_ID         = 0x2008,
     RES_INVALID_OBJECT_HANDLE      = 0x2009,
     RES_INVALID_OBJECT_FORMAT_CODE = 0x200b,
+    RES_STORE_FULL                 = 0x200c,
+    RES_STORE_READ_ONLY            = 0x200e,
+    RES_PARTIAL_DELETE             = 0x2012,
+    RES_STORE_NOT_AVAILABLE        = 0x2013,
     RES_SPEC_BY_FORMAT_UNSUPPORTED = 0x2014,
+    RES_INVALID_OBJECTINFO         = 0x2015,
+    RES_DESTINATION_UNSUPPORTED    = 0x2020,
     RES_INVALID_PARENT_OBJECT      = 0x201a,
     RES_INVALID_PARAMETER          = 0x201d,
     RES_SESSION_ALREADY_OPEN       = 0x201e,
@@ -172,6 +181,7 @@ struct MTPState {
     MTPControl   *result;
     uint32_t     session;
     uint32_t     next_handle;
+    bool         readonly;
 
     QTAILQ_HEAD(, MTPObject) objects;
 #ifdef CONFIG_INOTIFY1
@@ -179,8 +189,44 @@ struct MTPState {
     int          inotifyfd;
     QTAILQ_HEAD(events, MTPMonEntry) events;
 #endif
+    /* Responder is expecting a write operation */
+    bool write_pending;
+    struct {
+        uint32_t parent_handle;
+        uint16_t format;
+        uint32_t size;
+        char *filename;
+    } dataset;
 };
 
+/*
+ * ObjectInfo dataset received from initiator
+ * Fields we don't care about are ignored
+ */
+typedef struct {
+    uint32_t storage_id; /*unused*/
+    uint16_t format;
+    uint16_t protection_status; /*unused*/
+    uint32_t size;
+    uint16_t thumb_format; /*unused*/
+    uint32_t thumb_comp_sz; /*unused*/
+    uint32_t thumb_pix_width; /*unused*/
+    uint32_t thumb_pix_height; /*unused*/
+    uint32_t image_pix_width; /*unused*/
+    uint32_t image_pix_height; /*unused*/
+    uint32_t image_bit_depth; /*unused*/
+    uint32_t parent; /*unused*/
+    uint16_t assoc_type;
+    uint32_t assoc_desc;
+    uint32_t seq_no; /*unused*/
+    uint8_t length; /*part of filename field*/
+    uint16_t filename[0];
+    char date_created[0]; /*unused*/
+    char date_modified[0]; /*unused*/
+    char keywords[0]; /*unused*/
+    /* string and other data follows */
+} QEMU_PACKED ObjectInfo;
+
 #define TYPE_USB_MTP "usb-mtp"
 #define USB_MTP(obj) OBJECT_CHECK(MTPState, (obj), TYPE_USB_MTP)
 
@@ -422,7 +468,6 @@ static MTPObject *usb_mtp_add_child(MTPState *s, MTPObject *o,
     return child;
 }
 
-#ifdef CONFIG_INOTIFY1
 static MTPObject *usb_mtp_object_lookup_name(MTPObject *parent,
                                              char *name, int len)
 {
@@ -437,6 +482,7 @@ static MTPObject *usb_mtp_object_lookup_name(MTPObject *parent,
     return NULL;
 }
 
+#ifdef CONFIG_INOTIFY1
 static MTPObject *usb_mtp_object_lookup_wd(MTPState *s, int wd)
 {
     MTPObject *iter;
@@ -540,9 +586,8 @@ static void inotify_watchfn(void *arg)
                 break;
 
             case IN_IGNORED:
-                o = usb_mtp_object_lookup_name(parent, event->name, event->len);
-                trace_usb_mtp_inotify_event(s->dev.addr, o->path,
-                                      event->mask, "Obj ignored");
+                trace_usb_mtp_inotify_event(s->dev.addr, parent->path,
+                                      event->mask, "Obj parent dir ignored");
                 break;
 
             default:
@@ -765,7 +810,8 @@ static void usb_mtp_add_time(MTPData *data, time_t time)
 /* ----------------------------------------------------------------------- */
 
 static void usb_mtp_queue_result(MTPState *s, uint16_t code, uint32_t trans,
-                                 int argc, uint32_t arg0, uint32_t arg1)
+                                 int argc, uint32_t arg0, uint32_t arg1,
+                                 uint32_t arg2)
 {
     MTPControl *c = g_new0(MTPControl, 1);
 
@@ -778,6 +824,9 @@ static void usb_mtp_queue_result(MTPState *s, uint16_t code, uint32_t trans,
     if (argc > 1) {
         c->argv[1] = arg1;
     }
+    if (argc > 2) {
+        c->argv[2] = arg2;
+    }
 
     assert(s->result == NULL);
     s->result = c;
@@ -796,6 +845,9 @@ static MTPData *usb_mtp_get_device_info(MTPState *s, MTPControl *c)
         CMD_GET_NUM_OBJECTS,
         CMD_GET_OBJECT_HANDLES,
         CMD_GET_OBJECT_INFO,
+        CMD_DELETE_OBJECT,
+        CMD_SEND_OBJECT_INFO,
+        CMD_SEND_OBJECT,
         CMD_GET_OBJECT,
         CMD_GET_PARTIAL_OBJECT,
         CMD_GET_OBJECT_PROPS_SUPPORTED,
@@ -1110,16 +1162,126 @@ static MTPData *usb_mtp_get_object_prop_value(MTPState *s, MTPControl *c,
     return d;
 }
 
+/* Return correct return code for a delete event */
+enum {
+    ALL_DELETE,
+    PARTIAL_DELETE,
+    READ_ONLY,
+};
+
+/* Assumes that children, if any, have been already freed */
+static void usb_mtp_object_free_one(MTPState *s, MTPObject *o)
+{
+#ifndef CONFIG_INOTIFY1
+    assert(o->nchildren == 0);
+    QTAILQ_REMOVE(&s->objects, o, next);
+    g_free(o->name);
+    g_free(o->path);
+    g_free(o);
+#endif
+}
+
+static int usb_mtp_deletefn(MTPState *s, MTPObject *o, uint32_t trans)
+{
+    MTPObject *iter, *iter2;
+    bool partial_delete = false;
+    bool success = false;
+
+    /*
+     * TODO: Add support for Protection Status
+     */
+
+    QLIST_FOREACH(iter, &o->children, list) {
+        if (iter->format == FMT_ASSOCIATION) {
+            QLIST_FOREACH(iter2, &iter->children, list) {
+                usb_mtp_deletefn(s, iter2, trans);
+            }
+        }
+    }
+
+    if (o->format == FMT_UNDEFINED_OBJECT) {
+        if (remove(o->path)) {
+            partial_delete = true;
+        } else {
+            usb_mtp_object_free_one(s, o);
+            success = true;
+        }
+    }
+
+    if (o->format == FMT_ASSOCIATION) {
+        if (rmdir(o->path)) {
+            partial_delete = true;
+        } else {
+            usb_mtp_object_free_one(s, o);
+            success = true;
+        }
+    }
+
+    if (success && partial_delete) {
+        return PARTIAL_DELETE;
+    }
+    if (!success && partial_delete) {
+        return READ_ONLY;
+    }
+    return ALL_DELETE;
+}
+
+static void usb_mtp_object_delete(MTPState *s, uint32_t handle,
+                                  uint32_t format_code, uint32_t trans)
+{
+    MTPObject *o;
+    int ret;
+
+    /* Return error if store is read-only */
+    if (!FLAG_SET(s, MTP_FLAG_WRITABLE)) {
+        usb_mtp_queue_result(s, RES_STORE_READ_ONLY,
+                             trans, 0, 0, 0, 0);
+        return;
+    }
+
+    if (format_code != 0) {
+        usb_mtp_queue_result(s, RES_SPEC_BY_FORMAT_UNSUPPORTED,
+                             trans, 0, 0, 0, 0);
+        return;
+    }
+
+    if (handle == 0xFFFFFFF) {
+        o = QTAILQ_FIRST(&s->objects);
+    } else {
+        o = usb_mtp_object_lookup(s, handle);
+    }
+    if (o == NULL) {
+        usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
+                             trans, 0, 0, 0, 0);
+        return;
+    }
+
+    ret = usb_mtp_deletefn(s, o, trans);
+    if (ret == PARTIAL_DELETE) {
+        usb_mtp_queue_result(s, RES_PARTIAL_DELETE,
+                             trans, 0, 0, 0, 0);
+        return;
+    } else if (ret == READ_ONLY) {
+        usb_mtp_queue_result(s, RES_STORE_READ_ONLY, trans,
+                             0, 0, 0, 0);
+        return;
+    } else {
+        usb_mtp_queue_result(s, RES_OK, trans,
+                             0, 0, 0, 0);
+        return;
+    }
+}
+
 static void usb_mtp_command(MTPState *s, MTPControl *c)
 {
     MTPData *data_in = NULL;
-    MTPObject *o;
+    MTPObject *o = NULL;
     uint32_t nres = 0, res0 = 0;
 
     /* sanity checks */
     if (c->code >= CMD_CLOSE_SESSION && s->session == 0) {
         usb_mtp_queue_result(s, RES_SESSION_NOT_OPEN,
-                             c->trans, 0, 0, 0);
+                             c->trans, 0, 0, 0, 0);
         return;
     }
 
@@ -1131,12 +1293,12 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
     case CMD_OPEN_SESSION:
         if (s->session) {
             usb_mtp_queue_result(s, RES_SESSION_ALREADY_OPEN,
-                                 c->trans, 1, s->session, 0);
+                                 c->trans, 1, s->session, 0, 0);
             return;
         }
         if (c->argv[0] == 0) {
             usb_mtp_queue_result(s, RES_INVALID_PARAMETER,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         trace_usb_mtp_op_open_session(s->dev.addr);
@@ -1165,7 +1327,7 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         if (c->argv[0] != QEMU_STORAGE_ID &&
             c->argv[0] != 0xffffffff) {
             usb_mtp_queue_result(s, RES_INVALID_STORAGE_ID,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_storage_info(s, c);
@@ -1175,12 +1337,12 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         if (c->argv[0] != QEMU_STORAGE_ID &&
             c->argv[0] != 0xffffffff) {
             usb_mtp_queue_result(s, RES_INVALID_STORAGE_ID,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         if (c->argv[1] != 0x00000000) {
             usb_mtp_queue_result(s, RES_SPEC_BY_FORMAT_UNSUPPORTED,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         if (c->argv[2] == 0x00000000 ||
@@ -1191,12 +1353,12 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         }
         if (o == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         if (o->format != FMT_ASSOCIATION) {
             usb_mtp_queue_result(s, RES_INVALID_PARENT_OBJECT,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         usb_mtp_object_readdir(s, o);
@@ -1212,7 +1374,7 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         o = usb_mtp_object_lookup(s, c->argv[0]);
         if (o == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_object_info(s, c, o);
@@ -1221,47 +1383,98 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         o = usb_mtp_object_lookup(s, c->argv[0]);
         if (o == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         if (o->format == FMT_ASSOCIATION) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_object(s, c, o);
         if (data_in == NULL) {
             usb_mtp_queue_result(s, RES_GENERAL_ERROR,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         break;
+    case CMD_DELETE_OBJECT:
+        usb_mtp_object_delete(s, c->argv[0], c->argv[1], c->trans);
+        return;
     case CMD_GET_PARTIAL_OBJECT:
         o = usb_mtp_object_lookup(s, c->argv[0]);
         if (o == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         if (o->format == FMT_ASSOCIATION) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_partial_object(s, c, o);
         if (data_in == NULL) {
             usb_mtp_queue_result(s, RES_GENERAL_ERROR,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         nres = 1;
         res0 = data_in->length;
         break;
+    case CMD_SEND_OBJECT_INFO:
+        /* Return error if store is read-only */
+        if (!FLAG_SET(s, MTP_FLAG_WRITABLE)) {
+            usb_mtp_queue_result(s, RES_STORE_READ_ONLY,
+                                 c->trans, 0, 0, 0, 0);
+        } else if (c->argv[0] && (c->argv[0] != QEMU_STORAGE_ID)) {
+            /* First parameter points to storage id or is 0 */
+            usb_mtp_queue_result(s, RES_STORE_NOT_AVAILABLE, c->trans,
+                                 0, 0, 0, 0);
+        } else if (c->argv[1] && !c->argv[0]) {
+            /* If second parameter is specified, first must also be specified */
+            usb_mtp_queue_result(s, RES_DESTINATION_UNSUPPORTED, c->trans,
+                                 0, 0, 0, 0);
+        } else {
+            uint32_t handle = c->argv[1];
+            if (handle == 0xFFFFFFFF || handle == 0) {
+                /* root object */
+                o = QTAILQ_FIRST(&s->objects);
+            } else {
+                o = usb_mtp_object_lookup(s, handle);
+            }
+            if (o == NULL) {
+                usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE, c->trans,
+                                     0, 0, 0, 0);
+            }
+            if (o->format != FMT_ASSOCIATION) {
+                usb_mtp_queue_result(s, RES_INVALID_PARENT_OBJECT, c->trans,
+                                     0, 0, 0, 0);
+            }
+        }
+        if (o) {
+            s->dataset.parent_handle = o->handle;
+        }
+        s->data_out = usb_mtp_data_alloc(c);
+        return;
+    case CMD_SEND_OBJECT:
+        if (!FLAG_SET(s, MTP_FLAG_WRITABLE)) {
+            usb_mtp_queue_result(s, RES_STORE_READ_ONLY,
+                                 c->trans, 0, 0, 0, 0);
+            return;
+        }
+        if (!s->write_pending) {
+            usb_mtp_queue_result(s, RES_INVALID_OBJECTINFO,
+                                 c->trans, 0, 0, 0, 0);
+            return;
+        }
+        s->data_out = usb_mtp_data_alloc(c);
+        return;
     case CMD_GET_OBJECT_PROPS_SUPPORTED:
         if (c->argv[0] != FMT_UNDEFINED_OBJECT &&
             c->argv[0] != FMT_ASSOCIATION) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_FORMAT_CODE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_object_props_supported(s, c);
@@ -1270,13 +1483,13 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         if (c->argv[1] != FMT_UNDEFINED_OBJECT &&
             c->argv[1] != FMT_ASSOCIATION) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_FORMAT_CODE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_object_prop_desc(s, c);
         if (data_in == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_PROP_CODE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         break;
@@ -1284,20 +1497,20 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         o = usb_mtp_object_lookup(s, c->argv[0]);
         if (o == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_HANDLE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         data_in = usb_mtp_get_object_prop_value(s, c, o);
         if (data_in == NULL) {
             usb_mtp_queue_result(s, RES_INVALID_OBJECT_PROP_CODE,
-                                 c->trans, 0, 0, 0);
+                                 c->trans, 0, 0, 0, 0);
             return;
         }
         break;
     default:
         trace_usb_mtp_op_unknown(s->dev.addr, c->code);
         usb_mtp_queue_result(s, RES_OPERATION_NOT_SUPPORTED,
-                             c->trans, 0, 0, 0);
+                             c->trans, 0, 0, 0, 0);
         return;
     }
 
@@ -1306,7 +1519,7 @@ static void usb_mtp_command(MTPState *s, MTPControl *c)
         assert(s->data_in == NULL);
         s->data_in = data_in;
     }
-    usb_mtp_queue_result(s, RES_OK, c->trans, nres, res0, 0);
+    usb_mtp_queue_result(s, RES_OK, c->trans, nres, res0, 0, 0);
 }
 
 /* ----------------------------------------------------------------------- */
@@ -1351,12 +1564,191 @@ static void usb_mtp_cancel_packet(USBDevice *dev, USBPacket *p)
     fprintf(stderr, "%s\n", __func__);
 }
 
+static void utf16_to_str(uint8_t len, uint16_t *arr, char *name)
+{
+    int count;
+    wchar_t *wstr = g_new0(wchar_t, len);
+
+    for (count = 0; count < len; count++) {
+        wstr[count] = (wchar_t)arr[count];
+    }
+
+    wcstombs(name, wstr, len);
+    g_free(wstr);
+}
+
+static void usb_mtp_write_data(MTPState *s)
+{
+    MTPData *d = s->data_out;
+    MTPObject *parent =
+        usb_mtp_object_lookup(s, s->dataset.parent_handle);
+    char *path = NULL;
+    int rc = -1;
+    mode_t mask = 0644;
+
+    assert(d != NULL);
+
+    if (parent == NULL || !s->write_pending) {
+        usb_mtp_queue_result(s, RES_INVALID_OBJECTINFO, d->trans,
+                             0, 0, 0, 0);
+        return;
+    }
+
+    if (s->dataset.filename) {
+        path = g_strdup_printf("%s/%s", parent->path, s->dataset.filename);
+        if (s->dataset.format == FMT_ASSOCIATION) {
+            d->fd = mkdir(path, mask);
+            goto free;
+        }
+        if (s->dataset.size < d->length) {
+            usb_mtp_queue_result(s, RES_STORE_FULL, d->trans,
+                                 0, 0, 0, 0);
+            goto done;
+        }
+        d->fd = open(path, O_CREAT | O_WRONLY, mask);
+        if (d->fd == -1) {
+            usb_mtp_queue_result(s, RES_STORE_FULL, d->trans,
+                                 0, 0, 0, 0);
+            goto done;
+        }
+
+        /*
+         * Return success if initiator sent 0 sized data
+         */
+        if (!s->dataset.size) {
+            goto success;
+        }
+
+        rc = write(d->fd, d->data, s->dataset.size);
+        if (rc == -1) {
+            usb_mtp_queue_result(s, RES_STORE_FULL, d->trans,
+                                 0, 0, 0, 0);
+            goto done;
+            }
+        if (rc != s->dataset.size) {
+            usb_mtp_queue_result(s, RES_INCOMPLETE_TRANSFER, d->trans,
+                                 0, 0, 0, 0);
+            goto done;
+        }
+    }
+
+success:
+    usb_mtp_queue_result(s, RES_OK, d->trans,
+                         0, 0, 0, 0);
+
+done:
+    /*
+     * The write dataset is kept around and freed only
+     * on success or if another write request comes in
+     */
+    if (d->fd != -1) {
+        close(d->fd);
+    }
+free:
+    g_free(s->dataset.filename);
+    g_free(path);
+    s->write_pending = false;
+}
+
+static void usb_mtp_write_metadata(MTPState *s)
+{
+    MTPData *d = s->data_out;
+    ObjectInfo *dataset = (ObjectInfo *)d->data;
+    char *filename = g_new0(char, dataset->length);
+    MTPObject *o;
+    MTPObject *p = usb_mtp_object_lookup(s, s->dataset.parent_handle);
+    uint32_t next_handle = s->next_handle;
+
+    assert(!s->write_pending);
+
+    utf16_to_str(dataset->length, dataset->filename, filename);
+
+    o = usb_mtp_object_lookup_name(p, filename, dataset->length);
+    if (o != NULL) {
+        next_handle = o->handle;
+    }
+
+    s->dataset.filename = filename;
+    s->dataset.format = dataset->format;
+    s->dataset.size = dataset->size;
+    s->dataset.filename = filename;
+    s->write_pending = true;
+
+    if (s->dataset.format == FMT_ASSOCIATION) {
+        usb_mtp_write_data(s);
+        /* next_handle will be allocated to the newly created dir */
+        if (d->fd == -1) {
+            usb_mtp_queue_result(s, RES_STORE_FULL, d->trans,
+                                 0, 0, 0, 0);
+            return;
+        }
+        d->fd = -1;
+    }
+
+    usb_mtp_queue_result(s, RES_OK, d->trans, 3, QEMU_STORAGE_ID,
+                         s->dataset.parent_handle, next_handle);
+}
+
+static void usb_mtp_get_data(MTPState *s, mtp_container *container,
+                             USBPacket *p)
+{
+    MTPData *d = s->data_out;
+    uint64_t dlen;
+    uint32_t data_len = p->iov.size;
+
+    if (d->first) {
+        /* Total length of incoming data */
+        d->length = cpu_to_le32(container->length) - sizeof(mtp_container);
+        /* Length of data in this packet */
+        data_len -= sizeof(mtp_container);
+        usb_mtp_realloc(d, d->length);
+        d->offset = 0;
+        d->first = false;
+    }
+
+    if (d->length - d->offset > data_len) {
+        dlen = data_len;
+    } else {
+        dlen = d->length - d->offset;
+    }
+
+    switch (d->code) {
+    case CMD_SEND_OBJECT_INFO:
+        usb_packet_copy(p, d->data + d->offset, dlen);
+        d->offset += dlen;
+        if (d->offset == d->length) {
+            /* The operation might have already failed */
+            if (!s->result) {
+                usb_mtp_write_metadata(s);
+            }
+            usb_mtp_data_free(s->data_out);
+            s->data_out = NULL;
+            return;
+        }
+        break;
+    case CMD_SEND_OBJECT:
+        usb_packet_copy(p, d->data + d->offset, dlen);
+        d->offset += dlen;
+        if (d->offset == d->length) {
+            usb_mtp_write_data(s);
+            usb_mtp_data_free(s->data_out);
+            s->data_out = NULL;
+            return;
+        }
+        break;
+    default:
+        p->status = USB_RET_STALL;
+        return;
+    }
+}
+
 static void usb_mtp_handle_data(USBDevice *dev, USBPacket *p)
 {
     MTPState *s = USB_MTP(dev);
     MTPControl cmd;
     mtp_container container;
     uint32_t params[5];
+    uint16_t container_type;
     int i, rc;
 
     switch (p->ep->nr) {
@@ -1446,8 +1838,13 @@ static void usb_mtp_handle_data(USBDevice *dev, USBPacket *p)
             p->status = USB_RET_STALL;
             return;
         }
-        usb_packet_copy(p, &container, sizeof(container));
-        switch (le16_to_cpu(container.type)) {
+        if (s->data_out && !s->data_out->first) {
+            container_type = TYPE_DATA;
+        } else {
+            usb_packet_copy(p, &container, sizeof(container));
+            container_type = le16_to_cpu(container.type);
+        }
+        switch (container_type) {
         case TYPE_COMMAND:
             if (s->data_in || s->data_out || s->result) {
                 trace_usb_mtp_stall(s->dev.addr, "transaction inflight");
@@ -1478,6 +1875,15 @@ static void usb_mtp_handle_data(USBDevice *dev, USBPacket *p)
                                   (cmd.argc > 4) ? cmd.argv[4] : 0);
             usb_mtp_command(s, &cmd);
             break;
+        case TYPE_DATA:
+            /* One of the previous transfers has already errored but the
+             * responder is still sending data associated with it
+             */
+            if (s->result != NULL) {
+                return;
+            }
+            usb_mtp_get_data(s, &container, p);
+            break;
         default:
             /* not needed as long as the mtp device is read-only */
             p->status = USB_RET_STALL;
@@ -1542,6 +1948,10 @@ static void usb_mtp_realize(USBDevice *dev, Error **errp)
             return;
         }
         s->desc = strrchr(s->root, '/');
+        /* Mark store as RW */
+        if (!s->readonly) {
+            s->flags |= (1 << MTP_FLAG_WRITABLE);
+        }
         if (s->desc && s->desc[0]) {
             s->desc = g_strdup(s->desc + 1);
         } else {
@@ -1564,6 +1974,7 @@ static const VMStateDescription vmstate_usb_mtp = {
 static Property mtp_properties[] = {
     DEFINE_PROP_STRING("x-root", MTPState, root),
     DEFINE_PROP_STRING("desc", MTPState, desc),
+    DEFINE_PROP_BOOL("readonly", MTPState, readonly, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/elf.h b/include/elf.h
index e8a515ce3d..943ee21171 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -33,6 +33,9 @@ typedef int64_t  Elf64_Sxword;
 
 /* Flags in the e_flags field of the header */
 /* MIPS architecture level. */
+#define EF_MIPS_ARCH            0xf0000000
+
+/* Legal values for MIPS architecture level.  */
 #define EF_MIPS_ARCH_1		0x00000000	/* -mips1 code.  */
 #define EF_MIPS_ARCH_2		0x10000000	/* -mips2 code.  */
 #define EF_MIPS_ARCH_3		0x20000000	/* -mips3 code.  */
@@ -40,6 +43,10 @@ typedef int64_t  Elf64_Sxword;
 #define EF_MIPS_ARCH_5		0x40000000	/* -mips5 code.  */
 #define EF_MIPS_ARCH_32		0x50000000	/* MIPS32 code.  */
 #define EF_MIPS_ARCH_64		0x60000000	/* MIPS64 code.  */
+#define EF_MIPS_ARCH_32R2       0x70000000      /* MIPS32r2 code.  */
+#define EF_MIPS_ARCH_64R2       0x80000000      /* MIPS64r2 code.  */
+#define EF_MIPS_ARCH_32R6       0x90000000      /* MIPS32r6 code.  */
+#define EF_MIPS_ARCH_64R6       0xa0000000      /* MIPS64r6 code.  */
 
 /* The ABI of a file. */
 #define EF_MIPS_ABI_O32		0x00001000	/* O32 ABI.  */
@@ -537,6 +544,34 @@ typedef struct {
 #define HWCAP_S390_HIGH_GPRS    512
 #define HWCAP_S390_TE           1024
 
+/* M68K specific definitions. */
+/* We use the top 24 bits to encode information about the
+   architecture variant.  */
+#define EF_M68K_CPU32    0x00810000
+#define EF_M68K_M68000   0x01000000
+#define EF_M68K_CFV4E    0x00008000
+#define EF_M68K_FIDO     0x02000000
+#define EF_M68K_ARCH_MASK                                               \
+  (EF_M68K_M68000 | EF_M68K_CPU32 | EF_M68K_CFV4E | EF_M68K_FIDO)
+
+/* We use the bottom 8 bits to encode information about the
+   coldfire variant.  If we use any of these bits, the top 24 bits are
+   either 0 or EF_M68K_CFV4E.  */
+#define EF_M68K_CF_ISA_MASK     0x0F  /* Which ISA */
+#define EF_M68K_CF_ISA_A_NODIV  0x01  /* ISA A except for div */
+#define EF_M68K_CF_ISA_A        0x02
+#define EF_M68K_CF_ISA_A_PLUS   0x03
+#define EF_M68K_CF_ISA_B_NOUSP  0x04  /* ISA_B except for USP */
+#define EF_M68K_CF_ISA_B        0x05
+#define EF_M68K_CF_ISA_C        0x06
+#define EF_M68K_CF_ISA_C_NODIV  0x07  /* ISA C except for div */
+#define EF_M68K_CF_MAC_MASK     0x30
+#define EF_M68K_CF_MAC          0x10  /* MAC */
+#define EF_M68K_CF_EMAC         0x20  /* EMAC */
+#define EF_M68K_CF_EMAC_B       0x30  /* EMAC_B */
+#define EF_M68K_CF_FLOAT        0x40  /* Has float insns */
+#define EF_M68K_CF_MASK         0xFF
+
 /*
  * 68k ELF relocation types
  */
diff --git a/include/fpu/softfloat-types.h b/include/fpu/softfloat-types.h
new file mode 100644
index 0000000000..4e378cb612
--- /dev/null
+++ b/include/fpu/softfloat-types.h
@@ -0,0 +1,179 @@
+/*
+ * QEMU float support
+ *
+ * The code in this source file is derived from release 2a of the SoftFloat
+ * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
+ * some later contributions) are provided under that license, as detailed below.
+ * It has subsequently been modified by contributors to the QEMU Project,
+ * so some portions are provided under:
+ *  the SoftFloat-2a license
+ *  the BSD license
+ *  GPL-v2-or-later
+ *
+ * This header holds definitions for code that might be dealing with
+ * softfloat types but not need access to the actual library functions.
+ */
+/*
+===============================================================================
+This C header file is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2a.
+
+Written by John R. Hauser.  This work was made possible in part by the
+International Computer Science Institute, located at Suite 600, 1947 Center
+Street, Berkeley, California 94704.  Funding was partially provided by the
+National Science Foundation under grant MIP-9311980.  The original version
+of this code was written as part of a project to build a fixed-point vector
+processor in collaboration with the University of California at Berkeley,
+overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
+is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
+arithmetic/SoftFloat.html'.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
+
+Derivative works are acceptable, even for commercial purposes, so long as
+(1) they include prominent notice that the work is derivative, and (2) they
+include prominent notice akin to these four paragraphs for those parts of
+this code that are retained.
+
+===============================================================================
+*/
+
+/* BSD licensing:
+ * Copyright (c) 2006, Fabrice Bellard
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef SOFTFLOAT_TYPES_H
+#define SOFTFLOAT_TYPES_H
+
+/* This 'flag' type must be able to hold at least 0 and 1. It should
+ * probably be replaced with 'bool' but the uses would need to be audited
+ * to check that they weren't accidentally relying on it being a larger type.
+ */
+typedef uint8_t flag;
+
+/*
+ * Software IEC/IEEE floating-point types.
+ */
+
+typedef uint16_t float16;
+typedef uint32_t float32;
+typedef uint64_t float64;
+#define float16_val(x) (x)
+#define float32_val(x) (x)
+#define float64_val(x) (x)
+#define make_float16(x) (x)
+#define make_float32(x) (x)
+#define make_float64(x) (x)
+#define const_float16(x) (x)
+#define const_float32(x) (x)
+#define const_float64(x) (x)
+typedef struct {
+    uint64_t low;
+    uint16_t high;
+} floatx80;
+#define make_floatx80(exp, mant) ((floatx80) { mant, exp })
+#define make_floatx80_init(exp, mant) { .low = mant, .high = exp }
+typedef struct {
+#ifdef HOST_WORDS_BIGENDIAN
+    uint64_t high, low;
+#else
+    uint64_t low, high;
+#endif
+} float128;
+#define make_float128(high_, low_) ((float128) { .high = high_, .low = low_ })
+#define make_float128_init(high_, low_) { .high = high_, .low = low_ }
+
+/*
+ * Software IEC/IEEE floating-point underflow tininess-detection mode.
+ */
+
+enum {
+    float_tininess_after_rounding  = 0,
+    float_tininess_before_rounding = 1
+};
+
+/*
+ *Software IEC/IEEE floating-point rounding mode.
+ */
+
+enum {
+    float_round_nearest_even = 0,
+    float_round_down         = 1,
+    float_round_up           = 2,
+    float_round_to_zero      = 3,
+    float_round_ties_away    = 4,
+    /* Not an IEEE rounding mode: round to the closest odd mantissa value */
+    float_round_to_odd       = 5,
+};
+
+/*
+ * Software IEC/IEEE floating-point exception flags.
+ */
+
+enum {
+    float_flag_invalid   =  1,
+    float_flag_divbyzero =  4,
+    float_flag_overflow  =  8,
+    float_flag_underflow = 16,
+    float_flag_inexact   = 32,
+    float_flag_input_denormal = 64,
+    float_flag_output_denormal = 128
+};
+
+
+/*
+ * Floating Point Status. Individual architectures may maintain
+ * several versions of float_status for different functions. The
+ * correct status for the operation is then passed by reference to
+ * most of the softfloat functions.
+ */
+
+typedef struct float_status {
+    signed char float_detect_tininess;
+    signed char float_rounding_mode;
+    uint8_t     float_exception_flags;
+    signed char floatx80_rounding_precision;
+    /* should denormalised results go to zero and set the inexact flag? */
+    flag flush_to_zero;
+    /* should denormalised inputs go to zero and set the input_denormal flag? */
+    flag flush_inputs_to_zero;
+    flag default_nan_mode;
+    flag snan_bit_is_one;
+} float_status;
+
+#endif /* SOFTFLOAT_TYPES_H */
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 0f96a0edd1..9b7b5e34e2 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -82,12 +82,6 @@ this code that are retained.
 #ifndef SOFTFLOAT_H
 #define SOFTFLOAT_H
 
-/* This 'flag' type must be able to hold at least 0 and 1. It should
- * probably be replaced with 'bool' but the uses would need to be audited
- * to check that they weren't accidentally relying on it being a larger type.
- */
-typedef uint8_t flag;
-
 #define LIT64( a ) a##LL
 
 /*----------------------------------------------------------------------------
@@ -100,110 +94,7 @@ enum {
     float_relation_unordered =  2
 };
 
-/*----------------------------------------------------------------------------
-| Software IEC/IEEE floating-point types.
-*----------------------------------------------------------------------------*/
-/* Use structures for soft-float types.  This prevents accidentally mixing
-   them with native int/float types.  A sufficiently clever compiler and
-   sane ABI should be able to see though these structs.  However
-   x86/gcc 3.x seems to struggle a bit, so leave them disabled by default.  */
-//#define USE_SOFTFLOAT_STRUCT_TYPES
-#ifdef USE_SOFTFLOAT_STRUCT_TYPES
-typedef struct {
-    uint16_t v;
-} float16;
-#define float16_val(x) (((float16)(x)).v)
-#define make_float16(x) __extension__ ({ float16 f16_val = {x}; f16_val; })
-#define const_float16(x) { x }
-typedef struct {
-    uint32_t v;
-} float32;
-/* The cast ensures an error if the wrong type is passed.  */
-#define float32_val(x) (((float32)(x)).v)
-#define make_float32(x) __extension__ ({ float32 f32_val = {x}; f32_val; })
-#define const_float32(x) { x }
-typedef struct {
-    uint64_t v;
-} float64;
-#define float64_val(x) (((float64)(x)).v)
-#define make_float64(x) __extension__ ({ float64 f64_val = {x}; f64_val; })
-#define const_float64(x) { x }
-#else
-typedef uint16_t float16;
-typedef uint32_t float32;
-typedef uint64_t float64;
-#define float16_val(x) (x)
-#define float32_val(x) (x)
-#define float64_val(x) (x)
-#define make_float16(x) (x)
-#define make_float32(x) (x)
-#define make_float64(x) (x)
-#define const_float16(x) (x)
-#define const_float32(x) (x)
-#define const_float64(x) (x)
-#endif
-typedef struct {
-    uint64_t low;
-    uint16_t high;
-} floatx80;
-#define make_floatx80(exp, mant) ((floatx80) { mant, exp })
-#define make_floatx80_init(exp, mant) { .low = mant, .high = exp }
-typedef struct {
-#ifdef HOST_WORDS_BIGENDIAN
-    uint64_t high, low;
-#else
-    uint64_t low, high;
-#endif
-} float128;
-#define make_float128(high_, low_) ((float128) { .high = high_, .low = low_ })
-#define make_float128_init(high_, low_) { .high = high_, .low = low_ }
-
-/*----------------------------------------------------------------------------
-| Software IEC/IEEE floating-point underflow tininess-detection mode.
-*----------------------------------------------------------------------------*/
-enum {
-    float_tininess_after_rounding  = 0,
-    float_tininess_before_rounding = 1
-};
-
-/*----------------------------------------------------------------------------
-| Software IEC/IEEE floating-point rounding mode.
-*----------------------------------------------------------------------------*/
-enum {
-    float_round_nearest_even = 0,
-    float_round_down         = 1,
-    float_round_up           = 2,
-    float_round_to_zero      = 3,
-    float_round_ties_away    = 4,
-    /* Not an IEEE rounding mode: round to the closest odd mantissa value */
-    float_round_to_odd       = 5,
-};
-
-/*----------------------------------------------------------------------------
-| Software IEC/IEEE floating-point exception flags.
-*----------------------------------------------------------------------------*/
-enum {
-    float_flag_invalid   =  1,
-    float_flag_divbyzero =  4,
-    float_flag_overflow  =  8,
-    float_flag_underflow = 16,
-    float_flag_inexact   = 32,
-    float_flag_input_denormal = 64,
-    float_flag_output_denormal = 128
-};
-
-typedef struct float_status {
-    signed char float_detect_tininess;
-    signed char float_rounding_mode;
-    uint8_t     float_exception_flags;
-    signed char floatx80_rounding_precision;
-    /* should denormalised results go to zero and set the inexact flag? */
-    flag flush_to_zero;
-    /* should denormalised inputs go to zero and set the input_denormal flag? */
-    flag flush_inputs_to_zero;
-    flag default_nan_mode;
-    flag snan_bit_is_one;
-} float_status;
+#include "fpu/softfloat-types.h"
 
 static inline void set_float_detect_tininess(int val, float_status *status)
 {
@@ -277,6 +168,7 @@ void float_raise(uint8_t flags, float_status *status);
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
 *----------------------------------------------------------------------------*/
+float16 float16_squash_input_denormal(float16 a, float_status *status);
 float32 float32_squash_input_denormal(float32 a, float_status *status);
 float64 float64_squash_input_denormal(float64 a, float_status *status);
 
@@ -298,9 +190,13 @@ enum {
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE integer-to-floating-point conversion routines.
 *----------------------------------------------------------------------------*/
+float32 int16_to_float32(int16_t, float_status *status);
 float32 int32_to_float32(int32_t, float_status *status);
+float64 int16_to_float64(int16_t, float_status *status);
 float64 int32_to_float64(int32_t, float_status *status);
+float32 uint16_to_float32(uint16_t, float_status *status);
 float32 uint32_to_float32(uint32_t, float_status *status);
+float64 uint16_to_float64(uint16_t, float_status *status);
 float64 uint32_to_float64(uint32_t, float_status *status);
 floatx80 int32_to_floatx80(int32_t, float_status *status);
 float128 int32_to_float128(int32_t, float_status *status);
@@ -312,27 +208,6 @@ float32 uint64_to_float32(uint64_t, float_status *status);
 float64 uint64_to_float64(uint64_t, float_status *status);
 float128 uint64_to_float128(uint64_t, float_status *status);
 
-/* We provide the int16 versions for symmetry of API with float-to-int */
-static inline float32 int16_to_float32(int16_t v, float_status *status)
-{
-    return int32_to_float32(v, status);
-}
-
-static inline float32 uint16_to_float32(uint16_t v, float_status *status)
-{
-    return uint32_to_float32(v, status);
-}
-
-static inline float64 int16_to_float64(int16_t v, float_status *status)
-{
-    return int32_to_float64(v, status);
-}
-
-static inline float64 uint16_to_float64(uint16_t v, float_status *status)
-{
-    return uint32_to_float64(v, status);
-}
-
 /*----------------------------------------------------------------------------
 | Software half-precision conversion routines.
 *----------------------------------------------------------------------------*/
@@ -340,10 +215,46 @@ float16 float32_to_float16(float32, flag, float_status *status);
 float32 float16_to_float32(float16, flag, float_status *status);
 float16 float64_to_float16(float64 a, flag ieee, float_status *status);
 float64 float16_to_float64(float16 a, flag ieee, float_status *status);
+int16_t float16_to_int16(float16, float_status *status);
+uint16_t float16_to_uint16(float16 a, float_status *status);
+int16_t float16_to_int16_round_to_zero(float16, float_status *status);
+uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *status);
+int32_t float16_to_int32(float16, float_status *status);
+uint32_t float16_to_uint32(float16 a, float_status *status);
+int32_t float16_to_int32_round_to_zero(float16, float_status *status);
+uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *status);
+int64_t float16_to_int64(float16, float_status *status);
+uint64_t float16_to_uint64(float16 a, float_status *status);
+int64_t float16_to_int64_round_to_zero(float16, float_status *status);
+uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *status);
+float16 int16_to_float16(int16_t a, float_status *status);
+float16 int32_to_float16(int32_t a, float_status *status);
+float16 int64_to_float16(int64_t a, float_status *status);
+float16 uint16_to_float16(uint16_t a, float_status *status);
+float16 uint32_to_float16(uint32_t a, float_status *status);
+float16 uint64_to_float16(uint64_t a, float_status *status);
 
 /*----------------------------------------------------------------------------
 | Software half-precision operations.
 *----------------------------------------------------------------------------*/
+
+float16 float16_round_to_int(float16, float_status *status);
+float16 float16_add(float16, float16, float_status *status);
+float16 float16_sub(float16, float16, float_status *status);
+float16 float16_mul(float16, float16, float_status *status);
+float16 float16_muladd(float16, float16, float16, int, float_status *status);
+float16 float16_div(float16, float16, float_status *status);
+float16 float16_scalbn(float16, int, float_status *status);
+float16 float16_min(float16, float16, float_status *status);
+float16 float16_max(float16, float16, float_status *status);
+float16 float16_minnum(float16, float16, float_status *status);
+float16 float16_maxnum(float16, float16, float_status *status);
+float16 float16_minnummag(float16, float16, float_status *status);
+float16 float16_maxnummag(float16, float16, float_status *status);
+float16 float16_sqrt(float16, float_status *status);
+int float16_compare(float16, float16, float_status *status);
+int float16_compare_quiet(float16, float16, float_status *status);
+
 int float16_is_quiet_nan(float16, float_status *status);
 int float16_is_signaling_nan(float16, float_status *status);
 float16 float16_maybe_silence_nan(float16, float_status *status);
@@ -373,6 +284,32 @@ static inline int float16_is_zero_or_denormal(float16 a)
     return (float16_val(a) & 0x7c00) == 0;
 }
 
+static inline float16 float16_abs(float16 a)
+{
+    /* Note that abs does *not* handle NaN specially, nor does
+     * it flush denormal inputs to zero.
+     */
+    return make_float16(float16_val(a) & 0x7fff);
+}
+
+static inline float16 float16_chs(float16 a)
+{
+    /* Note that chs does *not* handle NaN specially, nor does
+     * it flush denormal inputs to zero.
+     */
+    return make_float16(float16_val(a) ^ 0x8000);
+}
+
+static inline float16 float16_set_sign(float16 a, int sign)
+{
+    return make_float16((float16_val(a) & 0x7fff) | (sign << 15));
+}
+
+#define float16_zero make_float16(0)
+#define float16_one make_float16(0x3c00)
+#define float16_half make_float16(0x3800)
+#define float16_infinity make_float16(0x7c00)
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated half-precision NaN.
 *----------------------------------------------------------------------------*/
@@ -479,8 +416,6 @@ static inline float32 float32_set_sign(float32 a, int sign)
 
 #define float32_zero make_float32(0)
 #define float32_one make_float32(0x3f800000)
-#define float32_ln2 make_float32(0x3f317218)
-#define float32_pi make_float32(0x40490fdb)
 #define float32_half make_float32(0x3f000000)
 #define float32_infinity make_float32(0x7f800000)
 
@@ -593,7 +528,6 @@ static inline float64 float64_set_sign(float64 a, int sign)
 #define float64_zero make_float64(0)
 #define float64_one make_float64(0x3ff0000000000000LL)
 #define float64_ln2 make_float64(0x3fe62e42fefa39efLL)
-#define float64_pi make_float64(0x400921fb54442d18LL)
 #define float64_half make_float64(0x3fe0000000000000LL)
 #define float64_infinity make_float64(0x7ff0000000000000LL)
 
diff --git a/include/hw/char/stm32f2xx_usart.h b/include/hw/char/stm32f2xx_usart.h
index 9d03a7527c..84c4029777 100644
--- a/include/hw/char/stm32f2xx_usart.h
+++ b/include/hw/char/stm32f2xx_usart.h
@@ -37,7 +37,12 @@
 #define USART_CR3  0x14
 #define USART_GTPR 0x18
 
-#define USART_SR_RESET 0x00C00000
+/*
+ * NB: The reset value mentioned in "24.6.1 Status register" seems bogus.
+ * Looking at "Table 98 USART register map and reset values", it seems it
+ * should be 0xc0, and that's how real hardware behaves.
+ */
+#define USART_SR_RESET (USART_SR_TXE | USART_SR_TC)
 
 #define USART_SR_TXE  (1 << 7)
 #define USART_SR_TC   (1 << 6)
diff --git a/include/hw/sd/sd.h b/include/hw/sd/sd.h
index bf1eb0713c..9bdb3c9285 100644
--- a/include/hw/sd/sd.h
+++ b/include/hw/sd/sd.h
@@ -53,7 +53,6 @@
 #define READY_FOR_DATA		(1 << 8)
 #define APP_CMD			(1 << 5)
 #define AKE_SEQ_ERROR		(1 << 3)
-#define OCR_CCS_BITN        30
 
 typedef enum {
     SD_VOLTAGE_0_4V     = 400,  /* currently not supported */
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index 83f474ffc3..22ac3c2d0e 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -171,5 +171,5 @@ void virtio_gpu_virgl_fence_poll(VirtIOGPU *g);
 void virtio_gpu_virgl_reset(VirtIOGPU *g);
 void virtio_gpu_gl_block(void *opaque, bool block);
 int virtio_gpu_virgl_init(VirtIOGPU *g);
-
+int virtio_gpu_virgl_get_num_capsets(VirtIOGPU *g);
 #endif
diff --git a/include/qemu/bswap.h b/include/qemu/bswap.h
index 09c78fd28a..3f28f661b1 100644
--- a/include/qemu/bswap.h
+++ b/include/qemu/bswap.h
@@ -1,7 +1,7 @@
 #ifndef BSWAP_H
 #define BSWAP_H
 
-#include "fpu/softfloat.h"
+#include "fpu/softfloat-types.h"
 
 #ifdef CONFIG_MACHINE_BSWAP_H
 # include <sys/endian.h>
diff --git a/include/standard-headers/linux/virtio_gpu.h b/include/standard-headers/linux/virtio_gpu.h
index c1c8f0751d..52a830dcf8 100644
--- a/include/standard-headers/linux/virtio_gpu.h
+++ b/include/standard-headers/linux/virtio_gpu.h
@@ -260,6 +260,7 @@ struct virtio_gpu_cmd_submit {
 };
 
 #define VIRTIO_GPU_CAPSET_VIRGL 1
+#define VIRTIO_GPU_CAPSET_VIRGL2 2
 
 /* VIRTIO_GPU_CMD_GET_CAPSET_INFO */
 struct virtio_gpu_get_capset_info {
diff --git a/include/ui/console.h b/include/ui/console.h
index e6b1227bef..f29bacd625 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -230,8 +230,10 @@ typedef struct DisplayChangeListenerOps {
     void (*dpy_gl_scanout_dmabuf)(DisplayChangeListener *dcl,
                                   QemuDmaBuf *dmabuf);
     void (*dpy_gl_cursor_dmabuf)(DisplayChangeListener *dcl,
-                                 QemuDmaBuf *dmabuf,
-                                 uint32_t pos_x, uint32_t pos_y);
+                                 QemuDmaBuf *dmabuf, bool have_hot,
+                                 uint32_t hot_x, uint32_t hot_y);
+    void (*dpy_gl_cursor_position)(DisplayChangeListener *dcl,
+                                   uint32_t pos_x, uint32_t pos_y);
     void (*dpy_gl_release_dmabuf)(DisplayChangeListener *dcl,
                                   QemuDmaBuf *dmabuf);
     void (*dpy_gl_update)(DisplayChangeListener *dcl,
@@ -304,9 +306,10 @@ void dpy_gl_scanout_texture(QemuConsole *con,
                             uint32_t x, uint32_t y, uint32_t w, uint32_t h);
 void dpy_gl_scanout_dmabuf(QemuConsole *con,
                            QemuDmaBuf *dmabuf);
-void dpy_gl_cursor_dmabuf(QemuConsole *con,
-                          QemuDmaBuf *dmabuf,
-                          uint32_t pos_x, uint32_t pos_y);
+void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf *dmabuf,
+                          bool have_hot, uint32_t hot_x, uint32_t hot_y);
+void dpy_gl_cursor_position(QemuConsole *con,
+                            uint32_t pos_x, uint32_t pos_y);
 void dpy_gl_release_dmabuf(QemuConsole *con,
                            QemuDmaBuf *dmabuf);
 void dpy_gl_update(QemuConsole *con,
diff --git a/linux-user/aarch64/target_elf.h b/linux-user/aarch64/target_elf.h
new file mode 100644
index 0000000000..a7eb962fba
--- /dev/null
+++ b/linux-user/aarch64/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef AARCH64_TARGET_ELF_H
+#define AARCH64_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/alpha/target_elf.h b/linux-user/alpha/target_elf.h
new file mode 100644
index 0000000000..344e9f4d39
--- /dev/null
+++ b/linux-user/alpha/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef ALPHA_TARGET_ELF_H
+#define ALPHA_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/arm/target_elf.h b/linux-user/arm/target_elf.h
new file mode 100644
index 0000000000..58ff6a0986
--- /dev/null
+++ b/linux-user/arm/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef ARM_TARGET_ELF_H
+#define ARM_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/cris/target_elf.h b/linux-user/cris/target_elf.h
new file mode 100644
index 0000000000..99eb4ec704
--- /dev/null
+++ b/linux-user/cris/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef CRIS_TARGET_ELF_H
+#define CRIS_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 8bb9a2c3e8..0208022445 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2396,6 +2396,41 @@ give_up:
     g_free(syms);
 }
 
+uint32_t get_elf_eflags(int fd)
+{
+    struct elfhdr ehdr;
+    off_t offset;
+    int ret;
+
+    /* Read ELF header */
+    offset = lseek(fd, 0, SEEK_SET);
+    if (offset == (off_t) -1) {
+        return 0;
+    }
+    ret = read(fd, &ehdr, sizeof(ehdr));
+    if (ret < sizeof(ehdr)) {
+        return 0;
+    }
+    offset = lseek(fd, offset, SEEK_SET);
+    if (offset == (off_t) -1) {
+        return 0;
+    }
+
+    /* Check ELF signature */
+    if (!elf_check_ident(&ehdr)) {
+        return 0;
+    }
+
+    /* check header */
+    bswap_ehdr(&ehdr);
+    if (!elf_check_ehdr(&ehdr)) {
+        return 0;
+    }
+
+    /* return architecture id */
+    return ehdr.e_flags;
+}
+
 int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
 {
     struct image_info interp_info;
diff --git a/linux-user/hppa/target_elf.h b/linux-user/hppa/target_elf.h
new file mode 100644
index 0000000000..82b4e9535e
--- /dev/null
+++ b/linux-user/hppa/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef HPPA_TARGET_ELF_H
+#define HPPA_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/i386/target_elf.h b/linux-user/i386/target_elf.h
new file mode 100644
index 0000000000..1c6142e7da
--- /dev/null
+++ b/linux-user/i386/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef I386_TARGET_ELF_H
+#define I386_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "qemu32";
+}
+#endif
diff --git a/linux-user/m68k/target_elf.h b/linux-user/m68k/target_elf.h
new file mode 100644
index 0000000000..998fe0fe2f
--- /dev/null
+++ b/linux-user/m68k/target_elf.h
@@ -0,0 +1,20 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef M68K_TARGET_ELF_H
+#define M68K_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    if (eflags == 0 || (eflags & EF_M68K_M68000)) {
+        /* 680x0 */
+        return "m68040";
+    }
+
+    /* Coldfire */
+    return "any";
+}
+#endif
diff --git a/linux-user/main.c b/linux-user/main.c
index fd7900628b..bbeb78fb89 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -35,6 +35,7 @@
 #include "elf.h"
 #include "exec/log.h"
 #include "trace/control.h"
+#include "target_elf.h"
 
 char *exec_path;
 
@@ -4343,46 +4344,17 @@ int main(int argc, char **argv, char **envp)
 
     init_qemu_uname_release();
 
+    execfd = qemu_getauxval(AT_EXECFD);
+    if (execfd == 0) {
+        execfd = open(filename, O_RDONLY);
+        if (execfd < 0) {
+            printf("Error while loading %s: %s\n", filename, strerror(errno));
+            _exit(EXIT_FAILURE);
+        }
+    }
+
     if (cpu_model == NULL) {
-#if defined(TARGET_I386)
-#ifdef TARGET_X86_64
-        cpu_model = "qemu64";
-#else
-        cpu_model = "qemu32";
-#endif
-#elif defined(TARGET_ARM)
-        cpu_model = "any";
-#elif defined(TARGET_UNICORE32)
-        cpu_model = "any";
-#elif defined(TARGET_M68K)
-        cpu_model = "any";
-#elif defined(TARGET_SPARC)
-#ifdef TARGET_SPARC64
-        cpu_model = "TI UltraSparc II";
-#else
-        cpu_model = "Fujitsu MB86904";
-#endif
-#elif defined(TARGET_MIPS)
-#if defined(TARGET_ABI_MIPSN32) || defined(TARGET_ABI_MIPSN64)
-        cpu_model = "5KEf";
-#else
-        cpu_model = "24Kf";
-#endif
-#elif defined TARGET_OPENRISC
-        cpu_model = "or1200";
-#elif defined(TARGET_PPC)
-# ifdef TARGET_PPC64
-        cpu_model = "POWER8";
-# else
-        cpu_model = "750";
-# endif
-#elif defined TARGET_SH4
-        cpu_model = "sh7785";
-#elif defined TARGET_S390X
-        cpu_model = "qemu";
-#else
-        cpu_model = "any";
-#endif
+        cpu_model = cpu_get_model(get_elf_eflags(execfd));
     }
     tcg_exec_init(0);
     /* NOTE: we need to init the CPU at this stage to get
@@ -4475,15 +4447,6 @@ int main(int argc, char **argv, char **envp)
     cpu->opaque = ts;
     task_settid(ts);
 
-    execfd = qemu_getauxval(AT_EXECFD);
-    if (execfd == 0) {
-        execfd = open(filename, O_RDONLY);
-        if (execfd < 0) {
-            printf("Error while loading %s: %s\n", filename, strerror(errno));
-            _exit(EXIT_FAILURE);
-        }
-    }
-
     ret = loader_exec(execfd, filename, target_argv, target_environ, regs,
         info, &bprm);
     if (ret != 0) {
diff --git a/linux-user/microblaze/target_elf.h b/linux-user/microblaze/target_elf.h
new file mode 100644
index 0000000000..8a8f1debff
--- /dev/null
+++ b/linux-user/microblaze/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef MICROBLAZE_TARGET_ELF_H
+#define MICROBLAZE_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/mips/target_elf.h b/linux-user/mips/target_elf.h
new file mode 100644
index 0000000000..fa5d30bf99
--- /dev/null
+++ b/linux-user/mips/target_elf.h
@@ -0,0 +1,17 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef MIPS_TARGET_ELF_H
+#define MIPS_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    if ((eflags & EF_MIPS_ARCH) == EF_MIPS_ARCH_32R6) {
+        return "mips32r6-generic";
+    }
+    return "24Kf";
+}
+#endif
diff --git a/linux-user/mips64/target_elf.h b/linux-user/mips64/target_elf.h
new file mode 100644
index 0000000000..ec55d8542a
--- /dev/null
+++ b/linux-user/mips64/target_elf.h
@@ -0,0 +1,17 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef MIPS64_TARGET_ELF_H
+#define MIPS64_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    if ((eflags & EF_MIPS_ARCH) == EF_MIPS_ARCH_64R6) {
+        return "I6400";
+    }
+    return "5KEf";
+}
+#endif
diff --git a/linux-user/nios2/target_elf.h b/linux-user/nios2/target_elf.h
new file mode 100644
index 0000000000..801e20afaf
--- /dev/null
+++ b/linux-user/nios2/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef NIOS2_TARGET_ELF_H
+#define NIOS2_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/openrisc/target_elf.h b/linux-user/openrisc/target_elf.h
new file mode 100644
index 0000000000..40ceb025c9
--- /dev/null
+++ b/linux-user/openrisc/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef OPENRISC_TARGET_ELF_H
+#define OPENRISC_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "or1200";
+}
+#endif
diff --git a/linux-user/ppc/target_elf.h b/linux-user/ppc/target_elf.h
new file mode 100644
index 0000000000..576a5b9959
--- /dev/null
+++ b/linux-user/ppc/target_elf.h
@@ -0,0 +1,18 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef PPC_TARGET_ELF_H
+#define PPC_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+#ifdef TARGET_PPC64
+    return "POWER8";
+#else
+    return "750";
+#endif
+}
+#endif
diff --git a/linux-user/qemu.h b/linux-user/qemu.h
index bc4bf35036..f4b4ca72ad 100644
--- a/linux-user/qemu.h
+++ b/linux-user/qemu.h
@@ -186,6 +186,7 @@ int loader_exec(int fdexec, const char *filename, char **argv, char **envp,
              struct target_pt_regs * regs, struct image_info *infop,
              struct linux_binprm *);
 
+uint32_t get_elf_eflags(int fd);
 int load_elf_binary(struct linux_binprm *bprm, struct image_info *info);
 int load_flt_binary(struct linux_binprm *bprm, struct image_info *info);
 
diff --git a/linux-user/s390x/target_elf.h b/linux-user/s390x/target_elf.h
new file mode 100644
index 0000000000..8114b59c1d
--- /dev/null
+++ b/linux-user/s390x/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef S390X_TARGET_ELF_H
+#define S390X_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "qemu";
+}
+#endif
diff --git a/linux-user/sh4/target_elf.h b/linux-user/sh4/target_elf.h
new file mode 100644
index 0000000000..f485e0cef2
--- /dev/null
+++ b/linux-user/sh4/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef SH4_TARGET_ELF_H
+#define SH4_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "sh7785";
+}
+#endif
diff --git a/linux-user/sparc/target_elf.h b/linux-user/sparc/target_elf.h
new file mode 100644
index 0000000000..a510ceb612
--- /dev/null
+++ b/linux-user/sparc/target_elf.h
@@ -0,0 +1,18 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef SPARC_TARGET_ELF_H
+#define SPARC_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+#ifdef TARGET_SPARC64
+    return "TI UltraSparc II";
+#else
+    return "Fujitsu MB86904";
+#endif
+}
+#endif
diff --git a/linux-user/sparc64/target_elf.h b/linux-user/sparc64/target_elf.h
new file mode 100644
index 0000000000..d6e388f1cf
--- /dev/null
+++ b/linux-user/sparc64/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef SPARC64_TARGET_ELF_H
+#define SPARC64_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "TI UltraSparc II";
+}
+#endif
diff --git a/linux-user/tilegx/target_elf.h b/linux-user/tilegx/target_elf.h
new file mode 100644
index 0000000000..7197bb0005
--- /dev/null
+++ b/linux-user/tilegx/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef TILEGX_TARGET_ELF_H
+#define TILEGX_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/unicore32/target_elf.h b/linux-user/unicore32/target_elf.h
new file mode 100644
index 0000000000..e2bfcb2ca3
--- /dev/null
+++ b/linux-user/unicore32/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef UNICORE32_TARGET_ELF_H
+#define UNICORE32_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "any";
+}
+#endif
diff --git a/linux-user/x86_64/target_elf.h b/linux-user/x86_64/target_elf.h
new file mode 100644
index 0000000000..7b76a90de8
--- /dev/null
+++ b/linux-user/x86_64/target_elf.h
@@ -0,0 +1,14 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation, or (at your option) any
+ * later version. See the COPYING file in the top-level directory.
+ */
+
+#ifndef X86_64_TARGET_ELF_H
+#define X86_64_TARGET_ELF_H
+static inline const char *cpu_get_model(uint32_t eflags)
+{
+    return "qemu64";
+}
+#endif
diff --git a/pc-bios/openbios-ppc b/pc-bios/openbios-ppc
index 3596529734..af70c365ee 100644
--- a/pc-bios/openbios-ppc
+++ b/pc-bios/openbios-ppc
Binary files differdiff --git a/pc-bios/openbios-sparc32 b/pc-bios/openbios-sparc32
index 7b5dc083aa..9e736e8838 100644
--- a/pc-bios/openbios-sparc32
+++ b/pc-bios/openbios-sparc32
Binary files differdiff --git a/pc-bios/openbios-sparc64 b/pc-bios/openbios-sparc64
index a4a3b8d3ec..82ea0f8be6 100644
--- a/pc-bios/openbios-sparc64
+++ b/pc-bios/openbios-sparc64
Binary files differdiff --git a/roms/openbios b/roms/openbios
-Subproject b5c93acd14b7b3886c2c81d84cd18e666984a4c
+Subproject 54d959d97fb331708767b2fd4a878efd2bbc41b
diff --git a/scripts/decodetree.py b/scripts/decodetree.py
new file mode 100755
index 0000000000..6a33f8f8dd
--- /dev/null
+++ b/scripts/decodetree.py
@@ -0,0 +1,1062 @@
+#!/usr/bin/env python
+# Copyright (c) 2018 Linaro Limited
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# Generate a decoding tree from a specification file.
+#
+# The tree is built from instruction "patterns".  A pattern may represent
+# a single architectural instruction or a group of same, depending on what
+# is convenient for further processing.
+#
+# Each pattern has "fixedbits" & "fixedmask", the combination of which
+# describes the condition under which the pattern is matched:
+#
+#   (insn & fixedmask) == fixedbits
+#
+# Each pattern may have "fields", which are extracted from the insn and
+# passed along to the translator.  Examples of such are registers,
+# immediates, and sub-opcodes.
+#
+# In support of patterns, one may declare fields, argument sets, and
+# formats, each of which may be re-used to simplify further definitions.
+#
+# *** Field syntax:
+#
+# field_def     := '%' identifier ( unnamed_field )+ ( !function=identifier )?
+# unnamed_field := number ':' ( 's' ) number
+#
+# For unnamed_field, the first number is the least-significant bit position of
+# the field and the second number is the length of the field.  If the 's' is
+# present, the field is considered signed.  If multiple unnamed_fields are
+# present, they are concatenated.  In this way one can define disjoint fields.
+#
+# If !function is specified, the concatenated result is passed through the
+# named function, taking and returning an integral value.
+#
+# FIXME: the fields of the structure into which this result will be stored
+# is restricted to "int".  Which means that we cannot expand 64-bit items.
+#
+# Field examples:
+#
+#   %disp   0:s16          -- sextract(i, 0, 16)
+#   %imm9   16:6 10:3      -- extract(i, 16, 6) << 3 | extract(i, 10, 3)
+#   %disp12 0:s1 1:1 2:10  -- sextract(i, 0, 1) << 11
+#                             | extract(i, 1, 1) << 10
+#                             | extract(i, 2, 10)
+#   %shimm8 5:s8 13:1 !function=expand_shimm8
+#                          -- expand_shimm8(sextract(i, 5, 8) << 1
+#                                           | extract(i, 13, 1))
+#
+# *** Argument set syntax:
+#
+# args_def    := '&' identifier ( args_elt )+
+# args_elt    := identifier
+#
+# Each args_elt defines an argument within the argument set.
+# Each argument set will be rendered as a C structure "arg_$name"
+# with each of the fields being one of the member arguments.
+#
+# Argument set examples:
+#
+#   &reg3       ra rb rc
+#   &loadstore  reg base offset
+#
+# *** Format syntax:
+#
+# fmt_def      := '@' identifier ( fmt_elt )+
+# fmt_elt      := fixedbit_elt | field_elt | field_ref | args_ref
+# fixedbit_elt := [01.-]+
+# field_elt    := identifier ':' 's'? number
+# field_ref    := '%' identifier | identifier '=' '%' identifier
+# args_ref     := '&' identifier
+#
+# Defining a format is a handy way to avoid replicating groups of fields
+# across many instruction patterns.
+#
+# A fixedbit_elt describes a contiguous sequence of bits that must
+# be 1, 0, [.-] for don't care.  The difference between '.' and '-'
+# is that '.' means that the bit will be covered with a field or a
+# final [01] from the pattern, and '-' means that the bit is really
+# ignored by the cpu and will not be specified.
+#
+# A field_elt describes a simple field only given a width; the position of
+# the field is implied by its position with respect to other fixedbit_elt
+# and field_elt.
+#
+# If any fixedbit_elt or field_elt appear then all bits must be defined.
+# Padding with a fixedbit_elt of all '.' is an easy way to accomplish that.
+#
+# A field_ref incorporates a field by reference.  This is the only way to
+# add a complex field to a format.  A field may be renamed in the process
+# via assignment to another identifier.  This is intended to allow the
+# same argument set be used with disjoint named fields.
+#
+# A single args_ref may specify an argument set to use for the format.
+# The set of fields in the format must be a subset of the arguments in
+# the argument set.  If an argument set is not specified, one will be
+# inferred from the set of fields.
+#
+# It is recommended, but not required, that all field_ref and args_ref
+# appear at the end of the line, not interleaving with fixedbit_elf or
+# field_elt.
+#
+# Format examples:
+#
+#   @opr    ...... ra:5 rb:5 ... 0 ....... rc:5
+#   @opi    ...... ra:5 lit:8    1 ....... rc:5
+#
+# *** Pattern syntax:
+#
+# pat_def      := identifier ( pat_elt )+
+# pat_elt      := fixedbit_elt | field_elt | field_ref
+#               | args_ref | fmt_ref | const_elt
+# fmt_ref      := '@' identifier
+# const_elt    := identifier '=' number
+#
+# The fixedbit_elt and field_elt specifiers are unchanged from formats.
+# A pattern that does not specify a named format will have one inferred
+# from a referenced argument set (if present) and the set of fields.
+#
+# A const_elt allows a argument to be set to a constant value.  This may
+# come in handy when fields overlap between patterns and one has to
+# include the values in the fixedbit_elt instead.
+#
+# The decoder will call a translator function for each pattern matched.
+#
+# Pattern examples:
+#
+#   addl_r   010000 ..... ..... .... 0000000 ..... @opr
+#   addl_i   010000 ..... ..... .... 0000000 ..... @opi
+#
+# which will, in part, invoke
+#
+#   trans_addl_r(ctx, &arg_opr, insn)
+# and
+#   trans_addl_i(ctx, &arg_opi, insn)
+#
+
+import io
+import os
+import re
+import sys
+import getopt
+import pdb
+
+insnwidth = 32
+insnmask = 0xffffffff
+fields = {}
+arguments = {}
+formats = {}
+patterns = []
+
+translate_prefix = 'trans'
+translate_scope = 'static '
+input_file = ''
+output_file = None
+output_fd = None
+insntype = 'uint32_t'
+
+re_ident = '[a-zA-Z][a-zA-Z0-9_]*'
+
+
+def error(lineno, *args):
+    """Print an error message from file:line and args and exit."""
+    global output_file
+    global output_fd
+
+    if lineno:
+        r = '{0}:{1}: error:'.format(input_file, lineno)
+    elif input_file:
+        r = '{0}: error:'.format(input_file)
+    else:
+        r = 'error:'
+    for a in args:
+        r += ' ' + str(a)
+    r += '\n'
+    sys.stderr.write(r)
+    if output_file and output_fd:
+        output_fd.close()
+        os.remove(output_file)
+    exit(1)
+
+
+def output(*args):
+    global output_fd
+    for a in args:
+        output_fd.write(a)
+
+
+if sys.version_info >= (3, 0):
+    re_fullmatch = re.fullmatch
+else:
+    def re_fullmatch(pat, str):
+        return re.match('^' + pat + '$', str)
+
+
+def output_autogen():
+    output('/* This file is autogenerated by scripts/decodetree.py.  */\n\n')
+
+
+def str_indent(c):
+    """Return a string with C spaces"""
+    return ' ' * c
+
+
+def str_fields(fields):
+    """Return a string uniquely identifing FIELDS"""
+    r = ''
+    for n in sorted(fields.keys()):
+        r += '_' + n
+    return r[1:]
+
+
+def str_match_bits(bits, mask):
+    """Return a string pretty-printing BITS/MASK"""
+    global insnwidth
+
+    i = 1 << (insnwidth - 1)
+    space = 0x01010100
+    r = ''
+    while i != 0:
+        if i & mask:
+            if i & bits:
+                r += '1'
+            else:
+                r += '0'
+        else:
+            r += '.'
+        if i & space:
+            r += ' '
+        i >>= 1
+    return r
+
+
+def is_pow2(x):
+    """Return true iff X is equal to a power of 2."""
+    return (x & (x - 1)) == 0
+
+
+def ctz(x):
+    """Return the number of times 2 factors into X."""
+    r = 0
+    while ((x >> r) & 1) == 0:
+        r += 1
+    return r
+
+
+def is_contiguous(bits):
+    shift = ctz(bits)
+    if is_pow2((bits >> shift) + 1):
+        return shift
+    else:
+        return -1
+
+
+def eq_fields_for_args(flds_a, flds_b):
+    if len(flds_a) != len(flds_b):
+        return False
+    for k, a in flds_a.items():
+        if k not in flds_b:
+            return False
+    return True
+
+
+def eq_fields_for_fmts(flds_a, flds_b):
+    if len(flds_a) != len(flds_b):
+        return False
+    for k, a in flds_a.items():
+        if k not in flds_b:
+            return False
+        b = flds_b[k]
+        if a.__class__ != b.__class__ or a != b:
+            return False
+    return True
+
+
+class Field:
+    """Class representing a simple instruction field"""
+    def __init__(self, sign, pos, len):
+        self.sign = sign
+        self.pos = pos
+        self.len = len
+        self.mask = ((1 << len) - 1) << pos
+
+    def __str__(self):
+        if self.sign:
+            s = 's'
+        else:
+            s = ''
+        return str(pos) + ':' + s + str(len)
+
+    def str_extract(self):
+        if self.sign:
+            extr = 'sextract32'
+        else:
+            extr = 'extract32'
+        return '{0}(insn, {1}, {2})'.format(extr, self.pos, self.len)
+
+    def __eq__(self, other):
+        return self.sign == other.sign and self.sign == other.sign
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+# end Field
+
+
+class MultiField:
+    """Class representing a compound instruction field"""
+    def __init__(self, subs, mask):
+        self.subs = subs
+        self.sign = subs[0].sign
+        self.mask = mask
+
+    def __str__(self):
+        return str(self.subs)
+
+    def str_extract(self):
+        ret = '0'
+        pos = 0
+        for f in reversed(self.subs):
+            if pos == 0:
+                ret = f.str_extract()
+            else:
+                ret = 'deposit32({0}, {1}, {2}, {3})' \
+                      .format(ret, pos, 32 - pos, f.str_extract())
+            pos += f.len
+        return ret
+
+    def __ne__(self, other):
+        if len(self.subs) != len(other.subs):
+            return True
+        for a, b in zip(self.subs, other.subs):
+            if a.__class__ != b.__class__ or a != b:
+                return True
+        return False
+
+    def __eq__(self, other):
+        return not self.__ne__(other)
+# end MultiField
+
+
+class ConstField:
+    """Class representing an argument field with constant value"""
+    def __init__(self, value):
+        self.value = value
+        self.mask = 0
+        self.sign = value < 0
+
+    def __str__(self):
+        return str(self.value)
+
+    def str_extract(self):
+        return str(self.value)
+
+    def __cmp__(self, other):
+        return self.value - other.value
+# end ConstField
+
+
+class FunctionField:
+    """Class representing a field passed through an expander"""
+    def __init__(self, func, base):
+        self.mask = base.mask
+        self.sign = base.sign
+        self.base = base
+        self.func = func
+
+    def __str__(self):
+        return self.func + '(' + str(self.base) + ')'
+
+    def str_extract(self):
+        return self.func + '(' + self.base.str_extract() + ')'
+
+    def __eq__(self, other):
+        return self.func == other.func and self.base == other.base
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+# end FunctionField
+
+
+class Arguments:
+    """Class representing the extracted fields of a format"""
+    def __init__(self, nm, flds):
+        self.name = nm
+        self.fields = sorted(flds)
+
+    def __str__(self):
+        return self.name + ' ' + str(self.fields)
+
+    def struct_name(self):
+        return 'arg_' + self.name
+
+    def output_def(self):
+        output('typedef struct {\n')
+        for n in self.fields:
+            output('    int ', n, ';\n')
+        output('} ', self.struct_name(), ';\n\n')
+# end Arguments
+
+
+class General:
+    """Common code between instruction formats and instruction patterns"""
+    def __init__(self, name, lineno, base, fixb, fixm, udfm, fldm, flds):
+        self.name = name
+        self.lineno = lineno
+        self.base = base
+        self.fixedbits = fixb
+        self.fixedmask = fixm
+        self.undefmask = udfm
+        self.fieldmask = fldm
+        self.fields = flds
+
+    def __str__(self):
+        r = self.name
+        if self.base:
+            r = r + ' ' + self.base.name
+        else:
+            r = r + ' ' + str(self.fields)
+        r = r + ' ' + str_match_bits(self.fixedbits, self.fixedmask)
+        return r
+
+    def str1(self, i):
+        return str_indent(i) + self.__str__()
+# end General
+
+
+class Format(General):
+    """Class representing an instruction format"""
+
+    def extract_name(self):
+        return 'extract_' + self.name
+
+    def output_extract(self):
+        output('static void ', self.extract_name(), '(',
+               self.base.struct_name(), ' *a, ', insntype, ' insn)\n{\n')
+        for n, f in self.fields.items():
+            output('    a->', n, ' = ', f.str_extract(), ';\n')
+        output('}\n\n')
+# end Format
+
+
+class Pattern(General):
+    """Class representing an instruction pattern"""
+
+    def output_decl(self):
+        global translate_scope
+        global translate_prefix
+        output('typedef ', self.base.base.struct_name(),
+               ' arg_', self.name, ';\n')
+        output(translate_scope, 'void ', translate_prefix, '_', self.name,
+               '(DisasContext *ctx, arg_', self.name,
+               ' *a, ', insntype, ' insn);\n')
+
+    def output_code(self, i, extracted, outerbits, outermask):
+        global translate_prefix
+        ind = str_indent(i)
+        arg = self.base.base.name
+        output(ind, '/* line ', str(self.lineno), ' */\n')
+        if not extracted:
+            output(ind, self.base.extract_name(), '(&u.f_', arg, ', insn);\n')
+        for n, f in self.fields.items():
+            output(ind, 'u.f_', arg, '.', n, ' = ', f.str_extract(), ';\n')
+        output(ind, translate_prefix, '_', self.name,
+               '(ctx, &u.f_', arg, ', insn);\n')
+        output(ind, 'return true;\n')
+# end Pattern
+
+
+def parse_field(lineno, name, toks):
+    """Parse one instruction field from TOKS at LINENO"""
+    global fields
+    global re_ident
+    global insnwidth
+
+    # A "simple" field will have only one entry;
+    # a "multifield" will have several.
+    subs = []
+    width = 0
+    func = None
+    for t in toks:
+        if re_fullmatch('!function=' + re_ident, t):
+            if func:
+                error(lineno, 'duplicate function')
+            func = t.split('=')
+            func = func[1]
+            continue
+
+        if re_fullmatch('[0-9]+:s[0-9]+', t):
+            # Signed field extract
+            subtoks = t.split(':s')
+            sign = True
+        elif re_fullmatch('[0-9]+:[0-9]+', t):
+            # Unsigned field extract
+            subtoks = t.split(':')
+            sign = False
+        else:
+            error(lineno, 'invalid field token "{0}"'.format(t))
+        po = int(subtoks[0])
+        le = int(subtoks[1])
+        if po + le > insnwidth:
+            error(lineno, 'field {0} too large'.format(t))
+        f = Field(sign, po, le)
+        subs.append(f)
+        width += le
+
+    if width > insnwidth:
+        error(lineno, 'field too large')
+    if len(subs) == 1:
+        f = subs[0]
+    else:
+        mask = 0
+        for s in subs:
+            if mask & s.mask:
+                error(lineno, 'field components overlap')
+            mask |= s.mask
+        f = MultiField(subs, mask)
+    if func:
+        f = FunctionField(func, f)
+
+    if name in fields:
+        error(lineno, 'duplicate field', name)
+    fields[name] = f
+# end parse_field
+
+
+def parse_arguments(lineno, name, toks):
+    """Parse one argument set from TOKS at LINENO"""
+    global arguments
+    global re_ident
+
+    flds = []
+    for t in toks:
+        if not re_fullmatch(re_ident, t):
+            error(lineno, 'invalid argument set token "{0}"'.format(t))
+        if t in flds:
+            error(lineno, 'duplicate argument "{0}"'.format(t))
+        flds.append(t)
+
+    if name in arguments:
+        error(lineno, 'duplicate argument set', name)
+    arguments[name] = Arguments(name, flds)
+# end parse_arguments
+
+
+def lookup_field(lineno, name):
+    global fields
+    if name in fields:
+        return fields[name]
+    error(lineno, 'undefined field', name)
+
+
+def add_field(lineno, flds, new_name, f):
+    if new_name in flds:
+        error(lineno, 'duplicate field', new_name)
+    flds[new_name] = f
+    return flds
+
+
+def add_field_byname(lineno, flds, new_name, old_name):
+    return add_field(lineno, flds, new_name, lookup_field(lineno, old_name))
+
+
+def infer_argument_set(flds):
+    global arguments
+
+    for arg in arguments.values():
+        if eq_fields_for_args(flds, arg.fields):
+            return arg
+
+    name = str(len(arguments))
+    arg = Arguments(name, flds.keys())
+    arguments[name] = arg
+    return arg
+
+
+def infer_format(arg, fieldmask, flds):
+    global arguments
+    global formats
+
+    const_flds = {}
+    var_flds = {}
+    for n, c in flds.items():
+        if c is ConstField:
+            const_flds[n] = c
+        else:
+            var_flds[n] = c
+
+    # Look for an existing format with the same argument set and fields
+    for fmt in formats.values():
+        if arg and fmt.base != arg:
+            continue
+        if fieldmask != fmt.fieldmask:
+            continue
+        if not eq_fields_for_fmts(flds, fmt.fields):
+            continue
+        return (fmt, const_flds)
+
+    name = 'Fmt_' + str(len(formats))
+    if not arg:
+        arg = infer_argument_set(flds)
+
+    fmt = Format(name, 0, arg, 0, 0, 0, fieldmask, var_flds)
+    formats[name] = fmt
+
+    return (fmt, const_flds)
+# end infer_format
+
+
+def parse_generic(lineno, is_format, name, toks):
+    """Parse one instruction format from TOKS at LINENO"""
+    global fields
+    global arguments
+    global formats
+    global patterns
+    global re_ident
+    global insnwidth
+    global insnmask
+
+    fixedmask = 0
+    fixedbits = 0
+    undefmask = 0
+    width = 0
+    flds = {}
+    arg = None
+    fmt = None
+    for t in toks:
+        # '&Foo' gives a format an explcit argument set.
+        if t[0] == '&':
+            tt = t[1:]
+            if arg:
+                error(lineno, 'multiple argument sets')
+            if tt in arguments:
+                arg = arguments[tt]
+            else:
+                error(lineno, 'undefined argument set', t)
+            continue
+
+        # '@Foo' gives a pattern an explicit format.
+        if t[0] == '@':
+            tt = t[1:]
+            if fmt:
+                error(lineno, 'multiple formats')
+            if tt in formats:
+                fmt = formats[tt]
+            else:
+                error(lineno, 'undefined format', t)
+            continue
+
+        # '%Foo' imports a field.
+        if t[0] == '%':
+            tt = t[1:]
+            flds = add_field_byname(lineno, flds, tt, tt)
+            continue
+
+        # 'Foo=%Bar' imports a field with a different name.
+        if re_fullmatch(re_ident + '=%' + re_ident, t):
+            (fname, iname) = t.split('=%')
+            flds = add_field_byname(lineno, flds, fname, iname)
+            continue
+
+        # 'Foo=number' sets an argument field to a constant value
+        if re_fullmatch(re_ident + '=[0-9]+', t):
+            (fname, value) = t.split('=')
+            value = int(value)
+            flds = add_field(lineno, flds, fname, ConstField(value))
+            continue
+
+        # Pattern of 0s, 1s, dots and dashes indicate required zeros,
+        # required ones, or dont-cares.
+        if re_fullmatch('[01.-]+', t):
+            shift = len(t)
+            fms = t.replace('0', '1')
+            fms = fms.replace('.', '0')
+            fms = fms.replace('-', '0')
+            fbs = t.replace('.', '0')
+            fbs = fbs.replace('-', '0')
+            ubm = t.replace('1', '0')
+            ubm = ubm.replace('.', '0')
+            ubm = ubm.replace('-', '1')
+            fms = int(fms, 2)
+            fbs = int(fbs, 2)
+            ubm = int(ubm, 2)
+            fixedbits = (fixedbits << shift) | fbs
+            fixedmask = (fixedmask << shift) | fms
+            undefmask = (undefmask << shift) | ubm
+        # Otherwise, fieldname:fieldwidth
+        elif re_fullmatch(re_ident + ':s?[0-9]+', t):
+            (fname, flen) = t.split(':')
+            sign = False
+            if flen[0] == 's':
+                sign = True
+                flen = flen[1:]
+            shift = int(flen, 10)
+            f = Field(sign, insnwidth - width - shift, shift)
+            flds = add_field(lineno, flds, fname, f)
+            fixedbits <<= shift
+            fixedmask <<= shift
+            undefmask <<= shift
+        else:
+            error(lineno, 'invalid token "{0}"'.format(t))
+        width += shift
+
+    # We should have filled in all of the bits of the instruction.
+    if not (is_format and width == 0) and width != insnwidth:
+        error(lineno, 'definition has {0} bits'.format(width))
+
+    # Do not check for fields overlaping fields; one valid usage
+    # is to be able to duplicate fields via import.
+    fieldmask = 0
+    for f in flds.values():
+        fieldmask |= f.mask
+
+    # Fix up what we've parsed to match either a format or a pattern.
+    if is_format:
+        # Formats cannot reference formats.
+        if fmt:
+            error(lineno, 'format referencing format')
+        # If an argument set is given, then there should be no fields
+        # without a place to store it.
+        if arg:
+            for f in flds.keys():
+                if f not in arg.fields:
+                    error(lineno, 'field {0} not in argument set {1}'
+                                  .format(f, arg.name))
+        else:
+            arg = infer_argument_set(flds)
+        if name in formats:
+            error(lineno, 'duplicate format name', name)
+        fmt = Format(name, lineno, arg, fixedbits, fixedmask,
+                     undefmask, fieldmask, flds)
+        formats[name] = fmt
+    else:
+        # Patterns can reference a format ...
+        if fmt:
+            # ... but not an argument simultaneously
+            if arg:
+                error(lineno, 'pattern specifies both format and argument set')
+            if fixedmask & fmt.fixedmask:
+                error(lineno, 'pattern fixed bits overlap format fixed bits')
+            fieldmask |= fmt.fieldmask
+            fixedbits |= fmt.fixedbits
+            fixedmask |= fmt.fixedmask
+            undefmask |= fmt.undefmask
+        else:
+            (fmt, flds) = infer_format(arg, fieldmask, flds)
+        arg = fmt.base
+        for f in flds.keys():
+            if f not in arg.fields:
+                error(lineno, 'field {0} not in argument set {1}'
+                              .format(f, arg.name))
+            if f in fmt.fields.keys():
+                error(lineno, 'field {0} set by format and pattern'.format(f))
+        for f in arg.fields:
+            if f not in flds.keys() and f not in fmt.fields.keys():
+                error(lineno, 'field {0} not initialized'.format(f))
+        pat = Pattern(name, lineno, fmt, fixedbits, fixedmask,
+                      undefmask, fieldmask, flds)
+        patterns.append(pat)
+
+    # Validate the masks that we have assembled.
+    if fieldmask & fixedmask:
+        error(lineno, 'fieldmask overlaps fixedmask (0x{0:08x} & 0x{1:08x})'
+                      .format(fieldmask, fixedmask))
+    if fieldmask & undefmask:
+        error(lineno, 'fieldmask overlaps undefmask (0x{0:08x} & 0x{1:08x})'
+                      .format(fieldmask, undefmask))
+    if fixedmask & undefmask:
+        error(lineno, 'fixedmask overlaps undefmask (0x{0:08x} & 0x{1:08x})'
+                      .format(fixedmask, undefmask))
+    if not is_format:
+        allbits = fieldmask | fixedmask | undefmask
+        if allbits != insnmask:
+            error(lineno, 'bits left unspecified (0x{0:08x})'
+                          .format(allbits ^ insnmask))
+# end parse_general
+
+
+def parse_file(f):
+    """Parse all of the patterns within a file"""
+
+    # Read all of the lines of the file.  Concatenate lines
+    # ending in backslash; discard empty lines and comments.
+    toks = []
+    lineno = 0
+    for line in f:
+        lineno += 1
+
+        # Discard comments
+        end = line.find('#')
+        if end >= 0:
+            line = line[:end]
+
+        t = line.split()
+        if len(toks) != 0:
+            # Next line after continuation
+            toks.extend(t)
+        elif len(t) == 0:
+            # Empty line
+            continue
+        else:
+            toks = t
+
+        # Continuation?
+        if toks[-1] == '\\':
+            toks.pop()
+            continue
+
+        if len(toks) < 2:
+            error(lineno, 'short line')
+
+        name = toks[0]
+        del toks[0]
+
+        # Determine the type of object needing to be parsed.
+        if name[0] == '%':
+            parse_field(lineno, name[1:], toks)
+        elif name[0] == '&':
+            parse_arguments(lineno, name[1:], toks)
+        elif name[0] == '@':
+            parse_generic(lineno, True, name[1:], toks)
+        else:
+            parse_generic(lineno, False, name, toks)
+        toks = []
+# end parse_file
+
+
+class Tree:
+    """Class representing a node in a decode tree"""
+
+    def __init__(self, fm, tm):
+        self.fixedmask = fm
+        self.thismask = tm
+        self.subs = []
+        self.base = None
+
+    def str1(self, i):
+        ind = str_indent(i)
+        r = '{0}{1:08x}'.format(ind, self.fixedmask)
+        if self.format:
+            r += ' ' + self.format.name
+        r += ' [\n'
+        for (b, s) in self.subs:
+            r += '{0}  {1:08x}:\n'.format(ind, b)
+            r += s.str1(i + 4) + '\n'
+        r += ind + ']'
+        return r
+
+    def __str__(self):
+        return self.str1(0)
+
+    def output_code(self, i, extracted, outerbits, outermask):
+        ind = str_indent(i)
+
+        # If we identified all nodes below have the same format,
+        # extract the fields now.
+        if not extracted and self.base:
+            output(ind, self.base.extract_name(),
+                   '(&u.f_', self.base.base.name, ', insn);\n')
+            extracted = True
+
+        # Attempt to aid the compiler in producing compact switch statements.
+        # If the bits in the mask are contiguous, extract them.
+        sh = is_contiguous(self.thismask)
+        if sh > 0:
+            # Propagate SH down into the local functions.
+            def str_switch(b, sh=sh):
+                return '(insn >> {0}) & 0x{1:x}'.format(sh, b >> sh)
+
+            def str_case(b, sh=sh):
+                return '0x{0:x}'.format(b >> sh)
+        else:
+            def str_switch(b):
+                return 'insn & 0x{0:08x}'.format(b)
+
+            def str_case(b):
+                return '0x{0:08x}'.format(b)
+
+        output(ind, 'switch (', str_switch(self.thismask), ') {\n')
+        for b, s in sorted(self.subs):
+            assert (self.thismask & ~s.fixedmask) == 0
+            innermask = outermask | self.thismask
+            innerbits = outerbits | b
+            output(ind, 'case ', str_case(b), ':\n')
+            output(ind, '    /* ',
+                   str_match_bits(innerbits, innermask), ' */\n')
+            s.output_code(i + 4, extracted, innerbits, innermask)
+        output(ind, '}\n')
+        output(ind, 'return false;\n')
+# end Tree
+
+
+def build_tree(pats, outerbits, outermask):
+    # Find the intersection of all remaining fixedmask.
+    innermask = ~outermask
+    for i in pats:
+        innermask &= i.fixedmask
+
+    if innermask == 0:
+        pnames = []
+        for p in pats:
+            pnames.append(p.name + ':' + str(p.lineno))
+        error(pats[0].lineno, 'overlapping patterns:', pnames)
+
+    fullmask = outermask | innermask
+
+    # Sort each element of pats into the bin selected by the mask.
+    bins = {}
+    for i in pats:
+        fb = i.fixedbits & innermask
+        if fb in bins:
+            bins[fb].append(i)
+        else:
+            bins[fb] = [i]
+
+    # We must recurse if any bin has more than one element or if
+    # the single element in the bin has not been fully matched.
+    t = Tree(fullmask, innermask)
+
+    for b, l in bins.items():
+        s = l[0]
+        if len(l) > 1 or s.fixedmask & ~fullmask != 0:
+            s = build_tree(l, b | outerbits, fullmask)
+        t.subs.append((b, s))
+
+    return t
+# end build_tree
+
+
+def prop_format(tree):
+    """Propagate Format objects into the decode tree"""
+
+    # Depth first search.
+    for (b, s) in tree.subs:
+        if isinstance(s, Tree):
+            prop_format(s)
+
+    # If all entries in SUBS have the same format, then
+    # propagate that into the tree.
+    f = None
+    for (b, s) in tree.subs:
+        if f is None:
+            f = s.base
+            if f is None:
+                return
+        if f is not s.base:
+            return
+    tree.base = f
+# end prop_format
+
+
+def main():
+    global arguments
+    global formats
+    global patterns
+    global translate_scope
+    global translate_prefix
+    global output_fd
+    global output_file
+    global input_file
+    global insnwidth
+    global insntype
+
+    decode_function = 'decode'
+    decode_scope = 'static '
+
+    long_opts = ['decode=', 'translate=', 'output=', 'insnwidth=']
+    try:
+        (opts, args) = getopt.getopt(sys.argv[1:], 'o:w:', long_opts)
+    except getopt.GetoptError as err:
+        error(0, err)
+    for o, a in opts:
+        if o in ('-o', '--output'):
+            output_file = a
+        elif o == '--decode':
+            decode_function = a
+            decode_scope = ''
+        elif o == '--translate':
+            translate_prefix = a
+            translate_scope = ''
+        elif o in ('-w', '--insnwidth'):
+            insnwidth = int(a)
+            if insnwidth == 16:
+                insntype = 'uint16_t'
+                insnmask = 0xffff
+            elif insnwidth != 32:
+                error(0, 'cannot handle insns of width', insnwidth)
+        else:
+            assert False, 'unhandled option'
+
+    if len(args) < 1:
+        error(0, 'missing input file')
+    input_file = args[0]
+    f = open(input_file, 'r')
+    parse_file(f)
+    f.close()
+
+    t = build_tree(patterns, 0, 0)
+    prop_format(t)
+
+    if output_file:
+        output_fd = open(output_file, 'w')
+    else:
+        output_fd = sys.stdout
+
+    output_autogen()
+    for n in sorted(arguments.keys()):
+        f = arguments[n]
+        f.output_def()
+
+    # A single translate function can be invoked for different patterns.
+    # Make sure that the argument sets are the same, and declare the
+    # function only once.
+    out_pats = {}
+    for i in patterns:
+        if i.name in out_pats:
+            p = out_pats[i.name]
+            if i.base.base != p.base.base:
+                error(0, i.name, ' has conflicting argument sets')
+        else:
+            i.output_decl()
+            out_pats[i.name] = i
+    output('\n')
+
+    for n in sorted(formats.keys()):
+        f = formats[n]
+        f.output_extract()
+
+    output(decode_scope, 'bool ', decode_function,
+           '(DisasContext *ctx, ', insntype, ' insn)\n{\n')
+
+    i4 = str_indent(4)
+    output(i4, 'union {\n')
+    for n in sorted(arguments.keys()):
+        f = arguments[n]
+        output(i4, i4, f.struct_name(), ' f_', f.name, ';\n')
+    output(i4, '} u;\n\n')
+
+    t.output_code(4, False, 0, 0)
+
+    output('}\n')
+
+    if output_file:
+        output_fd.close()
+# end main
+
+
+if __name__ == '__main__':
+    main()
diff --git a/target/alpha/cpu.h b/target/alpha/cpu.h
index 09720c2f3b..a79fc2e780 100644
--- a/target/alpha/cpu.h
+++ b/target/alpha/cpu.h
@@ -33,8 +33,6 @@
 
 #include "exec/cpu-defs.h"
 
-#include "fpu/softfloat.h"
-
 #define ICACHE_LINE_SIZE 32
 #define DCACHE_LINE_SIZE 32
 
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index d796085be9..1b3ae62db6 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -34,6 +34,7 @@
 #include "sysemu/hw_accel.h"
 #include "kvm_arm.h"
 #include "disas/capstone.h"
+#include "fpu/softfloat.h"
 
 static void arm_cpu_set_pc(CPUState *cs, vaddr value)
 {
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index de62df091c..8c839faa8f 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -39,8 +39,6 @@
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
 
-#include "fpu/softfloat.h"
-
 #define EXCP_UDEF            1   /* undefined instruction */
 #define EXCP_SWI             2   /* software interrupt */
 #define EXCP_PREFETCH_ABORT  3
diff --git a/target/arm/helper-a64.c b/target/arm/helper-a64.c
index 06fd321fae..10e08bdc1f 100644
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@@ -31,6 +31,7 @@
 #include "exec/cpu_ldst.h"
 #include "qemu/int128.h"
 #include "tcg.h"
+#include "fpu/softfloat.h"
 #include <zlib.h> /* For crc32 */
 
 /* C2.4.7 Multiply and divide */
diff --git a/target/arm/helper.c b/target/arm/helper.c
index e7586fcf6c..c5bc69b961 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -15,6 +15,7 @@
 #include <zlib.h> /* For crc32 */
 #include "exec/semihost.h"
 #include "sysemu/kvm.h"
+#include "fpu/softfloat.h"
 
 #define ARM_CPU_FREQ 1000000000 /* FIXME: 1 GHz, should be configurable */
 
@@ -5068,8 +5069,8 @@ void register_cp_regs_for_features(ARMCPU *cpu)
             { .name = "VPIDR", .state = ARM_CP_STATE_AA32,
               .cp = 15, .opc1 = 4, .crn = 0, .crm = 0, .opc2 = 0,
               .access = PL2_RW, .accessfn = access_el3_aa32ns,
-              .resetvalue = cpu->midr,
-              .fieldoffset = offsetof(CPUARMState, cp15.vpidr_el2) },
+              .resetvalue = cpu->midr, .type = ARM_CP_ALIAS,
+              .fieldoffset = offsetoflow32(CPUARMState, cp15.vpidr_el2) },
             { .name = "VPIDR_EL2", .state = ARM_CP_STATE_AA64,
               .opc0 = 3, .opc1 = 4, .crn = 0, .crm = 0, .opc2 = 0,
               .access = PL2_RW, .resetvalue = cpu->midr,
@@ -5077,8 +5078,8 @@ void register_cp_regs_for_features(ARMCPU *cpu)
             { .name = "VMPIDR", .state = ARM_CP_STATE_AA32,
               .cp = 15, .opc1 = 4, .crn = 0, .crm = 0, .opc2 = 5,
               .access = PL2_RW, .accessfn = access_el3_aa32ns,
-              .resetvalue = vmpidr_def,
-              .fieldoffset = offsetof(CPUARMState, cp15.vmpidr_el2) },
+              .resetvalue = vmpidr_def, .type = ARM_CP_ALIAS,
+              .fieldoffset = offsetoflow32(CPUARMState, cp15.vmpidr_el2) },
             { .name = "VMPIDR_EL2", .state = ARM_CP_STATE_AA64,
               .opc0 = 3, .opc1 = 4, .crn = 0, .crm = 0, .opc2 = 5,
               .access = PL2_RW,
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index 689491cad3..a1ec6537eb 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -11,6 +11,7 @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 #define SIGNBIT (uint32_t)0x80000000
 #define SIGNBIT64 ((uint64_t)1 << 63)
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 7b635cc4ac..969f628f0a 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -23,6 +23,7 @@
 #include "cpu.h"
 #include "qemu-common.h"
 #include "exec/exec-all.h"
+#include "fpu/softfloat.h"
 
 
 static void hppa_cpu_set_pc(CPUState *cs, vaddr value)
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 7640c81221..c88d844938 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -51,7 +51,6 @@
 #define CPUArchState struct CPUHPPAState
 
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 
 #define TARGET_PAGE_BITS 12
 
diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c
index 4ee936bf86..a3af62daf7 100644
--- a/target/hppa/op_helper.c
+++ b/target/hppa/op_helper.c
@@ -24,7 +24,7 @@
 #include "exec/cpu_ldst.h"
 #include "sysemu/sysemu.h"
 #include "qemu/timer.h"
-
+#include "fpu/softfloat.h"
 
 void QEMU_NORETURN HELPER(excp)(CPUHPPAState *env, int excp)
 {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index f91e37d25d..faf39ec1ce 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -52,10 +52,6 @@
 
 #define CPUArchState struct CPUX86State
 
-#ifdef CONFIG_TCG
-#include "fpu/softfloat.h"
-#endif
-
 enum {
     R_EAX = 0,
     R_ECX = 1,
diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 9014b6f88a..ea5a0c4861 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -24,6 +24,7 @@
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
+#include "fpu/softfloat.h"
 
 #define FPU_RC_MASK         0xc00
 #define FPU_RC_NEAR         0x000
diff --git a/target/m68k/cpu.c b/target/m68k/cpu.c
index 3026714471..a4ed8770aa 100644
--- a/target/m68k/cpu.c
+++ b/target/m68k/cpu.c
@@ -24,7 +24,7 @@
 #include "qemu-common.h"
 #include "migration/vmstate.h"
 #include "exec/exec-all.h"
-
+#include "fpu/softfloat.h"
 
 static void m68k_cpu_set_pc(CPUState *cs, vaddr value)
 {
diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
index 1d79885222..65f4fb95cb 100644
--- a/target/m68k/cpu.h
+++ b/target/m68k/cpu.h
@@ -28,7 +28,6 @@
 #include "qemu-common.h"
 #include "exec/cpu-defs.h"
 #include "cpu-qom.h"
-#include "fpu/softfloat.h"
 
 #define OS_BYTE     0
 #define OS_WORD     1
diff --git a/target/m68k/fpu_helper.c b/target/m68k/fpu_helper.c
index 665e7609af..3c5a82aaa0 100644
--- a/target/m68k/fpu_helper.c
+++ b/target/m68k/fpu_helper.c
@@ -23,6 +23,7 @@
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
+#include "fpu/softfloat.h"
 
 /* Undefined offsets may be different on various FPU.
  * On 68040 they return 0.0 (floatx80_zero)
diff --git a/target/m68k/helper.c b/target/m68k/helper.c
index 20155c7801..917d46efcc 100644
--- a/target/m68k/helper.c
+++ b/target/m68k/helper.c
@@ -24,6 +24,7 @@
 #include "exec/gdbstub.h"
 
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 #define SIGNBIT (1u << 31)
 
diff --git a/target/m68k/translate.c b/target/m68k/translate.c
index 70c7583621..93cd38950e 100644
--- a/target/m68k/translate.c
+++ b/target/m68k/translate.c
@@ -32,6 +32,8 @@
 
 #include "trace-tcg.h"
 #include "exec/log.h"
+#include "fpu/softfloat.h"
+
 
 //#define DEBUG_DISPATCH 1
 
diff --git a/target/microblaze/cpu.c b/target/microblaze/cpu.c
index d8df2fb07e..4dc1404800 100644
--- a/target/microblaze/cpu.c
+++ b/target/microblaze/cpu.c
@@ -28,6 +28,7 @@
 #include "hw/qdev-properties.h"
 #include "migration/vmstate.h"
 #include "exec/exec-all.h"
+#include "fpu/softfloat.h"
 
 static const struct {
     const char *name;
diff --git a/target/microblaze/cpu.h b/target/microblaze/cpu.h
index f3e7405a62..1fe21c8539 100644
--- a/target/microblaze/cpu.h
+++ b/target/microblaze/cpu.h
@@ -28,7 +28,7 @@
 #define CPUArchState struct CPUMBState
 
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
+#include "fpu/softfloat-types.h"
 struct CPUMBState;
 typedef struct CPUMBState CPUMBState;
 #if !defined(CONFIG_USER_ONLY)
diff --git a/target/microblaze/op_helper.c b/target/microblaze/op_helper.c
index 869072a2d1..1b4fe796e7 100644
--- a/target/microblaze/op_helper.c
+++ b/target/microblaze/op_helper.c
@@ -24,6 +24,7 @@
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
+#include "fpu/softfloat.h"
 
 #define D(x)
 
diff --git a/target/moxie/cpu.h b/target/moxie/cpu.h
index a01f480821..d85e1fc061 100644
--- a/target/moxie/cpu.h
+++ b/target/moxie/cpu.h
@@ -34,7 +34,6 @@
 #define MOXIE_EX_BREAK      16
 
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 
 #define TARGET_PAGE_BITS 12     /* 4k */
 
diff --git a/target/nios2/cpu.h b/target/nios2/cpu.h
index 204b39add7..cd4e40d1b4 100644
--- a/target/nios2/cpu.h
+++ b/target/nios2/cpu.h
@@ -27,7 +27,6 @@
 #define CPUArchState struct CPUNios2State
 
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 #include "qom/cpu.h"
 struct CPUNios2State;
 typedef struct CPUNios2State CPUNios2State;
diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h
index fb46cc9986..5050b1135c 100644
--- a/target/openrisc/cpu.h
+++ b/target/openrisc/cpu.h
@@ -29,7 +29,6 @@ struct OpenRISCCPU;
 
 #include "qemu-common.h"
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 #include "qom/cpu.h"
 
 #define TYPE_OPENRISC_CPU "or1k-cpu"
diff --git a/target/openrisc/fpu_helper.c b/target/openrisc/fpu_helper.c
index 1375cea948..977a1e8e55 100644
--- a/target/openrisc/fpu_helper.c
+++ b/target/openrisc/fpu_helper.c
@@ -22,6 +22,7 @@
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "exception.h"
+#include "fpu/softfloat.h"
 
 static inline uint32_t ieee_ex_to_openrisc(OpenRISCCPU *cpu, int fexcp)
 {
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 9f8cbbe7aa..7bde1884a1 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -79,7 +79,6 @@
 
 #include "exec/cpu-defs.h"
 #include "cpu-qom.h"
-#include "fpu/softfloat.h"
 
 #if defined (TARGET_PPC64)
 #define PPC_ELF_MACHINE     EM_PPC64
diff --git a/target/ppc/fpu_helper.c b/target/ppc/fpu_helper.c
index c4dab159e4..9ae418a577 100644
--- a/target/ppc/fpu_helper.c
+++ b/target/ppc/fpu_helper.c
@@ -21,6 +21,7 @@
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "internal.h"
+#include "fpu/softfloat.h"
 
 static inline float128 float128_snan_to_qnan(float128 x)
 {
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 3a50f1e1b7..35bdf09773 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -23,6 +23,7 @@
 #include "qemu/host-utils.h"
 #include "exec/helper-proto.h"
 #include "crypto/aes.h"
+#include "fpu/softfloat.h"
 
 #include "helper_regs.h"
 /*****************************************************************************/
diff --git a/target/ppc/translate_init.c b/target/ppc/translate_init.c
index cbaa343e04..17a87df654 100644
--- a/target/ppc/translate_init.c
+++ b/target/ppc/translate_init.c
@@ -38,6 +38,7 @@
 #include "sysemu/qtest.h"
 #include "qemu/cutils.h"
 #include "disas/capstone.h"
+#include "fpu/softfloat.h"
 
 //#define PPC_DUMP_CPU
 //#define PPC_DEBUG_SPR
diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
index da7cb9c278..a665b9e60e 100644
--- a/target/s390x/cpu.c
+++ b/target/s390x/cpu.c
@@ -42,6 +42,7 @@
 #include "sysemu/arch_init.h"
 #include "sysemu/sysemu.h"
 #endif
+#include "fpu/softfloat.h"
 
 #define CR0_RESET       0xE0UL
 #define CR14_RESET      0xC2000000UL;
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 21ce40d5b6..96df2fe5c9 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -41,8 +41,6 @@
 
 #include "exec/cpu-all.h"
 
-#include "fpu/softfloat.h"
-
 #define NB_MMU_MODES 4
 #define TARGET_INSN_START_EXTRA_WORDS 1
 
diff --git a/target/s390x/fpu_helper.c b/target/s390x/fpu_helper.c
index 334159119f..43f8bf1c94 100644
--- a/target/s390x/fpu_helper.c
+++ b/target/s390x/fpu_helper.c
@@ -24,6 +24,7 @@
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 /* #define DEBUG_HELPER */
 #ifdef DEBUG_HELPER
diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index e37c187ca2..6302cfda3a 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -25,6 +25,7 @@
 #include "qemu-common.h"
 #include "migration/vmstate.h"
 #include "exec/exec-all.h"
+#include "fpu/softfloat.h"
 
 
 static void superh_cpu_set_pc(CPUState *cs, vaddr value)
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index 52a4568dd5..a649b68d78 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -40,8 +40,6 @@
 
 #include "exec/cpu-defs.h"
 
-#include "fpu/softfloat.h"
-
 #define TARGET_PAGE_BITS 12	/* 4k XXXXX */
 
 #define TARGET_PHYS_ADDR_SPACE_BITS 32
diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index 4b8bbf63b4..4f825bae5a 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -21,6 +21,7 @@
 #include "exec/helper-proto.h"
 #include "exec/exec-all.h"
 #include "exec/cpu_ldst.h"
+#include "fpu/softfloat.h"
 
 #ifndef CONFIG_USER_ONLY
 
diff --git a/target/sparc/cpu.h b/target/sparc/cpu.h
index 3eaffb354e..9724134a5b 100644
--- a/target/sparc/cpu.h
+++ b/target/sparc/cpu.h
@@ -29,8 +29,6 @@
 
 #include "exec/cpu-defs.h"
 
-#include "fpu/softfloat.h"
-
 /*#define EXCP_INTERRUPT 0x100*/
 
 /* trap definitions */
diff --git a/target/sparc/fop_helper.c b/target/sparc/fop_helper.c
index c7fb176e4c..b6642fd1d7 100644
--- a/target/sparc/fop_helper.c
+++ b/target/sparc/fop_helper.c
@@ -21,6 +21,7 @@
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 #define QT0 (env->qt0)
 #define QT1 (env->qt1)
diff --git a/target/tricore/cpu.h b/target/tricore/cpu.h
index f41d2ceb69..e7dfe4bcc6 100644
--- a/target/tricore/cpu.h
+++ b/target/tricore/cpu.h
@@ -24,7 +24,6 @@
 #include "qemu-common.h"
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 
 #define CPUArchState struct CPUTriCoreState
 
diff --git a/target/tricore/fpu_helper.c b/target/tricore/fpu_helper.c
index 7979bb6692..df162902d6 100644
--- a/target/tricore/fpu_helper.c
+++ b/target/tricore/fpu_helper.c
@@ -20,6 +20,7 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 #define QUIET_NAN 0x7fc00000
 #define ADD_NAN   0x7fc00001
diff --git a/target/tricore/helper.c b/target/tricore/helper.c
index 378c2a4a76..45276d3782 100644
--- a/target/tricore/helper.c
+++ b/target/tricore/helper.c
@@ -19,6 +19,7 @@
 
 #include "cpu.h"
 #include "exec/exec-all.h"
+#include "fpu/softfloat.h"
 
 enum {
     TLBRET_DIRTY = -4,
diff --git a/target/unicore32/cpu.c b/target/unicore32/cpu.c
index fb837aab4c..29d160a88d 100644
--- a/target/unicore32/cpu.c
+++ b/target/unicore32/cpu.c
@@ -18,6 +18,7 @@
 #include "qemu-common.h"
 #include "migration/vmstate.h"
 #include "exec/exec-all.h"
+#include "fpu/softfloat.h"
 
 static void uc32_cpu_set_pc(CPUState *cs, vaddr value)
 {
diff --git a/target/unicore32/cpu.h b/target/unicore32/cpu.h
index a3cc71416d..42e1d52478 100644
--- a/target/unicore32/cpu.h
+++ b/target/unicore32/cpu.h
@@ -23,7 +23,6 @@
 #include "qemu-common.h"
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 
 #define NB_MMU_MODES            2
 
diff --git a/target/unicore32/ucf64_helper.c b/target/unicore32/ucf64_helper.c
index 6c919010c3..fad3fa6618 100644
--- a/target/unicore32/ucf64_helper.c
+++ b/target/unicore32/ucf64_helper.c
@@ -11,6 +11,7 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
+#include "fpu/softfloat.h"
 
 /*
  * The convention used for UniCore-F64 instructions:
diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index f300c02c07..49c2e3cf9a 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -36,7 +36,6 @@
 #include "qemu-common.h"
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
-#include "fpu/softfloat.h"
 #include "xtensa-isa.h"
 
 #define NB_MMU_MODES 4
diff --git a/target/xtensa/op_helper.c b/target/xtensa/op_helper.c
index 43182b113e..7486b99799 100644
--- a/target/xtensa/op_helper.c
+++ b/target/xtensa/op_helper.c
@@ -34,6 +34,7 @@
 #include "exec/cpu_ldst.h"
 #include "exec/address-spaces.h"
 #include "qemu/timer.h"
+#include "fpu/softfloat.h"
 
 void xtensa_cpu_do_unaligned_access(CPUState *cs,
         vaddr addr, MMUAccessType access_type,
diff --git a/tests/Makefile.include b/tests/Makefile.include
index b89f20b482..937cbd874a 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -934,6 +934,13 @@ $(patsubst %, check-%, $(check-qapi-schema-y)): check-%.json: $(SRC_PATH)/%.json
 check-tests/qapi-schema/doc-good.texi: tests/qapi-schema/doc-good.test.texi
 	@diff -q $(SRC_PATH)/tests/qapi-schema/doc-good.texi $<
 
+.PHONY: check-decodetree
+check-decodetree:
+	$(call quiet-command, \
+	  cd $(SRC_PATH)/tests/decode && \
+          ./check.sh "$(PYTHON)" "$(SRC_PATH)/scripts/decodetree.py", \
+          TEST, decodetree.py)
+
 # Consolidated targets
 
 .PHONY: check-qapi-schema check-qtest check-unit check check-clean
@@ -942,7 +949,7 @@ check-qtest: $(patsubst %,check-qtest-%, $(QTEST_TARGETS))
 check-unit: $(patsubst %,check-%, $(check-unit-y))
 check-speed: $(patsubst %,check-%, $(check-speed-y))
 check-block: $(patsubst %,check-%, $(check-block-y))
-check: check-qapi-schema check-unit check-qtest
+check: check-qapi-schema check-unit check-qtest check-decodetree
 check-clean:
 	$(MAKE) -C tests/tcg clean
 	rm -rf $(check-unit-y) tests/*.o $(QEMU_IOTESTS_HELPERS-y)
diff --git a/tests/decode/check.sh b/tests/decode/check.sh
new file mode 100755
index 0000000000..79a06c37cd
--- /dev/null
+++ b/tests/decode/check.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+PYTHON=$1
+DECODETREE=$2
+E=0
+
+# All of these tests should produce errors
+for i in err_*.decode; do
+    if $PYTHON $DECODETREE $i > /dev/null 2> /dev/null; then
+        # Pass, aka failed to fail.
+        echo FAIL: $i 1>&2
+        E=1
+    fi
+done
+
+exit $E
diff --git a/tests/decode/err_argset1.decode b/tests/decode/err_argset1.decode
new file mode 100644
index 0000000000..fcaebcce76
--- /dev/null
+++ b/tests/decode/err_argset1.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose duplicate member names
+&args	a a
diff --git a/tests/decode/err_argset2.decode b/tests/decode/err_argset2.decode
new file mode 100644
index 0000000000..256b2f9b12
--- /dev/null
+++ b/tests/decode/err_argset2.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose invalid member names
+&args	a b c d0 0e
diff --git a/tests/decode/err_field1.decode b/tests/decode/err_field1.decode
new file mode 100644
index 0000000000..e07a5a73e0
--- /dev/null
+++ b/tests/decode/err_field1.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose invalid field syntax
+%field	asdf
diff --git a/tests/decode/err_field2.decode b/tests/decode/err_field2.decode
new file mode 100644
index 0000000000..7664a3ebe9
--- /dev/null
+++ b/tests/decode/err_field2.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose invalid field width.
+%field	0:33
diff --git a/tests/decode/err_field3.decode b/tests/decode/err_field3.decode
new file mode 100644
index 0000000000..87e680f103
--- /dev/null
+++ b/tests/decode/err_field3.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose invalid field position.
+%field	31:2
diff --git a/tests/decode/err_field4.decode b/tests/decode/err_field4.decode
new file mode 100644
index 0000000000..888ce4729b
--- /dev/null
+++ b/tests/decode/err_field4.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose duplicate field name.
+%field	0:1
+%field	0:1
diff --git a/tests/decode/err_field5.decode b/tests/decode/err_field5.decode
new file mode 100644
index 0000000000..b0c62af866
--- /dev/null
+++ b/tests/decode/err_field5.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose duplicate function specifier.
+%field	0:1	!function=a !function=a
diff --git a/tests/decode/err_init1.decode b/tests/decode/err_init1.decode
new file mode 100644
index 0000000000..da855bd00a
--- /dev/null
+++ b/tests/decode/err_init1.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose uninitialized member in pattern.
+&args	a b
+insn	00000000 00000000 00000000 b:8	&args
diff --git a/tests/decode/err_init2.decode b/tests/decode/err_init2.decode
new file mode 100644
index 0000000000..b58de30098
--- /dev/null
+++ b/tests/decode/err_init2.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose member initialized twice in pattern.
+&args	a b
+insn	00000000 00000000 a:8 b:8	&args a=1
diff --git a/tests/decode/err_init3.decode b/tests/decode/err_init3.decode
new file mode 100644
index 0000000000..96790abfdc
--- /dev/null
+++ b/tests/decode/err_init3.decode
@@ -0,0 +1,7 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose member initialized twice in pattern + format.
+&args	a
+@format	........ ........ a:16	&args
+insn	00000000 00000000 a:16	@format
diff --git a/tests/decode/err_init4.decode b/tests/decode/err_init4.decode
new file mode 100644
index 0000000000..4336d4632f
--- /dev/null
+++ b/tests/decode/err_init4.decode
@@ -0,0 +1,7 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose uninitialized member in pattern + format.
+&args	a b
+@format	........ ........ a:16			&args
+insn	00000000 00000000 ........ ........	@format
diff --git a/tests/decode/err_overlap1.decode b/tests/decode/err_overlap1.decode
new file mode 100644
index 0000000000..1ae63472dc
--- /dev/null
+++ b/tests/decode/err_overlap1.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose field overlapping fixedbits.
+%field	0:1
+insn	00000000 00000000 00000000 00000000 	%field
diff --git a/tests/decode/err_overlap2.decode b/tests/decode/err_overlap2.decode
new file mode 100644
index 0000000000..1d6d7a3930
--- /dev/null
+++ b/tests/decode/err_overlap2.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose field overlapping fixedbits w/format.
+@format		........ ........ ........ ....... fld:1
+insn		00000000 00000000 00000000 00000000		@format
diff --git a/tests/decode/err_overlap3.decode b/tests/decode/err_overlap3.decode
new file mode 100644
index 0000000000..3ab0c4850a
--- /dev/null
+++ b/tests/decode/err_overlap3.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose field overlapping unspecified bits.
+%field	0:1
+insn	00000000 00000000 00000000 --------	%field
diff --git a/tests/decode/err_overlap4.decode b/tests/decode/err_overlap4.decode
new file mode 100644
index 0000000000..53c5399d39
--- /dev/null
+++ b/tests/decode/err_overlap4.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose fixed bits overlapping unspecified bits.
+@format		........ ........ ........ .......-
+insn		00000000 00000000 00000000 00000000	@format
diff --git a/tests/decode/err_overlap5.decode b/tests/decode/err_overlap5.decode
new file mode 100644
index 0000000000..df0e31fffb
--- /dev/null
+++ b/tests/decode/err_overlap5.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose overlapping sub-fields.
+%field	3:5 0:5
diff --git a/tests/decode/err_overlap6.decode b/tests/decode/err_overlap6.decode
new file mode 100644
index 0000000000..cc69fc8fdd
--- /dev/null
+++ b/tests/decode/err_overlap6.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose overlapping fixed bits w/format.
+@format		........ ........ ........ .......1
+insn		00000000 00000000 00000000 00000000	@format
diff --git a/tests/decode/err_overlap7.decode b/tests/decode/err_overlap7.decode
new file mode 100644
index 0000000000..6f555295ab
--- /dev/null
+++ b/tests/decode/err_overlap7.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose overlapping patterns.
+insn1		00000000 00000000 00000000 00000000
+insn2		00000000 00000000 00000000 00000000
diff --git a/tests/decode/err_overlap8.decode b/tests/decode/err_overlap8.decode
new file mode 100644
index 0000000000..df4bae8f1c
--- /dev/null
+++ b/tests/decode/err_overlap8.decode
@@ -0,0 +1,5 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose not specified bit (. vs -).
+insn	00000000 00000000 00000000 0000000.
diff --git a/tests/decode/err_overlap9.decode b/tests/decode/err_overlap9.decode
new file mode 100644
index 0000000000..58b6ac121a
--- /dev/null
+++ b/tests/decode/err_overlap9.decode
@@ -0,0 +1,6 @@
+# This work is licensed under the terms of the GNU LGPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+# Diagnose not specified bit (. vs -) w/format.
+@format		........ a:8      ........ b:7 .
+insn		00000000 ........ 00000000 ........	@format
diff --git a/ui/console.c b/ui/console.c
index 36584d039e..e22931a396 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -1760,14 +1760,24 @@ void dpy_gl_scanout_dmabuf(QemuConsole *con,
     con->gl->ops->dpy_gl_scanout_dmabuf(con->gl, dmabuf);
 }
 
-void dpy_gl_cursor_dmabuf(QemuConsole *con,
-                          QemuDmaBuf *dmabuf,
-                          uint32_t pos_x, uint32_t pos_y)
+void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf *dmabuf,
+                          bool have_hot, uint32_t hot_x, uint32_t hot_y)
 {
     assert(con->gl);
 
     if (con->gl->ops->dpy_gl_cursor_dmabuf) {
-        con->gl->ops->dpy_gl_cursor_dmabuf(con->gl, dmabuf, pos_x, pos_y);
+        con->gl->ops->dpy_gl_cursor_dmabuf(con->gl, dmabuf,
+                                           have_hot, hot_x, hot_y);
+    }
+}
+
+void dpy_gl_cursor_position(QemuConsole *con,
+                            uint32_t pos_x, uint32_t pos_y)
+{
+    assert(con->gl);
+
+    if (con->gl->ops->dpy_gl_cursor_position) {
+        con->gl->ops->dpy_gl_cursor_position(con->gl, pos_x, pos_y);
     }
 }
 
diff --git a/ui/curses.c b/ui/curses.c
index 479b77bd03..597e47fd4a 100644
--- a/ui/curses.c
+++ b/ui/curses.c
@@ -271,7 +271,8 @@ static void curses_refresh(DisplayChangeListener *dcl)
                     keysym = chr;
             }
 
-            keycode = keysym2scancode(kbd_layout, keysym & KEYSYM_MASK);
+            keycode = keysym2scancode(kbd_layout, keysym & KEYSYM_MASK,
+                                      false, false, false);
             if (keycode == 0)
                 continue;
 
diff --git a/ui/egl-headless.c b/ui/egl-headless.c
index 38b3766548..b33e0b21fd 100644
--- a/ui/egl-headless.c
+++ b/ui/egl-headless.c
@@ -84,21 +84,30 @@ static void egl_scanout_dmabuf(DisplayChangeListener *dcl,
 }
 
 static void egl_cursor_dmabuf(DisplayChangeListener *dcl,
-                              QemuDmaBuf *dmabuf,
-                              uint32_t pos_x, uint32_t pos_y)
+                              QemuDmaBuf *dmabuf, bool have_hot,
+                              uint32_t hot_x, uint32_t hot_y)
 {
     egl_dpy *edpy = container_of(dcl, egl_dpy, dcl);
 
-    edpy->pos_x = pos_x;
-    edpy->pos_y = pos_y;
-
-    egl_dmabuf_import_texture(dmabuf);
-    if (!dmabuf->texture) {
-        return;
+    if (dmabuf) {
+        egl_dmabuf_import_texture(dmabuf);
+        if (!dmabuf->texture) {
+            return;
+        }
+        egl_fb_setup_for_tex(&edpy->cursor_fb, dmabuf->width, dmabuf->height,
+                             dmabuf->texture, false);
+    } else {
+        egl_fb_destroy(&edpy->cursor_fb);
     }
+}
 
-    egl_fb_setup_for_tex(&edpy->cursor_fb, dmabuf->width, dmabuf->height,
-                         dmabuf->texture, false);
+static void egl_cursor_position(DisplayChangeListener *dcl,
+                                uint32_t pos_x, uint32_t pos_y)
+{
+    egl_dpy *edpy = container_of(dcl, egl_dpy, dcl);
+
+    edpy->pos_x = pos_x;
+    edpy->pos_y = pos_y;
 }
 
 static void egl_release_dmabuf(DisplayChangeListener *dcl,
@@ -150,6 +159,7 @@ static const DisplayChangeListenerOps egl_ops = {
     .dpy_gl_scanout_texture  = egl_scanout_texture,
     .dpy_gl_scanout_dmabuf   = egl_scanout_dmabuf,
     .dpy_gl_cursor_dmabuf    = egl_cursor_dmabuf,
+    .dpy_gl_cursor_position  = egl_cursor_position,
     .dpy_gl_release_dmabuf   = egl_release_dmabuf,
     .dpy_gl_update           = egl_scanout_flush,
 };
diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
index 5fa60ef4e8..16dc3ded36 100644
--- a/ui/egl-helpers.c
+++ b/ui/egl-helpers.c
@@ -83,7 +83,7 @@ void egl_fb_setup_new_tex(egl_fb *fb, int width, int height)
 
     glGenTextures(1, &texture);
     glBindTexture(GL_TEXTURE_2D, texture);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, width, height,
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height,
                  0, GL_BGRA, GL_UNSIGNED_BYTE, 0);
 
     egl_fb_setup_for_tex(fb, width, height, texture, true);
diff --git a/ui/keymaps.c b/ui/keymaps.c
index f9762d1497..43fe604724 100644
--- a/ui/keymaps.c
+++ b/ui/keymaps.c
@@ -28,6 +28,15 @@
 #include "trace.h"
 #include "qemu/error-report.h"
 
+struct keysym2code {
+    uint32_t count;
+    uint16_t keycodes[4];
+};
+
+struct kbd_layout_t {
+    GHashTable *hash;
+};
+
 static int get_keysym(const name2keysym_t *table,
                       const char *name)
 {
@@ -48,46 +57,26 @@ static int get_keysym(const name2keysym_t *table,
 }
 
 
-static void add_to_key_range(struct key_range **krp, int code) {
-    struct key_range *kr;
-    for (kr = *krp; kr; kr = kr->next) {
-        if (code >= kr->start && code <= kr->end) {
-            break;
-        }
-        if (code == kr->start - 1) {
-            kr->start--;
-            break;
-        }
-        if (code == kr->end + 1) {
-            kr->end++;
-            break;
-        }
-    }
-    if (kr == NULL) {
-        kr = g_malloc0(sizeof(*kr));
-        kr->start = kr->end = code;
-        kr->next = *krp;
-        *krp = kr;
-    }
-}
+static void add_keysym(char *line, int keysym, int keycode, kbd_layout_t *k)
+{
+    struct keysym2code *keysym2code;
 
-static void add_keysym(char *line, int keysym, int keycode, kbd_layout_t *k) {
-    if (keysym < MAX_NORMAL_KEYCODE) {
-        trace_keymap_add("normal", keysym, keycode, line);
-        k->keysym2keycode[keysym] = keycode;
-    } else {
-        if (k->extra_count >= MAX_EXTRA_COUNT) {
-            warn_report("Could not assign keysym %s (0x%x)"
-                        " because of memory constraints.", line, keysym);
+    keysym2code = g_hash_table_lookup(k->hash, GINT_TO_POINTER(keysym));
+    if (keysym2code) {
+        if (keysym2code->count < ARRAY_SIZE(keysym2code->keycodes)) {
+            keysym2code->keycodes[keysym2code->count++] = keycode;
         } else {
-            trace_keymap_add("extra", keysym, keycode, line);
-            k->keysym2keycode_extra[k->extra_count].
-            keysym = keysym;
-            k->keysym2keycode_extra[k->extra_count].
-            keycode = keycode;
-            k->extra_count++;
+            warn_report("more than %zd keycodes for keysym %d",
+                        ARRAY_SIZE(keysym2code->keycodes), keysym);
         }
+        return;
     }
+
+    keysym2code = g_new0(struct keysym2code, 1);
+    keysym2code->keycodes[0] = keycode;
+    keysym2code->count = 1;
+    g_hash_table_replace(k->hash, GINT_TO_POINTER(keysym), keysym2code);
+    trace_keymap_add(keysym, keycode, line);
 }
 
 static kbd_layout_t *parse_keyboard_layout(const name2keysym_t *table,
@@ -111,6 +100,7 @@ static kbd_layout_t *parse_keyboard_layout(const name2keysym_t *table,
 
     if (!k) {
         k = g_new0(kbd_layout_t, 1);
+        k->hash = g_hash_table_new(NULL, NULL);
     }
 
     for(;;) {
@@ -147,13 +137,6 @@ static kbd_layout_t *parse_keyboard_layout(const name2keysym_t *table,
                     const char *rest = line + offset + 1;
                     int keycode = strtol(rest, NULL, 0);
 
-                    if (strstr(rest, "numlock")) {
-                        add_to_key_range(&k->keypad_range, keycode);
-                        add_to_key_range(&k->numlock_range, keysym);
-                        /* fprintf(stderr, "keypad keysym %04x keycode %d\n",
-                                   keysym, keycode); */
-                    }
-
                     if (strstr(rest, "shift")) {
                         keycode |= SCANCODE_SHIFT;
                     }
@@ -186,59 +169,79 @@ static kbd_layout_t *parse_keyboard_layout(const name2keysym_t *table,
 }
 
 
-void *init_keyboard_layout(const name2keysym_t *table, const char *language)
+kbd_layout_t *init_keyboard_layout(const name2keysym_t *table,
+                                   const char *language)
 {
     return parse_keyboard_layout(table, language, NULL);
 }
 
 
-int keysym2scancode(void *kbd_layout, int keysym)
+int keysym2scancode(kbd_layout_t *k, int keysym,
+                    bool shift, bool altgr, bool ctrl)
 {
-    kbd_layout_t *k = kbd_layout;
-    if (keysym < MAX_NORMAL_KEYCODE) {
-        if (k->keysym2keycode[keysym] == 0) {
-            trace_keymap_unmapped(keysym);
-            warn_report("no scancode found for keysym %d", keysym);
-        }
-        return k->keysym2keycode[keysym];
-    } else {
-        int i;
+    static const uint32_t mask =
+        SCANCODE_SHIFT | SCANCODE_ALTGR | SCANCODE_CTRL;
+    uint32_t mods, i;
+    struct keysym2code *keysym2code;
+
 #ifdef XK_ISO_Left_Tab
-        if (keysym == XK_ISO_Left_Tab) {
-            keysym = XK_Tab;
-        }
+    if (keysym == XK_ISO_Left_Tab) {
+        keysym = XK_Tab;
+    }
 #endif
-        for (i = 0; i < k->extra_count; i++) {
-            if (k->keysym2keycode_extra[i].keysym == keysym) {
-                return k->keysym2keycode_extra[i].keycode;
-            }
-        }
+
+    keysym2code = g_hash_table_lookup(k->hash, GINT_TO_POINTER(keysym));
+    if (!keysym2code) {
+        trace_keymap_unmapped(keysym);
+        warn_report("no scancode found for keysym %d", keysym);
+        return 0;
     }
-    return 0;
-}
 
-int keycode_is_keypad(void *kbd_layout, int keycode)
-{
-    kbd_layout_t *k = kbd_layout;
-    struct key_range *kr;
+    if (keysym2code->count == 1) {
+        return keysym2code->keycodes[0];
+    }
+
+    /*
+     * We have multiple keysym -> keycode mappings.
+     *
+     * Check whenever we find one mapping where the modifier state of
+     * the mapping matches the current user interface modifier state.
+     * If so, prefer that one.
+     */
+    mods = 0;
+    if (shift) {
+        mods |= SCANCODE_SHIFT;
+    }
+    if (altgr) {
+        mods |= SCANCODE_ALTGR;
+    }
+    if (ctrl) {
+        mods |= SCANCODE_CTRL;
+    }
 
-    for (kr = k->keypad_range; kr; kr = kr->next) {
-        if (keycode >= kr->start && keycode <= kr->end) {
-            return 1;
+    for (i = 0; i < keysym2code->count; i++) {
+        if ((keysym2code->keycodes[i] & mask) == mods) {
+            return keysym2code->keycodes[i];
         }
     }
-    return 0;
+    return keysym2code->keycodes[0];
 }
 
-int keysym_is_numlock(void *kbd_layout, int keysym)
+int keycode_is_keypad(kbd_layout_t *k, int keycode)
 {
-    kbd_layout_t *k = kbd_layout;
-    struct key_range *kr;
+    if (keycode >= 0x47 && keycode <= 0x53) {
+        return true;
+    }
+    return false;
+}
 
-    for (kr = k->numlock_range; kr; kr = kr->next) {
-        if (keysym >= kr->start && keysym <= kr->end) {
-            return 1;
-        }
+int keysym_is_numlock(kbd_layout_t *k, int keysym)
+{
+    switch (keysym) {
+    case 0xffb0 ... 0xffb9:  /* KP_0 .. KP_9 */
+    case 0xffac:             /* KP_Separator */
+    case 0xffae:             /* KP_Decimal   */
+        return true;
     }
-    return 0;
+    return false;
 }
diff --git a/ui/keymaps.h b/ui/keymaps.h
index 8757465529..0693588225 100644
--- a/ui/keymaps.h
+++ b/ui/keymaps.h
@@ -32,25 +32,6 @@ typedef struct {
 	int keysym;
 } name2keysym_t;
 
-struct key_range {
-    int start;
-    int end;
-    struct key_range *next;
-};
-
-#define MAX_NORMAL_KEYCODE 512
-#define MAX_EXTRA_COUNT 256
-typedef struct {
-    uint16_t keysym2keycode[MAX_NORMAL_KEYCODE];
-    struct {
-	int keysym;
-	uint16_t keycode;
-    } keysym2keycode_extra[MAX_EXTRA_COUNT];
-    int extra_count;
-    struct key_range *keypad_range;
-    struct key_range *numlock_range;
-} kbd_layout_t;
-
 /* scancode without modifiers */
 #define SCANCODE_KEYMASK 0xff
 /* scancode without grey or up bit */
@@ -69,10 +50,13 @@ typedef struct {
 #define SCANCODE_ALT    0x400
 #define SCANCODE_ALTGR  0x800
 
+typedef struct kbd_layout_t kbd_layout_t;
 
-void *init_keyboard_layout(const name2keysym_t *table, const char *language);
-int keysym2scancode(void *kbd_layout, int keysym);
-int keycode_is_keypad(void *kbd_layout, int keycode);
-int keysym_is_numlock(void *kbd_layout, int keysym);
+kbd_layout_t *init_keyboard_layout(const name2keysym_t *table,
+                                   const char *language);
+int keysym2scancode(kbd_layout_t *k, int keysym,
+                    bool shift, bool altgr, bool ctrl);
+int keycode_is_keypad(kbd_layout_t *k, int keycode);
+int keysym_is_numlock(kbd_layout_t *k, int keysym);
 
 #endif /* QEMU_KEYMAPS_H */
diff --git a/ui/sdl.c b/ui/sdl.c
index 963cdf77a7..c4ae7ab05d 100644
--- a/ui/sdl.c
+++ b/ui/sdl.c
@@ -201,6 +201,9 @@ static kbd_layout_t *kbd_layout = NULL;
 
 static uint8_t sdl_keyevent_to_keycode_generic(const SDL_KeyboardEvent *ev)
 {
+    bool shift = modifiers_state[0x2a] || modifiers_state[0x36];
+    bool altgr = modifiers_state[0xb8];
+    bool ctrl  = modifiers_state[0x1d] || modifiers_state[0x9d];
     int keysym;
     /* workaround for X11+SDL bug with AltGR */
     keysym = ev->keysym.sym;
@@ -210,7 +213,8 @@ static uint8_t sdl_keyevent_to_keycode_generic(const SDL_KeyboardEvent *ev)
     if (keysym == 92 && ev->keysym.scancode == 133) {
         keysym = 0xa5;
     }
-    return keysym2scancode(kbd_layout, keysym) & SCANCODE_KEYMASK;
+    return keysym2scancode(kbd_layout, keysym,
+                           shift, altgr, ctrl) & SCANCODE_KEYMASK;
 }
 
 
diff --git a/ui/sdl2.c b/ui/sdl2.c
index 6e96a4a24c..b5a0fa1d13 100644
--- a/ui/sdl2.c
+++ b/ui/sdl2.c
@@ -39,7 +39,6 @@ static int gui_grab; /* if true, all keyboard/mouse events are grabbed */
 
 static int gui_saved_grab;
 static int gui_fullscreen;
-static int gui_key_modifier_pressed;
 static int gui_keysym;
 static int gui_grab_code = KMOD_LALT | KMOD_LCTRL;
 static SDL_Cursor *sdl_cursor_normal;
@@ -331,8 +330,7 @@ static void handle_keydown(SDL_Event *ev)
 {
     int win;
     struct sdl2_console *scon = get_scon_from_window(ev->key.windowID);
-
-    gui_key_modifier_pressed = get_mod_state();
+    int gui_key_modifier_pressed = get_mod_state();
 
     if (!scon->ignore_hotkeys && gui_key_modifier_pressed && !ev->key.repeat) {
         switch (ev->key.keysym.scancode) {
@@ -413,18 +411,12 @@ static void handle_keydown(SDL_Event *ev)
 
 static void handle_keyup(SDL_Event *ev)
 {
-    int mod_state;
     struct sdl2_console *scon = get_scon_from_window(ev->key.windowID);
+    int gui_key_modifier_pressed = get_mod_state();
 
     scon->ignore_hotkeys = false;
 
-    if (!alt_grab) {
-        mod_state = (ev->key.keysym.mod & gui_grab_code);
-    } else {
-        mod_state = (ev->key.keysym.mod & (gui_grab_code | KMOD_LSHIFT));
-    }
-    if (!mod_state && gui_key_modifier_pressed) {
-        gui_key_modifier_pressed = 0;
+    if (!gui_key_modifier_pressed) {
         gui_keysym = 0;
     }
     if (!gui_keysym) {
diff --git a/ui/trace-events b/ui/trace-events
index 34229e6747..861b68a305 100644
--- a/ui/trace-events
+++ b/ui/trace-events
@@ -78,7 +78,7 @@ qemu_spice_create_update(uint32_t left, uint32_t right, uint32_t top, uint32_t b
 
 # ui/keymaps.c
 keymap_parse(const char *file) "file %s"
-keymap_add(const char *type, int sym, int code, const char *line) "%-6s sym=0x%04x code=0x%04x (line: %s)"
+keymap_add(int sym, int code, const char *line) "sym=0x%04x code=0x%04x (line: %s)"
 keymap_unmapped(int sym) "sym=0x%04x"
 
 # ui/x_keymap.c
diff --git a/ui/vnc.c b/ui/vnc.c
index a77b568b57..d19f86c7f4 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -1734,7 +1734,8 @@ static void reset_keys(VncState *vs)
 
 static void press_key(VncState *vs, int keysym)
 {
-    int keycode = keysym2scancode(vs->vd->kbd_layout, keysym) & SCANCODE_KEYMASK;
+    int keycode = keysym2scancode(vs->vd->kbd_layout, keysym,
+                                  false, false, false) & SCANCODE_KEYMASK;
     qemu_input_event_send_key_number(vs->vd->dcl.con, keycode, true);
     qemu_input_event_send_key_delay(vs->vd->key_delay_ms);
     qemu_input_event_send_key_number(vs->vd->dcl.con, keycode, false);
@@ -1993,6 +1994,9 @@ static const char *code2name(int keycode)
 
 static void key_event(VncState *vs, int down, uint32_t sym)
 {
+    bool shift = vs->modifiers_state[0x2a] || vs->modifiers_state[0x36];
+    bool altgr = vs->modifiers_state[0xb8];
+    bool ctrl  = vs->modifiers_state[0x1d] || vs->modifiers_state[0x9d];
     int keycode;
     int lsym = sym;
 
@@ -2000,7 +2004,8 @@ static void key_event(VncState *vs, int down, uint32_t sym)
         lsym = lsym - 'A' + 'a';
     }
 
-    keycode = keysym2scancode(vs->vd->kbd_layout, lsym & 0xFFFF) & SCANCODE_KEYMASK;
+    keycode = keysym2scancode(vs->vd->kbd_layout, lsym & 0xFFFF,
+                              shift, altgr, ctrl) & SCANCODE_KEYMASK;
     trace_vnc_key_event_map(down, sym, keycode, code2name(keycode));
     do_key_event(vs, down, keycode, sym);
 }