path: root/fpu/softfloat.c
Diffstat (limited to 'fpu/softfloat.c')
-rw-r--r--  fpu/softfloat.c | 865
1 file changed, 817 insertions, 48 deletions
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e1eef954e6..59eac97d10 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -83,6 +83,7 @@ this code that are retained.
  * target-dependent and needs the TARGET_* macros.
  */
 #include "qemu/osdep.h"
+#include <math.h>
 #include "qemu/bitops.h"
 #include "fpu/softfloat.h"
 
@@ -95,6 +96,324 @@ this code that are retained.
 *----------------------------------------------------------------------------*/
 #include "fpu/softfloat-macros.h"
 
+/*
+ * Hardfloat
+ *
+ * Fast emulation of guest FP instructions is challenging for two reasons.
+ * First, FP instruction semantics are similar but not identical, particularly
+ * when handling NaNs. Second, emulating at reasonable speed the guest FP
+ * exception flags is not trivial: reading the host's flags register with a
+ * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
+ * and trapping on every FP exception is neither fast nor pleasant to work with.
+ *
+ * We address these challenges by leveraging the host FPU for a subset of the
+ * operations. To do this we expand on the idea presented in this paper:
+ *
+ * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
+ * binary translator." Software: Practice and Experience 46.12 (2016): 1591-1615.
+ *
+ * The idea is thus to leverage the host FPU to (1) compute FP operations
+ * and (2) identify whether FP exceptions occurred while avoiding
+ * expensive exception flag register accesses.
+ *
+ * An important optimization shown in the paper is that, since exception flags
+ * are rarely cleared by the guest, we can avoid recomputing some flags.
+ * This is particularly useful for the inexact flag, which is very frequently
+ * raised in floating-point workloads.
+ *
+ * We optimize the code further by deferring to soft-fp whenever FP exception
+ * detection might get hairy. Two examples: (1) when at least one operand is
+ * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
+ * and the result is < the minimum normal.
+ */
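For reference, a minimal sketch (not part of the patch) of the fenv(3)-style flag probing that the comment above calls too slow; the helper name and its out-parameter are invented here, and only the standard <fenv.h> calls are assumed:

    #include <fenv.h>
    /* Strictly, "#pragma STDC FENV_ACCESS ON" is also required. */

    /* Probe the host's accrued flags around a single FP operation. */
    static double add_and_probe_inexact(double a, double b, int *inexact)
    {
        double r;

        feclearexcept(FE_ALL_EXCEPT);               /* clear host flags */
        r = a + b;                                  /* the FP operation */
        *inexact = fetestexcept(FE_INEXACT) != 0;   /* read them back */
        return r;
    }

Hardfloat sidesteps this round trip: it only uses the host FPU when the guest's inexact flag is already set (see can_use_fpu() below), so the host flags never have to be read, and the remaining flags are inferred from cheap checks on the operands and the result.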
+#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
+    static inline void name(soft_t *a, float_status *s)                 \
+    {                                                                   \
+        if (unlikely(soft_t ## _is_denormal(*a))) {                     \
+            *a = soft_t ## _set_sign(soft_t ## _zero,                   \
+                                     soft_t ## _is_neg(*a));            \
+            s->float_exception_flags |= float_flag_input_denormal;      \
+        }                                                               \
+    }
+
+GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
+GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
+#undef GEN_INPUT_FLUSH__NOCHECK
+
+#define GEN_INPUT_FLUSH1(name, soft_t)                  \
+    static inline void name(soft_t *a, float_status *s) \
+    {                                                   \
+        if (likely(!s->flush_inputs_to_zero)) {         \
+            return;                                     \
+        }                                               \
+        soft_t ## _input_flush__nocheck(a, s);          \
+    }
+
+GEN_INPUT_FLUSH1(float32_input_flush1, float32)
+GEN_INPUT_FLUSH1(float64_input_flush1, float64)
+#undef GEN_INPUT_FLUSH1
+
+#define GEN_INPUT_FLUSH2(name, soft_t)                                  \
+    static inline void name(soft_t *a, soft_t *b, float_status *s)      \
+    {                                                                   \
+        if (likely(!s->flush_inputs_to_zero)) {                         \
+            return;                                                     \
+        }                                                               \
+        soft_t ## _input_flush__nocheck(a, s);                          \
+        soft_t ## _input_flush__nocheck(b, s);                          \
+    }
+
+GEN_INPUT_FLUSH2(float32_input_flush2, float32)
+GEN_INPUT_FLUSH2(float64_input_flush2, float64)
+#undef GEN_INPUT_FLUSH2
+
+#define GEN_INPUT_FLUSH3(name, soft_t)                                  \
+    static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
+    {                                                                   \
+        if (likely(!s->flush_inputs_to_zero)) {                         \
+            return;                                                     \
+        }                                                               \
+        soft_t ## _input_flush__nocheck(a, s);                          \
+        soft_t ## _input_flush__nocheck(b, s);                          \
+        soft_t ## _input_flush__nocheck(c, s);                          \
+    }
+
+GEN_INPUT_FLUSH3(float32_input_flush3, float32)
+GEN_INPUT_FLUSH3(float64_input_flush3, float64)
+#undef GEN_INPUT_FLUSH3
+
+/*
+ * Choose whether to use fpclassify or float32/64_* primitives in the generated
+ * hardfloat functions. Each combination of number of inputs and float size
+ * gets its own value.
+ */
+#if defined(__x86_64__)
+# define QEMU_HARDFLOAT_1F32_USE_FP 0
+# define QEMU_HARDFLOAT_1F64_USE_FP 1
+# define QEMU_HARDFLOAT_2F32_USE_FP 0
+# define QEMU_HARDFLOAT_2F64_USE_FP 1
+# define QEMU_HARDFLOAT_3F32_USE_FP 0
+# define QEMU_HARDFLOAT_3F64_USE_FP 1
+#else
+# define QEMU_HARDFLOAT_1F32_USE_FP 0
+# define QEMU_HARDFLOAT_1F64_USE_FP 0
+# define QEMU_HARDFLOAT_2F32_USE_FP 0
+# define QEMU_HARDFLOAT_2F64_USE_FP 0
+# define QEMU_HARDFLOAT_3F32_USE_FP 0
+# define QEMU_HARDFLOAT_3F64_USE_FP 0
+#endif
+
+/*
+ * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
+ * float{32,64}_is_infinity when !USE_FP.
+ * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
+ * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
+ */
+#if defined(__x86_64__) || defined(__aarch64__)
+# define QEMU_HARDFLOAT_USE_ISINF   1
+#else
+# define QEMU_HARDFLOAT_USE_ISINF   0
+#endif
+
+/*
+ * Some targets clear the FP flags before most FP operations. This prevents
+ * the use of hardfloat, since hardfloat relies on the inexact flag being
+ * already set.
+ */
+#if defined(TARGET_PPC) || defined(__FAST_MATH__)
+# if defined(__FAST_MATH__)
+#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
+    IEEE implementation
+# endif
+# define QEMU_NO_HARDFLOAT 1
+# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
+#else
+# define QEMU_NO_HARDFLOAT 0
+# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
+#endif
+
+static inline bool can_use_fpu(const float_status *s)
+{
+    if (QEMU_NO_HARDFLOAT) {
+        return false;
+    }
+    return likely(s->float_exception_flags & float_flag_inexact &&
+                  s->float_rounding_mode == float_round_nearest_even);
+}
+
+/*
+ * Hardfloat generation functions. Each operation can have two flavors:
+ * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
+ * most condition checks, or native ones (e.g. fpclassify).
+ *
+ * The flavor is chosen by the callers. Instead of using macros, we rely on the
+ * compiler to propagate constants and inline everything into the callers.
+ *
+ * We only generate functions for operations with two inputs, since only
+ * these are common enough to justify consolidating them into common code.
+ */
+
+typedef union {
+    float32 s;
+    float h;
+} union_float32;
+
+typedef union {
+    float64 s;
+    double h;
+} union_float64;
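As a hedged aside on the unions above (this assumes the host's float and double are IEEE 754 binary32/64, which the hardfloat paths rely on): storing a softfloat value through the .s member and reading .h reinterprets the same bit pattern as a host-native value, with no conversion. For example, using make_float32() from "fpu/softfloat.h":

    union_float32 u;

    u.s = make_float32(0x3f800000);  /* softfloat bit pattern of 1.0f */
    /* u.h now reads back as 1.0f: same bits, seen by the host FPU. */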
+
+typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
+typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
+
+typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
+typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
+typedef float   (*hard_f32_op2_fn)(float a, float b);
+typedef double  (*hard_f64_op2_fn)(double a, double b);
+
+/* 2-input is-zero-or-normal */
+static inline bool f32_is_zon2(union_float32 a, union_float32 b)
+{
+    if (QEMU_HARDFLOAT_2F32_USE_FP) {
+        /*
+         * Not using a temp variable for consecutive fpclassify calls ends up
+         * generating faster code.
+         */
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
+    }
+    return float32_is_zero_or_normal(a.s) &&
+           float32_is_zero_or_normal(b.s);
+}
+
+static inline bool f64_is_zon2(union_float64 a, union_float64 b)
+{
+    if (QEMU_HARDFLOAT_2F64_USE_FP) {
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
+    }
+    return float64_is_zero_or_normal(a.s) &&
+           float64_is_zero_or_normal(b.s);
+}
+
+/* 3-input is-zero-or-normal */
+static inline
+bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
+{
+    if (QEMU_HARDFLOAT_3F32_USE_FP) {
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
+               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
+    }
+    return float32_is_zero_or_normal(a.s) &&
+           float32_is_zero_or_normal(b.s) &&
+           float32_is_zero_or_normal(c.s);
+}
+
+static inline
+bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
+{
+    if (QEMU_HARDFLOAT_3F64_USE_FP) {
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
+               (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
+    }
+    return float64_is_zero_or_normal(a.s) &&
+           float64_is_zero_or_normal(b.s) &&
+           float64_is_zero_or_normal(c.s);
+}
+
+static inline bool f32_is_inf(union_float32 a)
+{
+    if (QEMU_HARDFLOAT_USE_ISINF) {
+        return isinf(a.h);
+    }
+    return float32_is_infinity(a.s);
+}
+
+static inline bool f64_is_inf(union_float64 a)
+{
+    if (QEMU_HARDFLOAT_USE_ISINF) {
+        return isinf(a.h);
+    }
+    return float64_is_infinity(a.s);
+}
+
+/* Note: @fast_test and @post can be NULL */
+static inline float32
+float32_gen2(float32 xa, float32 xb, float_status *s,
+             hard_f32_op2_fn hard, soft_f32_op2_fn soft,
+             f32_check_fn pre, f32_check_fn post,
+             f32_check_fn fast_test, soft_f32_op2_fn fast_op)
+{
+    union_float32 ua, ub, ur;
+
+    ua.s = xa;
+    ub.s = xb;
+
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float32_input_flush2(&ua.s, &ub.s, s);
+    if (unlikely(!pre(ua, ub))) {
+        goto soft;
+    }
+    if (fast_test && fast_test(ua, ub)) {
+        return fast_op(ua.s, ub.s, s);
+    }
+
+    ur.h = hard(ua.h, ub.h);
+    if (unlikely(f32_is_inf(ur))) {
+        s->float_exception_flags |= float_flag_overflow;
+    } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
+        if (post == NULL || post(ua, ub)) {
+            goto soft;
+        }
+    }
+    return ur.s;
+
+ soft:
+    return soft(ua.s, ub.s, s);
+}
+
+static inline float64
+float64_gen2(float64 xa, float64 xb, float_status *s,
+             hard_f64_op2_fn hard, soft_f64_op2_fn soft,
+             f64_check_fn pre, f64_check_fn post,
+             f64_check_fn fast_test, soft_f64_op2_fn fast_op)
+{
+    union_float64 ua, ub, ur;
+
+    ua.s = xa;
+    ub.s = xb;
+
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float64_input_flush2(&ua.s, &ub.s, s);
+    if (unlikely(!pre(ua, ub))) {
+        goto soft;
+    }
+    if (fast_test && fast_test(ua, ub)) {
+        return fast_op(ua.s, ub.s, s);
+    }
+
+    ur.h = hard(ua.h, ub.h);
+    if (unlikely(f64_is_inf(ur))) {
+        s->float_exception_flags |= float_flag_overflow;
+    } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
+        if (post == NULL || post(ua, ub)) {
+            goto soft;
+        }
+    }
+    return ur.s;
+
+ soft:
+    return soft(ua.s, ub.s, s);
+}
+
 /*----------------------------------------------------------------------------
 | Returns the fraction bits of the half-precision floating-point value `a'.
 *----------------------------------------------------------------------------*/
@@ -336,8 +655,8 @@ static inline float64 float64_pack_raw(FloatParts p)
 #include "softfloat-specialize.h"
 
 /* Canonicalize EXP and FRAC, setting CLS.  */
-static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
-                               float_status *status)
+static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
+                                  float_status *status)
 {
     if (part.exp == parm->exp_max && !parm->arm_althp) {
         if (part.frac == 0) {
@@ -513,7 +832,7 @@ static FloatParts round_canonical(FloatParts p, float_status *s,
 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
                                             const FloatFmt *params)
 {
-    return canonicalize(float16_unpack_raw(f), params, s);
+    return sf_canonicalize(float16_unpack_raw(f), params, s);
 }
 
 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
@@ -534,7 +853,7 @@ static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
 
 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
 {
-    return canonicalize(float32_unpack_raw(f), &float32_params, s);
+    return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
 }
 
 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
@@ -544,7 +863,7 @@ static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
 
 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
 {
-    return canonicalize(float64_unpack_raw(f), &float64_params, s);
+    return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
 }
 
 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
@@ -735,49 +1054,128 @@ float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status)
+float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
+{
+    FloatParts pa = float16_unpack_canonical(a, status);
+    FloatParts pb = float16_unpack_canonical(b, status);
+    FloatParts pr = addsub_floats(pa, pb, true, status);
+
+    return float16_round_pack_canonical(pr, status);
+}
+
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
-    FloatParts pr = addsub_floats(pa, pb, false, status);
+    FloatParts pr = addsub_floats(pa, pb, subtract, status);
 
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status)
+static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
+{
+    return soft_f32_addsub(a, b, false, status);
+}
+
+static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
+{
+    return soft_f32_addsub(a, b, true, status);
+}
+
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
-    FloatParts pr = addsub_floats(pa, pb, false, status);
+    FloatParts pr = addsub_floats(pa, pb, subtract, status);
 
     return float64_round_pack_canonical(pr, status);
 }
 
-float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
+static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
 {
-    FloatParts pa = float16_unpack_canonical(a, status);
-    FloatParts pb = float16_unpack_canonical(b, status);
-    FloatParts pr = addsub_floats(pa, pb, true, status);
+    return soft_f64_addsub(a, b, false, status);
+}
 
-    return float16_round_pack_canonical(pr, status);
+static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
+{
+    return soft_f64_addsub(a, b, true, status);
 }
 
-float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status)
+static float hard_f32_add(float a, float b)
 {
-    FloatParts pa = float32_unpack_canonical(a, status);
-    FloatParts pb = float32_unpack_canonical(b, status);
-    FloatParts pr = addsub_floats(pa, pb, true, status);
+    return a + b;
+}
 
-    return float32_round_pack_canonical(pr, status);
+static float hard_f32_sub(float a, float b)
+{
+    return a - b;
 }
 
-float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status)
+static double hard_f64_add(double a, double b)
 {
-    FloatParts pa = float64_unpack_canonical(a, status);
-    FloatParts pb = float64_unpack_canonical(b, status);
-    FloatParts pr = addsub_floats(pa, pb, true, status);
+    return a + b;
+}
 
-    return float64_round_pack_canonical(pr, status);
+static double hard_f64_sub(double a, double b)
+{
+    return a - b;
+}
+
+static bool f32_addsub_post(union_float32 a, union_float32 b)
+{
+    if (QEMU_HARDFLOAT_2F32_USE_FP) {
+        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
+    }
+    return !(float32_is_zero(a.s) && float32_is_zero(b.s));
+}
+
+static bool f64_addsub_post(union_float64 a, union_float64 b)
+{
+    if (QEMU_HARDFLOAT_2F64_USE_FP) {
+        return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
+    } else {
+        return !(float64_is_zero(a.s) && float64_is_zero(b.s));
+    }
+}
+
+static float32 float32_addsub(float32 a, float32 b, float_status *s,
+                              hard_f32_op2_fn hard, soft_f32_op2_fn soft)
+{
+    return float32_gen2(a, b, s, hard, soft,
+                        f32_is_zon2, f32_addsub_post, NULL, NULL);
+}
+
+static float64 float64_addsub(float64 a, float64 b, float_status *s,
+                              hard_f64_op2_fn hard, soft_f64_op2_fn soft)
+{
+    return float64_gen2(a, b, s, hard, soft,
+                        f64_is_zon2, f64_addsub_post, NULL, NULL);
+}
+
+float32 QEMU_FLATTEN
+float32_add(float32 a, float32 b, float_status *s)
+{
+    return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
+}
+
+float32 QEMU_FLATTEN
+float32_sub(float32 a, float32 b, float_status *s)
+{
+    return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
+}
+
+float64 QEMU_FLATTEN
+float64_add(float64 a, float64 b, float_status *s)
+{
+    return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
+}
+
+float64 QEMU_FLATTEN
+float64_sub(float64 a, float64 b, float_status *s)
+{
+    return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
 }
 
 /*
@@ -838,7 +1236,8 @@ float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_mul(float32 a, float32 b, float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
@@ -847,7 +1246,8 @@ float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_mul(float64 a, float64 b, float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
@@ -856,6 +1256,54 @@ float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
     return float64_round_pack_canonical(pr, status);
 }
 
+static float hard_f32_mul(float a, float b)
+{
+    return a * b;
+}
+
+static double hard_f64_mul(double a, double b)
+{
+    return a * b;
+}
+
+static bool f32_mul_fast_test(union_float32 a, union_float32 b)
+{
+    return float32_is_zero(a.s) || float32_is_zero(b.s);
+}
+
+static bool f64_mul_fast_test(union_float64 a, union_float64 b)
+{
+    return float64_is_zero(a.s) || float64_is_zero(b.s);
+}
+
+static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
+{
+    bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
+
+    return float32_set_sign(float32_zero, signbit);
+}
+
+static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
+{
+    bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
+
+    return float64_set_sign(float64_zero, signbit);
+}
+
+float32 QEMU_FLATTEN
+float32_mul(float32 a, float32 b, float_status *s)
+{
+    return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
+                        f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
+}
+
+float64 QEMU_FLATTEN
+float64_mul(float64 a, float64 b, float_status *s)
+{
+    return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
+                        f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
+}
+
 /*
  * Returns the result of multiplying the floating-point values `a' and
  * `b' then adding 'c', with no intermediate rounding step after the
@@ -1070,8 +1518,9 @@ float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
-                                                int flags, float_status *status)
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
+                float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
@@ -1081,8 +1530,9 @@ float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
-                                                int flags, float_status *status)
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
+                float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
@@ -1092,6 +1542,128 @@ float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
     return float64_round_pack_canonical(pr, status);
 }
 
+float32 QEMU_FLATTEN
+float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
+{
+    union_float32 ua, ub, uc, ur;
+
+    ua.s = xa;
+    ub.s = xb;
+    uc.s = xc;
+
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+    if (unlikely(flags & float_muladd_halve_result)) {
+        goto soft;
+    }
+
+    float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    if (unlikely(!f32_is_zon3(ua, ub, uc))) {
+        goto soft;
+    }
+    /*
+     * When a == 0 or b == 0, there's no need to check for under/overflow,
+     * since we know the addend is either normal or 0 and the product is 0.
+     */
+    if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
+        union_float32 up;
+        bool prod_sign;
+
+        prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
+        prod_sign ^= !!(flags & float_muladd_negate_product);
+        up.s = float32_set_sign(float32_zero, prod_sign);
+
+        if (flags & float_muladd_negate_c) {
+            uc.h = -uc.h;
+        }
+        ur.h = up.h + uc.h;
+    } else {
+        if (flags & float_muladd_negate_product) {
+            ua.h = -ua.h;
+        }
+        if (flags & float_muladd_negate_c) {
+            uc.h = -uc.h;
+        }
+
+        ur.h = fmaf(ua.h, ub.h, uc.h);
+
+        if (unlikely(f32_is_inf(ur))) {
+            s->float_exception_flags |= float_flag_overflow;
+        } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
+            goto soft;
+        }
+    }
+    if (flags & float_muladd_negate_result) {
+        return float32_chs(ur.s);
+    }
+    return ur.s;
+
+ soft:
+    return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
+}
+
+float64 QEMU_FLATTEN
+float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
+{
+    union_float64 ua, ub, uc, ur;
+
+    ua.s = xa;
+    ub.s = xb;
+    uc.s = xc;
+
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+    if (unlikely(flags & float_muladd_halve_result)) {
+        goto soft;
+    }
+
+    float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
+    if (unlikely(!f64_is_zon3(ua, ub, uc))) {
+        goto soft;
+    }
+    /*
+     * When a == 0 or b == 0, there's no need to check for under/overflow,
+     * since we know the addend is either normal or 0 and the product is 0.
+     */
+    if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
+        union_float64 up;
+        bool prod_sign;
+
+        prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
+        prod_sign ^= !!(flags & float_muladd_negate_product);
+        up.s = float64_set_sign(float64_zero, prod_sign);
+
+        if (flags & float_muladd_negate_c) {
+            uc.h = -uc.h;
+        }
+        ur.h = up.h + uc.h;
+    } else {
+        if (flags & float_muladd_negate_product) {
+            ua.h = -ua.h;
+        }
+        if (flags & float_muladd_negate_c) {
+            uc.h = -uc.h;
+        }
+
+        ur.h = fma(ua.h, ub.h, uc.h);
+
+        if (unlikely(f64_is_inf(ur))) {
+            s->float_exception_flags |= float_flag_overflow;
+        } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
+            goto soft;
+        }
+    }
+    if (flags & float_muladd_negate_result) {
+        return float64_chs(ur.s);
+    }
+    return ur.s;
+
+ soft:
+    return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
+}
+
 /*
  * Returns the result of dividing the floating-point value `a' by the
  * corresponding value `b'. The operation is performed according to
@@ -1180,7 +1752,8 @@ float16 float16_div(float16 a, float16 b, float_status *status)
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 float32_div(float32 a, float32 b, float_status *status)
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_div(float32 a, float32 b, float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pb = float32_unpack_canonical(b, status);
@@ -1189,7 +1762,8 @@ float32 float32_div(float32 a, float32 b, float_status *status)
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 float64_div(float64 a, float64 b, float_status *status)
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_div(float64 a, float64 b, float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pb = float64_unpack_canonical(b, status);
@@ -1198,6 +1772,64 @@ float64 float64_div(float64 a, float64 b, float_status *status)
     return float64_round_pack_canonical(pr, status);
 }
 
+static float hard_f32_div(float a, float b)
+{
+    return a / b;
+}
+
+static double hard_f64_div(double a, double b)
+{
+    return a / b;
+}
+
+static bool f32_div_pre(union_float32 a, union_float32 b)
+{
+    if (QEMU_HARDFLOAT_2F32_USE_FP) {
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               fpclassify(b.h) == FP_NORMAL;
+    }
+    return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
+}
+
+static bool f64_div_pre(union_float64 a, union_float64 b)
+{
+    if (QEMU_HARDFLOAT_2F64_USE_FP) {
+        return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
+               fpclassify(b.h) == FP_NORMAL;
+    }
+    return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
+}
+
+static bool f32_div_post(union_float32 a, union_float32 b)
+{
+    if (QEMU_HARDFLOAT_2F32_USE_FP) {
+        return fpclassify(a.h) != FP_ZERO;
+    }
+    return !float32_is_zero(a.s);
+}
+
+static bool f64_div_post(union_float64 a, union_float64 b)
+{
+    if (QEMU_HARDFLOAT_2F64_USE_FP) {
+        return fpclassify(a.h) != FP_ZERO;
+    }
+    return !float64_is_zero(a.s);
+}
+
+float32 QEMU_FLATTEN
+float32_div(float32 a, float32 b, float_status *s)
+{
+    return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
+                        f32_div_pre, f32_div_post, NULL, NULL);
+}
+
+float64 QEMU_FLATTEN
+float64_div(float64 a, float64 b, float_status *s)
+{
+    return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
+                        f64_div_pre, f64_div_post, NULL, NULL);
+}
+
 /*
  * Float to Float conversions
  *
@@ -2271,28 +2903,109 @@ static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
     }
 }
 
-#define COMPARE(sz)                                                     \
-int float ## sz ## _compare(float ## sz a, float ## sz b,               \
-                            float_status *s)                            \
+#define COMPARE(name, attr, sz)                                         \
+static int attr                                                         \
+name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
 {                                                                       \
     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
-    return compare_floats(pa, pb, false, s);                            \
-}                                                                       \
-int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
-                                  float_status *s)                      \
-{                                                                       \
-    FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
-    FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
-    return compare_floats(pa, pb, true, s);                             \
+    return compare_floats(pa, pb, is_quiet, s);                         \
 }
 
-COMPARE(16)
-COMPARE(32)
-COMPARE(64)
+COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
+COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
+COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
 
 #undef COMPARE
 
+int float16_compare(float16 a, float16 b, float_status *s)
+{
+    return soft_f16_compare(a, b, false, s);
+}
+
+int float16_compare_quiet(float16 a, float16 b, float_status *s)
+{
+    return soft_f16_compare(a, b, true, s);
+}
+
+static int QEMU_FLATTEN
+f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
+{
+    union_float32 ua, ub;
+
+    ua.s = xa;
+    ub.s = xb;
+
+    if (QEMU_NO_HARDFLOAT) {
+        goto soft;
+    }
+
+    float32_input_flush2(&ua.s, &ub.s, s);
+    if (isgreaterequal(ua.h, ub.h)) {
+        if (isgreater(ua.h, ub.h)) {
+            return float_relation_greater;
+        }
+        return float_relation_equal;
+    }
+    if (likely(isless(ua.h, ub.h))) {
+        return float_relation_less;
+    }
+    /* The only condition remaining is unordered.
+     * Fall through to set flags.
+     */
+ soft:
+    return soft_f32_compare(ua.s, ub.s, is_quiet, s);
+}
+
+int float32_compare(float32 a, float32 b, float_status *s)
+{
+    return f32_compare(a, b, false, s);
+}
+
+int float32_compare_quiet(float32 a, float32 b, float_status *s)
+{
+    return f32_compare(a, b, true, s);
+}
+
+static int QEMU_FLATTEN
+f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
+{
+    union_float64 ua, ub;
+
+    ua.s = xa;
+    ub.s = xb;
+
+    if (QEMU_NO_HARDFLOAT) {
+        goto soft;
+    }
+
+    float64_input_flush2(&ua.s, &ub.s, s);
+    if (isgreaterequal(ua.h, ub.h)) {
+        if (isgreater(ua.h, ub.h)) {
+            return float_relation_greater;
+        }
+        return float_relation_equal;
+    }
+    if (likely(isless(ua.h, ub.h))) {
+        return float_relation_less;
+    }
+    /* The only condition remaining is unordered.
+     * Fall through to set flags.
+     */
+ soft:
+    return soft_f64_compare(ua.s, ub.s, is_quiet, s);
+}
+
+int float64_compare(float64 a, float64 b, float_status *s)
+{
+    return f64_compare(a, b, false, s);
+}
+
+int float64_compare_quiet(float64 a, float64 b, float_status *s)
+{
+    return f64_compare(a, b, true, s);
+}
+
 /* Multiply A by 2 raised to the power N.  */
 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
 {
@@ -2412,20 +3125,76 @@ float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
     return float16_round_pack_canonical(pr, status);
 }
 
-float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_sqrt(float32 a, float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pr = sqrt_float(pa, status, &float32_params);
     return float32_round_pack_canonical(pr, status);
 }
 
-float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_sqrt(float64 a, float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pr = sqrt_float(pa, status, &float64_params);
     return float64_round_pack_canonical(pr, status);
 }
 
+float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
+{
+    union_float32 ua, ur;
+
+    ua.s = xa;
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float32_input_flush1(&ua.s, s);
+    if (QEMU_HARDFLOAT_1F32_USE_FP) {
+        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
+                       fpclassify(ua.h) == FP_ZERO) ||
+                     signbit(ua.h))) {
+            goto soft;
+        }
+    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
+                        float32_is_neg(ua.s))) {
+        goto soft;
+    }
+    ur.h = sqrtf(ua.h);
+    return ur.s;
+
+ soft:
+    return soft_f32_sqrt(ua.s, s);
+}
+
+float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
+{
+    union_float64 ua, ur;
+
+    ua.s = xa;
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float64_input_flush1(&ua.s, s);
+    if (QEMU_HARDFLOAT_1F64_USE_FP) {
+        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
+                       fpclassify(ua.h) == FP_ZERO) ||
+                     signbit(ua.h))) {
+            goto soft;
+        }
+    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
+                        float64_is_neg(ua.s))) {
+        goto soft;
+    }
+    ur.h = sqrt(ua.h);
+    return ur.s;
+
+ soft:
+    return soft_f64_sqrt(ua.s, s);
+}
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated NaN.
 *----------------------------------------------------------------------------*/