summary refs log tree commit diff stats
path: root/include
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2024-05-03 08:13:51 -0700
committerRichard Henderson <richard.henderson@linaro.org>2024-05-03 08:13:51 -0700
commit909aff7eaf6335aeeb4962fb0ac2a6c571c96af2 (patch)
tree6da740ce80908710c400399e25b784660d59386c /include
parent4977ce198d2390bff8c71ad5cb1a5f6aa24b56fb (diff)
parenta06d9eddb015a9f5895161b0a3958a2e4be21579 (diff)
downloadfocaccia-qemu-909aff7eaf6335aeeb4962fb0ac2a6c571c96af2.tar.gz
focaccia-qemu-909aff7eaf6335aeeb4962fb0ac2a6c571c96af2.zip
Merge tag 'pull-misc-20240503' of https://gitlab.com/rth7680/qemu into staging
util/bufferiszero:
  - Remove sse4.1 and avx512 variants
  - Reorganize for early test for acceleration
  - Remove useless prefetches
  - Optimize sse2, avx2 and integer variants
  - Add simd acceleration for aarch64
  - Add bufferiszero-bench

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmY0/qMdHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV+ULQf/T2JSdvG6/EjDCf4N
# cnSGiUV2MIeByw8tkrc/fWCNdlulHhk9gbg9l+f2muwK8H/k2BdynbrQnt1Ymmtk
# xzM6+PNOcByaovSAkvNweZVbrQX36Yih9S7f3n+xcxfVuvvYhKSLHXLkeqO96LMd
# rN+WRpxhReaU3n8/FO7o3S26SRpk7X9kRfShaT7U7ytHGjGsXUvMKIRs30hbsJTB
# yjed0a0u54FoSlN6AEqjWdgzaWP8nT65+8Yxe3dzB9hx09UiolZo60eHqYy7Mkno
# N6aMOB6gUUbCiKZ3Qk+1zEX97vl26NH3zt5tIIJTWDoIkC3f9qbg1x5hwWLQ3rra
# rM8h8w==
# =DnZO
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 03 May 2024 08:11:31 AM PDT
# gpg:                using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg:                issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [ultimate]

* tag 'pull-misc-20240503' of https://gitlab.com/rth7680/qemu:
  tests/bench: Add bufferiszero-bench
  util/bufferiszero: Add simd acceleration for aarch64
  util/bufferiszero: Simplify test_buffer_is_zero_next_accel
  util/bufferiszero: Introduce biz_accel_fn typedef
  util/bufferiszero: Improve scalar variant
  util/bufferiszero: Optimize SSE2 and AVX2 variants
  util/bufferiszero: Remove useless prefetches
  util/bufferiszero: Reorganize for early test for acceleration
  util/bufferiszero: Remove AVX512 variant
  util/bufferiszero: Remove SSE4.1 variant

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'include')
-rw-r--r--include/qemu/cutils.h32
1 files changed, 31 insertions, 1 deletions
diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 92c927a6a3..741dade7cf 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -187,9 +187,39 @@ char *freq_to_str(uint64_t freq_hz);
 /* used to print char* safely */
 #define STR_OR_NULL(str) ((str) ? (str) : "null")
 
-bool buffer_is_zero(const void *buf, size_t len);
+/*
+ * Check if a buffer is all zeroes.
+ */
+
+bool buffer_is_zero_ool(const void *vbuf, size_t len);
+bool buffer_is_zero_ge256(const void *vbuf, size_t len);
 bool test_buffer_is_zero_next_accel(void);
 
+static inline bool buffer_is_zero_sample3(const char *buf, size_t len)
+{
+    /*
+     * For any reasonably sized buffer, these three samples come from
+     * three different cachelines.  In qemu-img usage, we find that
+     * each byte eliminates more than half of all buffer testing.
+     * It is therefore critical to performance that the byte tests
+     * short-circuit, so that we do not pull in additional cache lines.
+     * Do not "optimize" this to !(a | b | c).
+     */
+    return !buf[0] && !buf[len - 1] && !buf[len / 2];
+}
+
+#ifdef __OPTIMIZE__
+static inline bool buffer_is_zero(const void *buf, size_t len)
+{
+    return (__builtin_constant_p(len) && len >= 256
+            ? buffer_is_zero_sample3(buf, len) &&
+              buffer_is_zero_ge256(buf, len)
+            : buffer_is_zero_ool(buf, len));
+}
+#else
+#define buffer_is_zero  buffer_is_zero_ool
+#endif
+
 /*
  * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
  * Input is limited to 14-bit numbers