From 8db9edbc1852210d60eaf5b42240e37fa2c1c969 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Mon, 1 Jun 2026 00:48:45 +0100 Subject: [PATCH 1/6] EXP-050: straight-line nested-if integer-part scan (ported from ffc EXP-026/028) Peel the first 5 iterations of the integer-part digit loop into nested ifs, eliminating the loop back-edge for the common 1-5 digit integer case. Identical semantics (i = 10*i + digit). Biggest win on inputs with multi-digit integer parts (mesh 3D coordinates). ARM Graviton4 (canonical MB/s, vs upstream 7790aa6 baseline): GCC: random +0.05%, canada +4.0%, mesh +34.3% (c/f 55.7->41.4) Clang: random +4.9%, canada +2.8%, mesh +5.1% Correctness: 14/14 core+supplemental pass. Co-Authored-By: Claude Opus 4.8 --- include/fast_float/ascii_number.h | 35 +++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 12c2fddc..257b43b6 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -354,13 +354,36 @@ parse_number_string(UC const *p, UC const *pend, uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) - while ((p != pend) && is_integer(*p)) { - // a multiplication by 10 is cheaper than an arbitrary integer - // multiplication - i = 10 * i + - uint64_t(*p - - UC('0')); // might overflow, we will handle the overflow later + // Straight-line unroll of the integer-part scan: most integer parts are + // 1-5 digits, so peeling the first iterations eliminates the loop back-edge + // for the common case (ported from ffc EXP-026/028). Semantics are identical + // to the original `while` loop: i = 10*i + digit, advancing p. 
+ if ((p != pend) && is_integer(*p)) { + i = uint64_t(*p - UC('0')); ++p; + if ((p != pend) && is_integer(*p)) { + i = 10 * i + uint64_t(*p - UC('0')); + ++p; + if ((p != pend) && is_integer(*p)) { + i = 10 * i + uint64_t(*p - UC('0')); + ++p; + if ((p != pend) && is_integer(*p)) { + i = 10 * i + uint64_t(*p - UC('0')); + ++p; + if ((p != pend) && is_integer(*p)) { + i = 10 * i + uint64_t(*p - UC('0')); + ++p; + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - UC('0')); // might overflow, handled later + ++p; + } + } + } + } + } } UC const *const end_of_integer_part = p; int64_t digit_count = int64_t(end_of_integer_part - start_digits); From ee849467021be4fbef05e90913528cb100127bc1 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Mon, 1 Jun 2026 00:56:52 +0100 Subject: [PATCH 2/6] EXP-052: 2x unroll of char loop_parse_if_eight_digits (ported from ffc EXP-044) Clang/AArch64-gated 16-digit-per-iteration unroll of the fraction SWAR loop; eliminates the back-edge for typical 17-digit [0,1] mantissas. GCC keeps the auto-unrolled simple loop. ARM Graviton4 (canonical fast_float MB/s vs EXP-050): Clang: random +2.8% (1365.7 from 1328.8), mesh +1.7%, canada +0.5% GCC: unchanged (#else path) Correctness: 14/14 pass. Co-Authored-By: Claude Opus 4.8 --- include/fast_float/ascii_number.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 257b43b6..6c79509b 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -259,6 +259,34 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void loop_parse_if_eight_digits(char const *&p, char const *const pend, uint64_t &i) { // optimizes better than parse_if_eight_digits_unrolled() for UC = char. 
+#if defined(__aarch64__) && defined(__clang__) + // 2x unroll (ported from ffc EXP-044): on Clang/AArch64, consuming 16 digits + // per iteration eliminates the loop back-edge for typical fractions (e.g. the + // 17-digit mantissas of uniform [0,1] inputs) and keeps the SWAR constants + // resident. GCC auto-unrolls the simple loop well, so it keeps the original. + while ((pend - p) >= 16) { + uint64_t val1 = read8_to_u64(p); + if (!is_made_of_eight_digits_fast(val1)) { + break; + } + uint64_t val2 = read8_to_u64(p + 8); + if (!is_made_of_eight_digits_fast(val2)) { + i = i * 100000000 + parse_eight_digits_unrolled(val1); + p += 8; + return; // val2 is not a full digit block; caller's byte loop handles rest + } + i = (i * 100000000 + parse_eight_digits_unrolled(val1)) * 100000000 + + parse_eight_digits_unrolled(val2); // in rare cases overflows, that's ok + p += 16; + } + if ((pend - p) >= 8) { + uint64_t val = read8_to_u64(p); + if (is_made_of_eight_digits_fast(val)) { + i = i * 100000000 + parse_eight_digits_unrolled(val); + p += 8; + } + } +#else while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) { i = i * 100000000 + @@ -266,6 +294,7 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, p)); // in rare cases, this will overflow, but that's ok p += 8; } +#endif } enum class parse_error { From a30c1f3d3f42dde8c34478215edbe3ce3234b350 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Mon, 1 Jun 2026 01:04:30 +0100 Subject: [PATCH 3/6] EXP-053: 4-digit SWAR follow-up in loop_parse_if_eight_digits, GCC path (ffc EXP-001) After the 8-digit block loop, consume a remaining 4-7 digit run in one SWAR step (reusing fast_float's existing read4_to_u32 / is_made_of_four_digits_fast / parse_four_digits_unrolled) instead of byte-by-byte. GCC path only (strictly: the #else branch of the Clang/AArch64 gate, i.e. every build other than Clang/AArch64): on Clang/AArch64 the follow-up's presence bloated the 2x-unroll codegen and regressed random -6.2%.
ARM Graviton4 (canonical fast_float MB/s vs EXP-052): GCC: canada +2.6% (948.1 from 924.0, i/f 248.7->229.7), random/mesh flat Clang: unchanged (EXP-052 path preserved) Correctness: 14/14 pass. Co-Authored-By: Claude Opus 4.8 --- include/fast_float/ascii_number.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 6c79509b..371f3da3 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -294,6 +294,17 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, p)); // in rare cases, this will overflow, but that's ok p += 8; } + // 4-digit SWAR follow-up (ported from ffc EXP-001): consume a remaining 4-7 + // digit run in one step rather than byte-by-byte. GCC path only — on Clang + // the follow-up's presence bloated the 2x-unroll codegen and regressed random. + if ((pend - p) >= 4) { + uint32_t const val4 = read4_to_u32(p); + if (is_made_of_four_digits_fast(val4)) { + i = i * 10000 + + parse_four_digits_unrolled(val4); // in rare cases overflows, that's ok + p += 4; + } + } #endif } From 3ff2c0b8945408f63b381af4f22a6167e52b486d Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Mon, 1 Jun 2026 09:15:26 +0100 Subject: [PATCH 4/6] EXP-053: clang-format (reflow comment + expression wrap; no semantic change) Pre-clear the lint_and_format_check CI gate. clang-format-18 (CI pins 17; LLVM base style is identical for these constructs). Behavior/benchmarks unchanged. 
Co-Authored-By: Claude Opus 4.8 --- include/fast_float/ascii_number.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index 371f3da3..14caa0d2 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -296,12 +296,13 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, } // 4-digit SWAR follow-up (ported from ffc EXP-001): consume a remaining 4-7 // digit run in one step rather than byte-by-byte. GCC path only — on Clang - // the follow-up's presence bloated the 2x-unroll codegen and regressed random. + // the follow-up's presence bloated the 2x-unroll codegen and regressed + // random. if ((pend - p) >= 4) { uint32_t const val4 = read4_to_u32(p); if (is_made_of_four_digits_fast(val4)) { - i = i * 10000 + - parse_four_digits_unrolled(val4); // in rare cases overflows, that's ok + i = i * 10000 + parse_four_digits_unrolled( + val4); // in rare cases overflows, that's ok p += 4; } } From f4f36e04f7c7546086624f3995b26ea81bcaf675 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Fri, 3 Jul 2026 14:13:44 +0100 Subject: [PATCH 5/6] EXP-062: enable the 4-digit SWAR fraction follow-up on all compilers (drop __clang__ gate) --- include/fast_float/ascii_number.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h index e431cbcd..6fa640db 100644 --- a/include/fast_float/ascii_number.h +++ b/include/fast_float/ascii_number.h @@ -268,10 +268,9 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, } // Consume a remaining 4-7 digit run in a single SWAR step instead of // byte-by-byte (reuses the existing 4-digit helpers). The parsed result is - // identical either way. 
Gated to clang: on gcc the extra 4-digit check - // regresses inputs whose remainder is shorter than 4 digits (it becomes pure - // overhead there); clang does not show that. -#if defined(__clang__) + // identical either way. Historically gated to clang because gcc regressed on + // short remainders, but that verdict predates the span-elision restructure; + // with the leaner hot path the 4-digit step now wins on gcc as well. if ((pend - p) >= 4) { uint32_t const val4 = read4_to_u32(p); if (is_made_of_four_digits_fast(val4)) { @@ -280,7 +279,6 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend, p += 4; } } -#endif } enum class parse_error { From 5082489e352ce4904da4d2f652760b8781cae561 Mon Sep 17 00:00:00 2001 From: fcostaoliveira Date: Fri, 3 Jul 2026 14:15:29 +0100 Subject: [PATCH 6/6] EXP-063: test the mode-independent mantissa bound before the rounds_to_nearest probe --- include/fast_float/parse_number.h | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h index 7d338f3b..117ec696 100644 --- a/include/fast_float/parse_number.h +++ b/include/fast_float/parse_number.h @@ -198,7 +198,14 @@ clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_negative, // We proceed optimistically, assuming that detail::rounds_to_nearest() // returns true. if (binary_format::min_exponent_fast_path() <= exponent && - exponent <= binary_format::max_exponent_fast_path()) { + exponent <= binary_format::max_exponent_fast_path() && + mantissa <= binary_format::max_mantissa_fast_path()) { + // The mantissa bound above is a necessary condition for BOTH branches + // below: the rounding-mode-dependent branch checks the tighter + // max_mantissa_fast_path(exponent) <= max_mantissa_fast_path(). Testing + // it before detail::rounds_to_nearest() spares long-mantissa inputs + // (which can never take the fast path) the volatile-float probe. 
+ // // Unfortunately, the conventional Clinger's fast path is only possible // when the system rounds to the nearest float. // @@ -209,18 +216,16 @@ clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_negative, if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. - if (mantissa <= binary_format::max_mantissa_fast_path()) { - value = T(mantissa); - if (exponent < 0) { - value = value / binary_format::exact_power_of_ten(-exponent); - } else { - value = value * binary_format::exact_power_of_ten(exponent); - } - if (is_negative) { - value = -value; - } - return true; + value = T(mantissa); + if (exponent < 0) { + value = value / binary_format::exact_power_of_ten(-exponent); + } else { + value = value * binary_format::exact_power_of_ten(exponent); + } + if (is_negative) { + value = -value; } + return true; } else { // We do not have that fegetround() == FE_TONEAREST. // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's