From 8db9edbc1852210d60eaf5b42240e37fa2c1c969 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Mon, 1 Jun 2026 00:48:45 +0100
Subject: [PATCH 1/6] EXP-050: straight-line nested-if integer-part scan
 (ported from ffc EXP-026/028)

Peel the first 5 iterations of the integer-part digit loop into nested ifs,
eliminating the loop back-edge for the common 1-5 digit integer case. Identical
semantics (i = 10*i + digit). Biggest win on inputs with multi-digit integer
parts (mesh 3D coordinates).

ARM Graviton4 (canonical MB/s, vs upstream 7790aa6 baseline):
  GCC:   random +0.05%, canada +4.0%, mesh +34.3% (c/f 55.7->41.4)
  Clang: random +4.9%,  canada +2.8%, mesh +5.1%
Correctness: 14/14 core+supplemental pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 include/fast_float/ascii_number.h | 35 +++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 12c2fddc..257b43b6 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -354,13 +354,36 @@ parse_number_string(UC const *p, UC const *pend,
 
   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
 
-  while ((p != pend) && is_integer(*p)) {
-    // a multiplication by 10 is cheaper than an arbitrary integer
-    // multiplication
-    i = 10 * i +
-        uint64_t(*p -
-                 UC('0')); // might overflow, we will handle the overflow later
+  // Straight-line unroll of the integer-part scan: most integer parts are
+  // 1-5 digits, so peeling the first iterations eliminates the loop back-edge
+  // for the common case (ported from ffc EXP-026/028). Semantics are identical
+  // to the original `while` loop: i = 10*i + digit, advancing p.
+  if ((p != pend) && is_integer(*p)) {
+    i = uint64_t(*p - UC('0'));
     ++p;
+    if ((p != pend) && is_integer(*p)) {
+      i = 10 * i + uint64_t(*p - UC('0'));
+      ++p;
+      if ((p != pend) && is_integer(*p)) {
+        i = 10 * i + uint64_t(*p - UC('0'));
+        ++p;
+        if ((p != pend) && is_integer(*p)) {
+          i = 10 * i + uint64_t(*p - UC('0'));
+          ++p;
+          if ((p != pend) && is_integer(*p)) {
+            i = 10 * i + uint64_t(*p - UC('0'));
+            ++p;
+            while ((p != pend) && is_integer(*p)) {
+              // a multiplication by 10 is cheaper than an arbitrary integer
+              // multiplication
+              i = 10 * i +
+                  uint64_t(*p - UC('0')); // might overflow, handled later
+              ++p;
+            }
+          }
+        }
+      }
+    }
   }
   UC const *const end_of_integer_part = p;
   int64_t digit_count = int64_t(end_of_integer_part - start_digits);

From ee849467021be4fbef05e90913528cb100127bc1 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Mon, 1 Jun 2026 00:56:52 +0100
Subject: [PATCH 2/6] EXP-052: 2x unroll of char loop_parse_if_eight_digits
 (ported from ffc EXP-044)

Clang/AArch64-gated 16-digit-per-iteration unroll of the fraction SWAR loop;
eliminates the back-edge for typical 17-digit [0,1] mantissas. GCC keeps the
auto-unrolled simple loop.

ARM Graviton4 (canonical fast_float MB/s vs EXP-050):
  Clang: random +2.8% (1365.7 from 1328.8), mesh +1.7%, canada +0.5%
  GCC:   unchanged (#else path)
Correctness: 14/14 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 include/fast_float/ascii_number.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 257b43b6..6c79509b 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -259,6 +259,34 @@ fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
 loop_parse_if_eight_digits(char const *&p, char const *const pend,
                            uint64_t &i) {
   // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
+#if defined(__aarch64__) && defined(__clang__)
+  // 2x unroll (ported from ffc EXP-044): on Clang/AArch64, consuming 16 digits
+  // per iteration eliminates the loop back-edge for typical fractions (e.g. the
+  // 17-digit mantissas of uniform [0,1] inputs) and keeps the SWAR constants
+  // resident. GCC auto-unrolls the simple loop well, so it keeps the original.
+  while ((pend - p) >= 16) {
+    uint64_t val1 = read8_to_u64(p);
+    if (!is_made_of_eight_digits_fast(val1)) {
+      break;
+    }
+    uint64_t val2 = read8_to_u64(p + 8);
+    if (!is_made_of_eight_digits_fast(val2)) {
+      i = i * 100000000 + parse_eight_digits_unrolled(val1);
+      p += 8;
+      return; // val2 is not a full digit block; caller's byte loop handles rest
+    }
+    i = (i * 100000000 + parse_eight_digits_unrolled(val1)) * 100000000 +
+        parse_eight_digits_unrolled(val2); // in rare cases overflows, that's ok
+    p += 16;
+  }
+  if ((pend - p) >= 8) {
+    uint64_t val = read8_to_u64(p);
+    if (is_made_of_eight_digits_fast(val)) {
+      i = i * 100000000 + parse_eight_digits_unrolled(val);
+      p += 8;
+    }
+  }
+#else
   while ((std::distance(p, pend) >= 8) &&
          is_made_of_eight_digits_fast(read8_to_u64(p))) {
     i = i * 100000000 +
@@ -266,6 +294,7 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
             p)); // in rare cases, this will overflow, but that's ok
     p += 8;
   }
+#endif
 }
 
 enum class parse_error {

From a30c1f3d3f42dde8c34478215edbe3ce3234b350 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Mon, 1 Jun 2026 01:04:30 +0100
Subject: [PATCH 3/6] EXP-053: 4-digit SWAR follow-up in
 loop_parse_if_eight_digits, GCC path (ffc EXP-001)

After the 8-digit block loop, consume a remaining 4-7 digit run in one SWAR step
(reusing fast_float's existing read4_to_u32 / is_made_of_four_digits_fast /
parse_four_digits_unrolled) instead of byte-by-byte. The follow-up lives only in
the #else branch (every build except Clang/AArch64): on the Clang/AArch64
2x-unroll path its presence bloated the codegen and regressed random by 6.2%.

ARM Graviton4 (canonical fast_float MB/s vs EXP-052):
  GCC:   canada +2.6% (948.1 from 924.0, i/f 248.7->229.7), random/mesh flat
  Clang: unchanged (EXP-052 path preserved)
Correctness: 14/14 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 include/fast_float/ascii_number.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 6c79509b..371f3da3 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -294,6 +294,17 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
             p)); // in rare cases, this will overflow, but that's ok
     p += 8;
   }
+  // 4-digit SWAR follow-up (ported from ffc EXP-001): consume a remaining 4-7
+  // digit run in one step rather than byte-by-byte. GCC path only — on Clang
+  // the follow-up's presence bloated the 2x-unroll codegen and regressed random.
+  if ((pend - p) >= 4) {
+    uint32_t const val4 = read4_to_u32(p);
+    if (is_made_of_four_digits_fast(val4)) {
+      i = i * 10000 +
+          parse_four_digits_unrolled(val4); // in rare cases overflows, that's ok
+      p += 4;
+    }
+  }
 #endif
 }
 

From 3ff2c0b8945408f63b381af4f22a6167e52b486d Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Mon, 1 Jun 2026 09:15:26 +0100
Subject: [PATCH 4/6] EXP-053: clang-format (reflow comment + expression wrap;
 no semantic change)

Pre-clear the lint_and_format_check CI gate. Formatted with clang-format-18 (CI
pins 17, but the LLVM base style is identical for these constructs). Behavior
and benchmarks are unchanged.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 include/fast_float/ascii_number.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index 371f3da3..14caa0d2 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -296,12 +296,13 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
   }
   // 4-digit SWAR follow-up (ported from ffc EXP-001): consume a remaining 4-7
   // digit run in one step rather than byte-by-byte. GCC path only — on Clang
-  // the follow-up's presence bloated the 2x-unroll codegen and regressed random.
+  // the follow-up's presence bloated the 2x-unroll codegen and regressed
+  // random.
   if ((pend - p) >= 4) {
     uint32_t const val4 = read4_to_u32(p);
     if (is_made_of_four_digits_fast(val4)) {
-      i = i * 10000 +
-          parse_four_digits_unrolled(val4); // in rare cases overflows, that's ok
+      i = i * 10000 + parse_four_digits_unrolled(
+                          val4); // in rare cases overflows, that's ok
       p += 4;
     }
   }

From f4f36e04f7c7546086624f3995b26ea81bcaf675 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Fri, 3 Jul 2026 14:13:44 +0100
Subject: [PATCH 5/6] EXP-062: enable the 4-digit SWAR fraction follow-up on
 all compilers (drop __clang__ gate)

---
 include/fast_float/ascii_number.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/include/fast_float/ascii_number.h b/include/fast_float/ascii_number.h
index e431cbcd..6fa640db 100644
--- a/include/fast_float/ascii_number.h
+++ b/include/fast_float/ascii_number.h
@@ -268,10 +268,9 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
   }
   // Consume a remaining 4-7 digit run in a single SWAR step instead of
   // byte-by-byte (reuses the existing 4-digit helpers). The parsed result is
-  // identical either way. Gated to clang: on gcc the extra 4-digit check
-  // regresses inputs whose remainder is shorter than 4 digits (it becomes pure
-  // overhead there); clang does not show that.
-#if defined(__clang__)
+  // identical either way. Historically gated to clang because gcc regressed on
+  // short remainders, but that verdict predates the span-elision restructure;
+  // with the leaner hot path the 4-digit step now wins on gcc as well.
   if ((pend - p) >= 4) {
     uint32_t const val4 = read4_to_u32(p);
     if (is_made_of_four_digits_fast(val4)) {
@@ -280,7 +279,6 @@ loop_parse_if_eight_digits(char const *&p, char const *const pend,
       p += 4;
     }
   }
-#endif
 }
 
 enum class parse_error {

From 5082489e352ce4904da4d2f652760b8781cae561 Mon Sep 17 00:00:00 2001
From: fcostaoliveira <filipe@redis.com>
Date: Fri, 3 Jul 2026 14:15:29 +0100
Subject: [PATCH 6/6] EXP-063: test the mode-independent mantissa bound before
 the rounds_to_nearest probe

---
 include/fast_float/parse_number.h | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/fast_float/parse_number.h b/include/fast_float/parse_number.h
index 7d338f3b..117ec696 100644
--- a/include/fast_float/parse_number.h
+++ b/include/fast_float/parse_number.h
@@ -198,7 +198,14 @@ clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_negative,
   // We proceed optimistically, assuming that detail::rounds_to_nearest()
   // returns true.
   if (binary_format<T>::min_exponent_fast_path() <= exponent &&
-      exponent <= binary_format<T>::max_exponent_fast_path()) {
+      exponent <= binary_format<T>::max_exponent_fast_path() &&
+      mantissa <= binary_format<T>::max_mantissa_fast_path()) {
+    // The mantissa bound above is a necessary condition for BOTH branches
+    // below: the else branch requires the tighter bound mantissa <=
+    // max_mantissa_fast_path(exponent), itself <= max_mantissa_fast_path().
+    // Testing it before detail::rounds_to_nearest() spares long-mantissa
+    // inputs (which can never take the fast path) the volatile-float probe.
+    //
     // Unfortunately, the conventional Clinger's fast path is only possible
     // when the system rounds to the nearest float.
     //
@@ -209,18 +216,16 @@ clinger_fast_path_impl(uint64_t mantissa, int64_t exponent, bool is_negative,
     if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) {
       // We have that fegetround() == FE_TONEAREST.
       // Next is Clinger's fast path.
-      if (mantissa <= binary_format<T>::max_mantissa_fast_path()) {
-        value = T(mantissa);
-        if (exponent < 0) {
-          value = value / binary_format<T>::exact_power_of_ten(-exponent);
-        } else {
-          value = value * binary_format<T>::exact_power_of_ten(exponent);
-        }
-        if (is_negative) {
-          value = -value;
-        }
-        return true;
+      value = T(mantissa);
+      if (exponent < 0) {
+        value = value / binary_format<T>::exact_power_of_ten(-exponent);
+      } else {
+        value = value * binary_format<T>::exact_power_of_ten(exponent);
+      }
+      if (is_negative) {
+        value = -value;
       }
+      return true;
     } else {
       // We do not have that fegetround() == FE_TONEAREST.
       // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's