From 193daa9c81bb6f591cf35503ba4537e6c070c44f Mon Sep 17 00:00:00 2001
From: Pierre Le Marre <dev@wismill.eu>
Date: Thu, 13 Jun 2024 10:26:59 +0200
Subject: [PATCH] bench: Avoid lists and excessive inlining

This should make the benches faster and more reliable to run.
---
 unicode-data/bench/Unicode/Char/Bench.hs | 166 +++++++++++++++++++----
 unicode-data/unicode-data.cabal          |   3 +
 2 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/unicode-data/bench/Unicode/Char/Bench.hs b/unicode-data/bench/Unicode/Char/Bench.hs
index 97b0947e..e4ce220d 100644
--- a/unicode-data/bench/Unicode/Char/Bench.hs
+++ b/unicode-data/bench/Unicode/Char/Bench.hs
@@ -1,4 +1,5 @@
 {-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE CPP #-}
 
 module Unicode.Char.Bench
     ( Bench(..)
@@ -8,23 +9,31 @@ module Unicode.Char.Bench
     , benchCharsNF
     ) where
 
-import Control.DeepSeq (NFData, deepseq, force)
-import Control.Exception (evaluate)
-import Test.Tasty.Bench (Benchmark, bgroup, bench, bcompare, env, nf)
-import Test.Tasty.Options
-    ( IsOption(defaultValue, optionHelp, optionName, parseValue) )
-
+import Control.DeepSeq (NFData (..), deepseq)
+import Control.Exception (evaluate, assert)
+import Data.Char (ord)
 import qualified Data.Char as Char
+import Foreign (Storable (..))
+import qualified GHC.Exts as Exts
+import GHC.IO (IO (..))
+import Test.Tasty.Bench (Benchmark, bcompare, bench, bgroup, env, nf)
+import Test.Tasty.Options (
+    IsOption (defaultValue, optionHelp, optionName, parseValue),
+ )
+#if !MIN_VERSION_base(4,15,0)
+import qualified GHC.Magic as Exts (noinline)
+#endif
+
 import qualified Unicode.Char.General as G
 
--- | A unit benchmark
-data Bench a = Bench
-  { _title :: !String  -- ^ Name
-  , _func :: Char -> a -- ^ Function to benchmark
-  }
+--------------------------------------------------------------------------------
+-- Char range
+--------------------------------------------------------------------------------
 
+-- | Characters range
 data CharRange = CharRange !Char !Char
 
+-- | Characters range configurable from CLI
 instance IsOption CharRange where
     defaultValue = CharRange minBound maxBound
     parseValue = \case
@@ -38,8 +47,19 @@ instance IsOption CharRange where
     optionName = pure "chars"
     optionHelp = pure "Range of chars to test"
 
+--------------------------------------------------------------------------------
+-- Characters benchmark
+--------------------------------------------------------------------------------
+
+-- | A unit benchmark
+data Bench a = Bench
+  { _title :: !String  -- ^ Name
+  , _func :: Char -> a -- ^ Function to benchmark
+  }
+
+-- | Helper to compare benchmarks of function from this package to ones in base.
 {-# INLINE bgroup' #-}
-bgroup' :: NFData a => String -> CharRange -> [Bench a] -> Benchmark
+bgroup' :: (NFData a) => String -> CharRange -> [Bench a] -> Benchmark
 bgroup' groupTitle charRange bs = bgroup groupTitle
     [ benchChars' title f
     | Bench title f <- bs
@@ -55,6 +75,7 @@ bgroup' groupTitle charRange bs = bgroup groupTitle
     bcompare' ref = bcompare
         (mconcat ["$NF == \"", ref, "\" && $(NF-1) == \"", groupTitle, "\""])
 
+-- | Helper to bench a char function on a filtered char range
 {-# INLINE benchChars #-}
 benchChars
     :: (NFData a)
@@ -67,6 +88,13 @@ benchChars t charRange = benchCharsNF t charRange isValid
     -- Filter out: Surrogates, Private Use Areas and unsassigned code points
     isValid c = G.generalCategory c < G.Surrogate
 
+-- | Pinned array of characters
+data Chars = Chars !Exts.ByteArray# !Int
+
+instance NFData Chars where
+    rnf (Chars !_ !_) = ()
+
+-- | Helper that handle the creation of the pinned chars array and the loop over it
 {-# INLINE benchCharsNF #-}
 benchCharsNF
     :: forall a. (NFData a)
@@ -75,18 +103,104 @@ benchCharsNF
     -> (Char -> Bool)
     -> (Char -> a)
     -> Benchmark
-benchCharsNF t charRange isValid f =
-    -- Avoid side-effects with garbage collection (see tasty-bench doc)
-    env
-        (evaluate (force chars'))     -- initialize
-        (bench t . nf (foldString f)) -- benchmark
+benchCharsNF title charRange isValid f =
+    -- Avoid side-effects with garbage collection (see tasty-bench doc for env).
+    -- We use pinned ByteArray# instead of lists to avoid that GC kicks in.
+    env (initialize isValid charRange >>= evaluate) (bench title . nf go)
+    where
+    -- Loop over the pinned char array. The loop itself does not allocate.
+    go (Chars cs len) = foldr
+        (\(Exts.I# k) ->
+            let c = Exts.indexWideCharArray# cs (k Exts.-# 1#)
+            -- `inline` is necessary to avoid excessive inlining, resulting
+            -- in benchmarking empty loop iterations, i.e. not the function.
+            -- We could use `inline` with more care at call site, but then we
+            -- would have to test the functions one by one and everytime we
+            -- modify them. Using it here is a hammer but more secure and
+            -- maintainable.
+            -- Note that we may improve this by controling the inlining for each
+            -- phase.
+            in deepseq (Exts.noinline f (Exts.C# c)))
+        ()
+        [1..len]
+
+-- | Create a byte array of the chars to bench
+initialize :: (Char -> Bool) -> CharRange -> IO Chars
+initialize isValid charRange = IO $ \s1 ->
+    case Exts.newPinnedByteArray# initialLength s1 of { (# s2, ma #) ->
+    -- Write the filtered char range
+    case writeChars isValid ma 0# s2 start end of { (# s3, filteredCount #) ->
+    -- Duplicate to get enough chars to bench
+    case tile ma 0# finalLength filteredLength s3 of { s4 ->
+    case Exts.unsafeFreezeByteArray# ma s4 of { (# s5, a #) ->
+        (# s5, Chars a (Exts.I# (replications Exts.*# filteredCount)) #)
+    }}
+       where
+       -- Ensure to have enough chars
+       replications = case Exts.quotInt# targetCharsCount filteredCount of
+           0# -> 1#
+           r# -> r#
+       filteredLength = filteredCount Exts.*# wcharSize
+       finalLength = filteredLength Exts.*# replications
+   }}
+    where
+    targetCharsCount = 0x10FFFF#
+    !(CharRange start end) = assert
+        (ord end - ord start + 1 < Exts.I# targetCharsCount)
+        charRange
+    !initialLength = targetCharsCount Exts.*# wcharSize
+    !(Exts.I# wcharSize) = sizeOf 'x'
+
+-- | Write a range of chars that match the given predicate
+writeChars ::
+    (Char -> Bool) ->
+    Exts.MutableByteArray# d ->
+    Exts.Int# ->
+    Exts.State# d ->
+    Char ->
+    Char ->
+    (# Exts.State# d, Exts.Int# #)
+writeChars isValid ma = go
+    where
+    go i s c1@(Exts.C# c1#) !c2 = if c1 < c2
+        then go i' s' (succ c1) c2
+        else (# s', i' #)
+        where
+        !(# s', i' #) = if isValid c1
+            then (# Exts.writeWideCharArray# ma i c1# s, i Exts.+# 1# #)
+            else (# s, i #)
+
+-- | Duplicate a portion of an array
+--
+-- Adapted from Data.Text.Array.tile
+tile ::
+    -- | Mutable array
+    Exts.MutableByteArray# s ->
+    -- | Start of the portion to duplicate
+    Exts.Int# ->
+    -- | Total length of the duplicate
+    Exts.Int# ->
+    -- | Length of the portion to duplicate
+    Exts.Int# ->
+    Exts.State# s ->
+    Exts.State# s
+tile dest destOff totalLen = go
     where
-    CharRange l u = charRange
-    chars = filter isValid [l..u]
-    -- Ensure to have sufficiently chars
-    n = 0x10FFFF `div` length chars
-    chars' = mconcat (replicate n chars)
-
-{-# INLINE foldString #-}
-foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
-foldString f = foldr (deepseq . f) ()
+    go l s
+        | Exts.isTrue# ((2# Exts.*# l) Exts.># totalLen) =
+            Exts.copyMutableByteArray#
+                dest
+                destOff
+                dest
+                (destOff Exts.+# l)
+                (totalLen Exts.-# l)
+                s
+        | otherwise =
+            case Exts.copyMutableByteArray#
+                    dest
+                    destOff
+                    dest
+                    (destOff Exts.+# l)
+                    l
+                    s of
+                        s' -> go (2# Exts.*# l) s'
diff --git a/unicode-data/unicode-data.cabal b/unicode-data/unicode-data.cabal
index 348ea65d..7d5aa957 100644
--- a/unicode-data/unicode-data.cabal
+++ b/unicode-data/unicode-data.cabal
@@ -148,6 +148,9 @@ benchmark bench
     tasty-bench >= 0.2.5 && < 0.4,
     tasty       >= 1.4.1 && < 1.6,
     unicode-data
+  if impl(ghc < 9.0)
+    -- Required for noinline
+    build-depends: ghc-prim
   -- [NOTE] Recommendation of tasty-bench to reduce garbage collection noisiness
   ghc-options: -O2 -fdicts-strict -rtsopts -with-rtsopts=-A32m
   -- [NOTE] Recommendation of tasty-bench for comparison against baseline