ucd2haskell: Refactor

This is a huge refactor of `ucd2haskell`, motivated by similar work in `ghc-internal`. This will prevent this tool from further bit-rotting. - Remove dependency on `streamly`. This package is overkilled and has an instable API. The version we use is not supported by recent GHCs and non-trivial migration seems to be required at each new version. Furthermore we currently process `String`s, so there is no much benefit. - Mimic the `Fold` type from `streamly` for basic features. Although not mandatory, this avoid changing all the logic. - Use `ByteString` parsers from `unicode-data-parser` [1]. These parsers are shared with the corresponding `ucd2haskell` tool in `base` (now `ghc-internal`). We now have a clear separation between parsers and generators. The Unicode files being very stable, this package should be very stable as well. - Move generators to independent modules. This speeds the compilation up and add more structure to the code base. - Remove many anti-patterns and share more code. The files *generated* by this tool remain identical, although I left some comments to further improve them. [1]: https://hackage.haskell.org/package/unicode-data-parser
composewell · Jun 7, 2024 · 4e4ebcc · 4e4ebcc
1 parent b93e747
commit 4e4ebcc
Show file tree

Hide file tree

Showing 27 changed files with 3,278 additions and 3,260 deletions.
diff --git a/ucd2haskell/exe/Parser/Text.hs b/ucd2haskell/exe/Parser/Text.hs
diff --git a/ucd2haskell/exe/UCD2Haskell.hs b/ucd2haskell/exe/UCD2Haskell.hs
@@ -8,10 +8,14 @@
 module Main where
 
 import GHC.Generics (Generic)
-import Parser.Text (genCoreModules, genNamesModules, genScriptsModules, genSecurityModules)
 import System.FilePath ((</>))
 import WithCli (HasArguments(..), withCli)
 
+import qualified UCD2Haskell.Generator.Core as Core
+import qualified UCD2Haskell.Generator.Names as Names
+import qualified UCD2Haskell.Generator.Scripts as Scripts
+import qualified UCD2Haskell.Generator.Security as Security
+
 data CLIOptions =
     CLIOptions
         { input :: FilePath
@@ -29,10 +33,10 @@ data CLIOptions =
 
 cliClient :: CLIOptions -> IO ()
 cliClient opts
-    = genCoreModules (input opts </> "ucd") (output_core opts) (core_prop opts)
-    *> genNamesModules (input opts </> "ucd") (output_names opts)
-    *> genScriptsModules (input opts </> "ucd") (output_scripts opts)
-    *> genSecurityModules (input opts </> "security") (output_security opts)
+    = Core.generateModules (input opts </> "ucd") (output_core opts) (core_prop opts)
+    *> Names.generateModules (input opts </> "ucd") (output_names opts)
+    *> Scripts.generateModules (input opts </> "ucd") (output_scripts opts)
+    *> Security.generateModules (input opts </> "security") (output_security opts)
 
 main :: IO ()
 main = withCli cliClient
diff --git a/ucd2haskell/exe/UCD2Haskell/Common.hs b/ucd2haskell/exe/UCD2Haskell/Common.hs
@@ -0,0 +1,195 @@
+{-# LANGUAGE ExistentialQuantification #-}
+-- |
+-- Copyright   : (c) 2024 Pierre Le Marre
+-- License     : Apache-2.0
+-- Maintainer  : [email protected]
+-- Stability   : experimental
+--
+--
+module UCD2Haskell.Common
+    ( -- * Fold
+      Fold(..)
+    , distribute
+    , filterFold
+    , rmapFold
+    , runFold
+
+    -- * Formatting
+    , showB
+    , showPaddedHex
+    , showPaddedHexB
+    , showPaddedHeX
+    , showPaddedHeXB
+    , showHexCodepoint
+    , showHexCodepointB
+    , showHexCodepointBS
+
+    -- * Hangul
+    , jamoLCount
+    , jamoVCount
+    , jamoTCount
+    , hangulFirst
+    , hangulLast
+    , isHangul
+    , isHangulRange
+    , filterNonHangul
+
+      -- * Miscellaneous
+    , allRange
+    , mkHaskellConstructor
+    ) where
+
+import Data.Foldable (Foldable(..))
+import Numeric (showHex)
+import Data.Char (toUpper, ord, isAlphaNum)
+import qualified Data.ByteString.Builder as BB
+import qualified Unicode.CharacterDatabase.Parser.UnicodeData as UD
+import qualified Unicode.CharacterDatabase.Parser.Common as U
+import qualified Data.ByteString.Short as BS
+import qualified Data.ByteString.Char8 as B8
+
+--------------------------------------------------------------------------------
+-- Fold that mimimc Streamly’s one
+--------------------------------------------------------------------------------
+
+data Fold a b = forall s. Fold
+  { _step :: s -> a -> s
+  , _initial :: s
+  , _final :: s -> b }
+
+data Pair a b = Pair !a !b
+
+teeWith :: (a -> b -> c) -> Fold x a -> Fold x b -> Fold x c
+teeWith f (Fold stepL initialL finalL) (Fold stepR initialR finalR) =
+  Fold step initial final
+  where
+    step (Pair sL sR) x = Pair (stepL sL x) (stepR sR x)
+    initial = Pair initialL initialR
+    final (Pair sL sR) = f (finalL sL) (finalR sR)
+
+distribute :: [Fold a b] -> Fold a [b]
+distribute = foldr (teeWith (:)) (Fold const () (const []))
+
+{-# INLINE filterFold #-}
+filterFold :: (a -> Bool) -> Fold a b -> Fold a b
+filterFold p (Fold step initial done) = Fold step' initial done
+  where
+    step' s a = if p a then step s a else s
+
+{-# INLINE rmapFold #-}
+rmapFold :: (b -> c) -> Fold a b -> Fold a c
+rmapFold f (Fold step initial final) = Fold step initial (f . final)
+
+runFold :: Fold a b -> [a] -> b
+runFold (Fold step initial final) = final . foldl' step initial
+
+--------------------------------------------------------------------------------
+-- Formatting
+--------------------------------------------------------------------------------
+
+-- | /Warning:/ the use of 'BB.string7' make it unsafe if applied to non-ASCII.
+showB :: (Show a) => a -> BB.Builder
+showB = BB.string7 . show
+
+showPaddedHex :: Int -> String
+showPaddedHex cp =
+    let hex = showHex cp mempty
+        padding = 4 - length hex
+    in replicate padding '0' <> hex
+
+showPaddedHexB :: Int -> BB.Builder
+showPaddedHexB = BB.string7 . showPaddedHex
+
+showPaddedHeX :: Int -> String
+showPaddedHeX = fmap toUpper . showPaddedHex
+
+showPaddedHeXB :: Int -> BB.Builder
+showPaddedHeXB = BB.string7 . showPaddedHeX
+
+showHexCodepoint :: Char -> String
+showHexCodepoint = showPaddedHeX . ord
+
+showHexCodepointB :: Char -> BB.Builder
+showHexCodepointB = BB.string7 . showHexCodepoint
+
+showHexCodepointBS :: Char -> BS.ShortByteString
+showHexCodepointBS = BS.toShort . B8.pack . showPaddedHeX . ord
+
+--------------------------------------------------------------------------------
+-- Hangul
+--------------------------------------------------------------------------------
+
+-- This bit of code is duplicated but this duplication allows us to reduce 2
+-- dependencies on the executable.
+
+jamoLCount :: Int
+jamoLCount = 19
+
+jamoVCount :: Int
+jamoVCount = 21
+
+jamoTCount :: Int
+jamoTCount = 28
+
+hangulFirst :: Int
+hangulFirst = 0xac00
+
+hangulLast :: Int
+hangulLast = hangulFirst + jamoLCount * jamoVCount * jamoTCount - 1
+
+isHangul :: Char -> Bool
+isHangul c = n >= hangulFirst && n <= hangulLast
+    where n = ord c
+
+isHangulRange :: U.CodePointRange -> Bool
+isHangulRange = \case
+    U.SingleChar c -> isHangul c
+    U.CharRange start end ->
+        ord start >= hangulFirst && ord end <= hangulLast
+
+filterNonHangul :: Fold UD.Entry a -> Fold UD.Entry a
+filterNonHangul = filterFold (not . isHangulRange . UD.range)
+
+--------------------------------------------------------------------------------
+-- Miscellaneous
+--------------------------------------------------------------------------------
+
+allRange :: (Char -> Bool) -> U.CodePointRange -> Bool
+allRange predicate = \case
+    U.SingleChar c -> predicate c
+    U.CharRange start end -> all predicate [start..end]
+
+-- -- Make a valid Haskell constructor (in CamelCase) from an identifier.
+-- mkHaskellConstructor :: String -> String
+-- mkHaskellConstructor = reverse . fst . foldl' convert (mempty, True)
+--     where
+
+--     convert (acc, newWord) = \case
+--         -- Skip the following and start a new word
+--         ' ' -> (acc, True)
+--         '-' -> (acc, True)
+--         '_' -> (acc, True)
+--         -- Letter or number
+--         c   -> if isAscii c && isAlphaNum c
+--             then ( if newWord then toUpper c : acc else c : acc
+--                  , False)
+--             else error ("Unsupported character: " <> show c)
+
+-- Make a valid Haskell constructor (in CamelCase) from an identifier.
+mkHaskellConstructor :: BS.ShortByteString -> BB.Builder
+mkHaskellConstructor = fst . BS.foldl' convert (mempty, True)
+    where
+
+    convert (acc, newWord) = \case
+        -- Skip the following and start a new word
+        0x20 -> (acc, True) -- Space
+        0x2d -> (acc, True) -- Hyphen
+        0x5f -> (acc, True) -- Underscore
+        -- Letter or number
+        c   -> if isAlphaNum (word82Char c)
+            then ( acc <> BB.word8 if newWord then toUpper' c else c
+                 , False )
+            else error ("Unsupported character: " <> show (word82Char c))
+    word82Char = toEnum . fromIntegral
+    char2Word8 = fromIntegral . fromEnum
+    toUpper' = char2Word8 . toUpper . word82Char