Skip to content

Commit

Permalink
ucd2haskell: Refactor
Browse files Browse the repository at this point in the history
This is a huge refactor of `ucd2haskell`, motivated by similar work in
`ghc-internal`. This will prevent this tool from further bit-rotting.

- Remove dependency on `streamly`. This package is overkilled and has
  an instable API. The version we use is not supported by recent GHCs
  and non-trivial migration seems to be required at each new version.
  Furthermore we currently process `String`s, so there is no much benefit.
- Mimic the `Fold` type from `streamly` for basic features. Although not
  mandatory, this avoid changing all the logic.
- Use `ByteString` parsers from `unicode-data-parser` [1]. These parsers
  are shared with the corresponding `ucd2haskell` tool in `base` (now
  `ghc-internal`). We now have a clear separation between parsers and
  generators. The Unicode files being very stable, this package should be
  very stable as well.
- Move generators to independent modules. This speeds the compilation up
  and add more structure to the code base.
- Remove many anti-patterns and share more code.

The files *generated* by this tool remain identical, although I left some
comments to further improve them.

[1]: https://hackage.haskell.org/package/unicode-data-parser
  • Loading branch information
wismill committed Jun 7, 2024
1 parent b93e747 commit 4e4ebcc
Show file tree
Hide file tree
Showing 27 changed files with 3,278 additions and 3,260 deletions.
3,245 changes: 0 additions & 3,245 deletions ucd2haskell/exe/Parser/Text.hs

This file was deleted.

14 changes: 9 additions & 5 deletions ucd2haskell/exe/UCD2Haskell.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
module Main where

import GHC.Generics (Generic)
import Parser.Text (genCoreModules, genNamesModules, genScriptsModules, genSecurityModules)
import System.FilePath ((</>))
import WithCli (HasArguments(..), withCli)

import qualified UCD2Haskell.Generator.Core as Core
import qualified UCD2Haskell.Generator.Names as Names
import qualified UCD2Haskell.Generator.Scripts as Scripts
import qualified UCD2Haskell.Generator.Security as Security

data CLIOptions =
CLIOptions
{ input :: FilePath
Expand All @@ -29,10 +33,10 @@ data CLIOptions =

cliClient :: CLIOptions -> IO ()
cliClient opts
= genCoreModules (input opts </> "ucd") (output_core opts) (core_prop opts)
*> genNamesModules (input opts </> "ucd") (output_names opts)
*> genScriptsModules (input opts </> "ucd") (output_scripts opts)
*> genSecurityModules (input opts </> "security") (output_security opts)
= Core.generateModules (input opts </> "ucd") (output_core opts) (core_prop opts)
*> Names.generateModules (input opts </> "ucd") (output_names opts)
*> Scripts.generateModules (input opts </> "ucd") (output_scripts opts)
*> Security.generateModules (input opts </> "security") (output_security opts)

main :: IO ()
main = withCli cliClient
195 changes: 195 additions & 0 deletions ucd2haskell/exe/UCD2Haskell/Common.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
{-# LANGUAGE ExistentialQuantification #-}
-- |
-- Copyright : (c) 2024 Pierre Le Marre
-- License : Apache-2.0
-- Maintainer : [email protected]
-- Stability : experimental
--
--
module UCD2Haskell.Common
( -- * Fold
Fold(..)
, distribute
, filterFold
, rmapFold
, runFold

-- * Formatting
, showB
, showPaddedHex
, showPaddedHexB
, showPaddedHeX
, showPaddedHeXB
, showHexCodepoint
, showHexCodepointB
, showHexCodepointBS

-- * Hangul
, jamoLCount
, jamoVCount
, jamoTCount
, hangulFirst
, hangulLast
, isHangul
, isHangulRange
, filterNonHangul

-- * Miscellaneous
, allRange
, mkHaskellConstructor
) where

import Data.Foldable (Foldable(..))
import Numeric (showHex)
import Data.Char (toUpper, ord, isAlphaNum)
import qualified Data.ByteString.Builder as BB
import qualified Unicode.CharacterDatabase.Parser.UnicodeData as UD
import qualified Unicode.CharacterDatabase.Parser.Common as U
import qualified Data.ByteString.Short as BS
import qualified Data.ByteString.Char8 as B8

--------------------------------------------------------------------------------
-- Fold that mimimc Streamly’s one
--------------------------------------------------------------------------------

data Fold a b = forall s. Fold
{ _step :: s -> a -> s
, _initial :: s
, _final :: s -> b }

data Pair a b = Pair !a !b

teeWith :: (a -> b -> c) -> Fold x a -> Fold x b -> Fold x c
teeWith f (Fold stepL initialL finalL) (Fold stepR initialR finalR) =
Fold step initial final
where
step (Pair sL sR) x = Pair (stepL sL x) (stepR sR x)
initial = Pair initialL initialR
final (Pair sL sR) = f (finalL sL) (finalR sR)

distribute :: [Fold a b] -> Fold a [b]
distribute = foldr (teeWith (:)) (Fold const () (const []))

{-# INLINE filterFold #-}
filterFold :: (a -> Bool) -> Fold a b -> Fold a b
filterFold p (Fold step initial done) = Fold step' initial done
where
step' s a = if p a then step s a else s

{-# INLINE rmapFold #-}
rmapFold :: (b -> c) -> Fold a b -> Fold a c
rmapFold f (Fold step initial final) = Fold step initial (f . final)

runFold :: Fold a b -> [a] -> b
runFold (Fold step initial final) = final . foldl' step initial

--------------------------------------------------------------------------------
-- Formatting
--------------------------------------------------------------------------------

-- | /Warning:/ the use of 'BB.string7' make it unsafe if applied to non-ASCII.
showB :: (Show a) => a -> BB.Builder
showB = BB.string7 . show

showPaddedHex :: Int -> String
showPaddedHex cp =
let hex = showHex cp mempty
padding = 4 - length hex
in replicate padding '0' <> hex

showPaddedHexB :: Int -> BB.Builder
showPaddedHexB = BB.string7 . showPaddedHex

showPaddedHeX :: Int -> String
showPaddedHeX = fmap toUpper . showPaddedHex

showPaddedHeXB :: Int -> BB.Builder
showPaddedHeXB = BB.string7 . showPaddedHeX

showHexCodepoint :: Char -> String
showHexCodepoint = showPaddedHeX . ord

showHexCodepointB :: Char -> BB.Builder
showHexCodepointB = BB.string7 . showHexCodepoint

showHexCodepointBS :: Char -> BS.ShortByteString
showHexCodepointBS = BS.toShort . B8.pack . showPaddedHeX . ord

--------------------------------------------------------------------------------
-- Hangul
--------------------------------------------------------------------------------

-- This bit of code is duplicated but this duplication allows us to reduce 2
-- dependencies on the executable.

jamoLCount :: Int
jamoLCount = 19

jamoVCount :: Int
jamoVCount = 21

jamoTCount :: Int
jamoTCount = 28

hangulFirst :: Int
hangulFirst = 0xac00

hangulLast :: Int
hangulLast = hangulFirst + jamoLCount * jamoVCount * jamoTCount - 1

isHangul :: Char -> Bool
isHangul c = n >= hangulFirst && n <= hangulLast
where n = ord c

isHangulRange :: U.CodePointRange -> Bool
isHangulRange = \case
U.SingleChar c -> isHangul c
U.CharRange start end ->
ord start >= hangulFirst && ord end <= hangulLast

filterNonHangul :: Fold UD.Entry a -> Fold UD.Entry a
filterNonHangul = filterFold (not . isHangulRange . UD.range)

--------------------------------------------------------------------------------
-- Miscellaneous
--------------------------------------------------------------------------------

allRange :: (Char -> Bool) -> U.CodePointRange -> Bool
allRange predicate = \case
U.SingleChar c -> predicate c
U.CharRange start end -> all predicate [start..end]

-- -- Make a valid Haskell constructor (in CamelCase) from an identifier.
-- mkHaskellConstructor :: String -> String
-- mkHaskellConstructor = reverse . fst . foldl' convert (mempty, True)
-- where

-- convert (acc, newWord) = \case
-- -- Skip the following and start a new word
-- ' ' -> (acc, True)
-- '-' -> (acc, True)
-- '_' -> (acc, True)
-- -- Letter or number
-- c -> if isAscii c && isAlphaNum c
-- then ( if newWord then toUpper c : acc else c : acc
-- , False)
-- else error ("Unsupported character: " <> show c)

-- Make a valid Haskell constructor (in CamelCase) from an identifier.
mkHaskellConstructor :: BS.ShortByteString -> BB.Builder
mkHaskellConstructor = fst . BS.foldl' convert (mempty, True)
where

convert (acc, newWord) = \case
-- Skip the following and start a new word
0x20 -> (acc, True) -- Space
0x2d -> (acc, True) -- Hyphen
0x5f -> (acc, True) -- Underscore
-- Letter or number
c -> if isAlphaNum (word82Char c)
then ( acc <> BB.word8 if newWord then toUpper' c else c
, False )
else error ("Unsupported character: " <> show (word82Char c))
word82Char = toEnum . fromIntegral
char2Word8 = fromIntegral . fromEnum
toUpper' = char2Word8 . toUpper . word82Char
Loading

0 comments on commit 4e4ebcc

Please sign in to comment.