From a20ed7328927f57487ff4da49ffa4b25e0fc64e2 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Fri, 14 Jun 2024 19:18:29 +0200 Subject: [PATCH] core: Add optional comparison to ICU (generalCategory) --- experimental/icu/cbits/icu.c | 4 + experimental/icu/cbits/icu.h | 7 +- experimental/icu/icu.cabal | 2 + experimental/icu/lib/ICU/Char.chs | 136 ++++++++++++++++++++++++++++++ experimental/icu/lib/ICU/Char.hsc | 55 ------------ unicode-data/test/ICU/CharSpec.hs | 94 +++++++++++++++++++++ unicode-data/test/Main.hs | 11 ++- unicode-data/unicode-data.cabal | 12 +++ 8 files changed, 262 insertions(+), 59 deletions(-) create mode 100644 experimental/icu/lib/ICU/Char.chs delete mode 100644 experimental/icu/lib/ICU/Char.hsc create mode 100644 unicode-data/test/ICU/CharSpec.hs diff --git a/experimental/icu/cbits/icu.c b/experimental/icu/cbits/icu.c index c6563fb0..1d7b6e12 100644 --- a/experimental/icu/cbits/icu.c +++ b/experimental/icu/cbits/icu.c @@ -17,3 +17,7 @@ int32_t __hs_u_charName( UChar32 codepoint void __hs_u_charAge( UChar32 c, UVersionInfo versionArray ) { u_charAge(c, versionArray); } + +int8_t __hs_u_charType(UChar32 c) { + return u_charType(c); +} diff --git a/experimental/icu/cbits/icu.h b/experimental/icu/cbits/icu.h index edc5fb85..e6ea6382 100644 --- a/experimental/icu/cbits/icu.h +++ b/experimental/icu/cbits/icu.h @@ -10,12 +10,13 @@ int32_t __hs_u_charName( UChar32 codepoint , UCharNameChoice nameChoice , char * buffer , int32_t bufferLength ); +static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME; +static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS; // typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH]; void __hs_u_charAge( UChar32 c, UVersionInfo versionArray ); - static const int __hs_U_MAX_VERSION_LENGTH = U_MAX_VERSION_LENGTH; -static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME; -static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS; + +int8_t __hs_u_charType(UChar32 c); #endif diff --git a/experimental/icu/icu.cabal b/experimental/icu/icu.cabal index c09a88ac..ce81908d 100644 --- a/experimental/icu/icu.cabal +++ b/experimental/icu/icu.cabal @@ -71,3 +71,5 @@ library extra-libraries: icuuc pkgconfig-depends: icu-uc >= 72.1 + build-tool-depends: + c2hs:c2hs diff --git a/experimental/icu/lib/ICU/Char.chs b/experimental/icu/lib/ICU/Char.chs new file mode 100644 index 00000000..15daad09 --- /dev/null +++ b/experimental/icu/lib/ICU/Char.chs @@ -0,0 +1,136 @@ +-- | +-- Module : ICU.Char +-- Copyright : (c) 2023 Pierre Le Marre +-- License : Apache-2.0 +-- Maintainer : streamly@composewell.com +-- Stability : experimental +-- +-- Unicode character general properties +-- +-- @since 0.3.0 + +module ICU.Char + ( unicodeVersion + , charAge + , UGeneralCategory(..) + , toGeneralCategory + , charType + ) where + +#include + +import Data.Char (ord) +import qualified Data.Char as Char +import Data.Int (Int8) +import Data.Version (Version, makeVersion) +import Data.Word (Word32) +import Foreign (Ptr) +import Foreign.C (CInt) +import Foreign.Marshal.Array (allocaArray, peekArray) +import System.IO.Unsafe (unsafePerformIO) + +type UChar32 = Word32 + +foreign import capi "icu.h value __hs_U_MAX_VERSION_LENGTH" maxVersionLength :: Int + +foreign import ccall unsafe "icu.h __hs_u_getUnicodeVersion" u_getUnicodeVersion + :: Ptr Int8 -> IO () + +-- | ICU Unicode version +unicodeVersion :: Version +unicodeVersion + = makeVersion + . fmap fromIntegral + . unsafePerformIO + $ allocaArray + maxVersionLength + (\ptr -> u_getUnicodeVersion ptr *> peekArray maxVersionLength ptr) + +foreign import ccall unsafe "icu.h __hs_u_charAge" u_charAge + :: UChar32 -> Ptr Int8 -> IO () + +-- | Character age +charAge :: Char -> Version +charAge c + = makeVersion + . fmap fromIntegral + . unsafePerformIO + $ allocaArray + maxVersionLength + (\ptr -> u_charAge cp ptr *> peekArray maxVersionLength ptr) + where + cp = fromIntegral (ord c) + +foreign import ccall safe "icu.h __hs_u_charType" u_charType + :: UChar32 -> Int8 + +{#enum define UGeneralCategory { + U_UNASSIGNED as Unassigned, + U_UPPERCASE_LETTER as UppercaseLetter, + U_LOWERCASE_LETTER as LowercaseLetter, + U_TITLECASE_LETTER as TitlecaseLetter, + U_MODIFIER_LETTER as ModifierLetter, + U_OTHER_LETTER as OtherLetter, + U_NON_SPACING_MARK as NonSpacingMark, + U_ENCLOSING_MARK as EnclosingMark, + U_COMBINING_SPACING_MARK as CombiningSpacingMark, + U_DECIMAL_DIGIT_NUMBER as DecimalDigitNumber, + U_LETTER_NUMBER as LetterNumber, + U_OTHER_NUMBER as OtherNumber, + U_SPACE_SEPARATOR as SpaceSeparator, + U_LINE_SEPARATOR as LineSeparator, + U_PARAGRAPH_SEPARATOR as ParagraphSeparator, + U_CONTROL_CHAR as ControlChar, + U_FORMAT_CHAR as FormatChar, + U_PRIVATE_USE_CHAR as PrivateUseChar, + U_SURROGATE as Surrogate, + U_DASH_PUNCTUATION as DashPunctuation, + U_START_PUNCTUATION as StartPunctuation, + U_END_PUNCTUATION as EndPunctuation, + U_CONNECTOR_PUNCTUATION as ConnectorPunctuation, + U_OTHER_PUNCTUATION as OtherPunctuation, + U_MATH_SYMBOL as MathSymbol, + U_CURRENCY_SYMBOL as CurrencySymbol, + U_MODIFIER_SYMBOL as ModifierSymbol, + U_OTHER_SYMBOL as OtherSymbol, + U_INITIAL_PUNCTUATION as InitialPunctuation, + U_FINAL_PUNCTUATION as FinalPunctuation + } + deriving (Bounded, Eq, Ord, Show) #} + +-- | General category +charType :: Char -> UGeneralCategory +charType = toEnum . fromIntegral . u_charType . fromIntegral . ord + +toGeneralCategory :: UGeneralCategory -> Char.GeneralCategory +toGeneralCategory = \case + Unassigned -> Char.NotAssigned + UppercaseLetter -> Char.UppercaseLetter + LowercaseLetter -> Char.LowercaseLetter + TitlecaseLetter -> Char.TitlecaseLetter + ModifierLetter -> Char.ModifierLetter + OtherLetter -> Char.OtherLetter + NonSpacingMark -> Char.NonSpacingMark + EnclosingMark -> Char.EnclosingMark + CombiningSpacingMark -> Char.SpacingCombiningMark + DecimalDigitNumber -> Char.DecimalNumber + LetterNumber -> Char.LetterNumber + OtherNumber -> Char.OtherNumber + SpaceSeparator -> Char.Space + LineSeparator -> Char.LineSeparator + ParagraphSeparator -> Char.ParagraphSeparator + ControlChar -> Char.Control + FormatChar -> Char.Format + PrivateUseChar -> Char.PrivateUse + Surrogate -> Char.Surrogate + DashPunctuation -> Char.DashPunctuation + StartPunctuation -> Char.OpenPunctuation + EndPunctuation -> Char.ClosePunctuation + ConnectorPunctuation -> Char.ConnectorPunctuation + OtherPunctuation -> Char.OtherPunctuation + MathSymbol -> Char.MathSymbol + CurrencySymbol -> Char.CurrencySymbol + ModifierSymbol -> Char.ModifierSymbol + OtherSymbol -> Char.OtherSymbol + InitialPunctuation -> Char.InitialQuote + FinalPunctuation -> Char.FinalQuote diff --git a/experimental/icu/lib/ICU/Char.hsc b/experimental/icu/lib/ICU/Char.hsc deleted file mode 100644 index 4263e20a..00000000 --- a/experimental/icu/lib/ICU/Char.hsc +++ /dev/null @@ -1,55 +0,0 @@ --- | --- Module : ICU.Char --- Copyright : (c) 2023 Pierre Le Marre --- License : Apache-2.0 --- Maintainer : streamly@composewell.com --- Stability : experimental --- --- Unicode character general properties --- --- @since 0.3.0 - -module ICU.Char - ( unicodeVersion - , charAge - ) where - -import Data.Char (ord) -import Data.Int (Int8) -import Data.Version (Version, makeVersion) -import Data.Word (Word32) -import Foreign (Ptr) -import Foreign.Marshal.Array (allocaArray, peekArray) -import System.IO.Unsafe (unsafePerformIO) - -type UChar32 = Word32 - -foreign import capi "icu.h value __hs_U_MAX_VERSION_LENGTH" maxVersionLength :: Int - -foreign import ccall unsafe "icu.h __hs_u_getUnicodeVersion" u_getUnicodeVersion - :: Ptr Int8 -> IO () - --- | ICU Unicode version -unicodeVersion :: Version -unicodeVersion - = makeVersion - . fmap fromIntegral - . unsafePerformIO - $ allocaArray - maxVersionLength - (\ptr -> u_getUnicodeVersion ptr *> peekArray maxVersionLength ptr) - -foreign import ccall unsafe "icu.h __hs_u_charAge" u_charAge - :: UChar32 -> Ptr Int8 -> IO () - --- | Character age -charAge :: Char -> Version -charAge c - = makeVersion - . fmap fromIntegral - . unsafePerformIO - $ allocaArray - maxVersionLength - (\ptr -> u_charAge cp ptr *> peekArray maxVersionLength ptr) - where - cp = fromIntegral (ord c) diff --git a/unicode-data/test/ICU/CharSpec.hs b/unicode-data/test/ICU/CharSpec.hs new file mode 100644 index 00000000..ec901ec7 --- /dev/null +++ b/unicode-data/test/ICU/CharSpec.hs @@ -0,0 +1,94 @@ +{-# LANGUAGE CPP, BlockArguments, GADTs #-} + +module ICU.CharSpec + ( spec + ) where + +import Control.Applicative (Alternative(..)) +import Data.Foldable (traverse_) +import Data.Version (showVersion, versionBranch) +import Numeric (showHex) +import Test.Hspec + ( describe + , expectationFailure + , it + , pendingWith + , Spec + , HasCallStack, SpecWith ) + +import qualified ICU.Char as ICU +import qualified Unicode.Char as U + +spec :: Spec +spec = do + describe "General" do + checkAndGatherErrors + "charType" + (GeneralCategory . U.generalCategory) + (GeneralCategory . ICU.toGeneralCategory . ICU.charType) + -- TODO: other functions + where + ourUnicodeVersion = versionBranch U.unicodeVersion + theirUnicodeVersion = versionBranch ICU.unicodeVersion + showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c) + + -- There is no feature to display warnings other than `trace`, so + -- hack our own: + -- 1. Compare given functions in pure code and gather warning & errors + -- 2. Create dummy spec that throw an expectation failure, if relevant. + -- 3. Create pending spec for each Char that raises a Unicode version + -- mismatch between ICU and unicode-data. + checkAndGatherErrors + :: forall a. (HasCallStack, Eq a, Show a) + => String + -> (Char -> a) + -> (Char -> a) + -> SpecWith () + checkAndGatherErrors label f fRef = do + it label (maybe (pure ()) expectationFailure err) + if null ws + then pure () + else describe (label ++ " (Unicode version conflict)") + (traverse_ mkWarning ws) + where + Acc ws err = foldr check (Acc [] Nothing) [minBound..maxBound] + check c acc + -- Test passed + | n == nRef = acc + -- Unicode version mismatch: char is not mapped in one of the libs: + -- add warning. + | age' > ourUnicodeVersion || age' > theirUnicodeVersion + = acc{warnings=c : warnings acc} + -- Error + | otherwise = + let !msg = mconcat + [ showCodePoint c ": expected " + , show nRef + , ", got ", show n, "" ] + in acc{firstError = firstError acc <|> Just msg} + where + !n = f c + !nRef = fRef c + age = ICU.charAge c + age' = take 3 (versionBranch age) + mkWarning c = it (showCodePoint c "") . pendingWith $ mconcat + [ "Incompatible ICU Unicode version: expected " + , showVersion U.unicodeVersion + , ", got: " + , showVersion ICU.unicodeVersion + , " (ICU character age is: " + , showVersion (ICU.charAge c) + , ")" ] + +-- | Helper to compare our GeneralCategory to 'Data.Char.GeneralCategory'. +data GeneralCategory = forall c. (Show c, Enum c) => GeneralCategory c + +instance Show GeneralCategory where + show (GeneralCategory a) = show a + +instance Eq GeneralCategory where + GeneralCategory a == GeneralCategory b = fromEnum a == fromEnum b + +-- | Warning accumulator +data Acc = Acc { warnings :: ![Char], firstError :: !(Maybe String) } + diff --git a/unicode-data/test/Main.hs b/unicode-data/test/Main.hs index ca05c0c9..a9b04359 100644 --- a/unicode-data/test/Main.hs +++ b/unicode-data/test/Main.hs @@ -1,10 +1,19 @@ +{-# LANGUAGE CPP #-} + module Main where import Test.Hspec import qualified Unicode.CharSpec +#ifdef HAS_ICU +import qualified ICU.CharSpec as ICU +#endif main :: IO () main = hspec spec spec :: Spec -spec = describe "Unicode.Char" Unicode.CharSpec.spec +spec = do + describe "Unicode.Char" Unicode.CharSpec.spec +#ifdef HAS_ICU + describe "ICU.Char" ICU.spec +#endif diff --git a/unicode-data/unicode-data.cabal b/unicode-data/unicode-data.cabal index 7d5aa957..d9caab88 100644 --- a/unicode-data/unicode-data.cabal +++ b/unicode-data/unicode-data.cabal @@ -63,6 +63,12 @@ common compile-options -fwarn-tabs default-language: Haskell2010 +flag dev-has-icu + description: + Use ICU for test and benchmark. Intended for development on the repository. + manual: True + default: False + library import: default-extensions, compile-options exposed-modules: @@ -125,6 +131,12 @@ test-suite test base >= 4.7 && < 4.21 , hspec >= 2.0 && < 2.12 , unicode-data + if flag(dev-has-icu) + cpp-options: -DHAS_ICU + other-modules: + ICU.CharSpec + build-depends: + icu benchmark bench import: default-extensions, compile-options