Skip to content

Commit

Permalink
core: Add optional comparison to ICU (generalCategory)
Browse files Browse the repository at this point in the history
  • Loading branch information
wismill committed Jun 14, 2024
1 parent c524fd9 commit a20ed73
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 59 deletions.
4 changes: 4 additions & 0 deletions experimental/icu/cbits/icu.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ int32_t __hs_u_charName( UChar32 codepoint
void __hs_u_charAge( UChar32 c, UVersionInfo versionArray ) {
u_charAge(c, versionArray);
}

int8_t __hs_u_charType(UChar32 c) {
return u_charType(c);
}
7 changes: 4 additions & 3 deletions experimental/icu/cbits/icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ int32_t __hs_u_charName( UChar32 codepoint
, UCharNameChoice nameChoice
, char * buffer
, int32_t bufferLength );
static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME;
static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS;

// typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
void __hs_u_charAge( UChar32 c, UVersionInfo versionArray );

static const int __hs_U_MAX_VERSION_LENGTH = U_MAX_VERSION_LENGTH;
static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME;
static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS;

int8_t __hs_u_charType(UChar32 c);

#endif
2 changes: 2 additions & 0 deletions experimental/icu/icu.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ library
extra-libraries: icuuc
pkgconfig-depends:
icu-uc >= 72.1
build-tool-depends:
c2hs:c2hs
136 changes: 136 additions & 0 deletions experimental/icu/lib/ICU/Char.chs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
-- |
-- Module : ICU.Char
-- Copyright : (c) 2023 Pierre Le Marre
-- License : Apache-2.0
-- Maintainer : [email protected]
-- Stability : experimental
--
-- Unicode character general properties
--
-- @since 0.3.0

module ICU.Char
( unicodeVersion
, charAge
, UGeneralCategory(..)
, toGeneralCategory
, charType
) where

#include <unicode/uchar.h>

import Data.Char (ord)
import qualified Data.Char as Char
import Data.Int (Int8)
import Data.Version (Version, makeVersion)
import Data.Word (Word32)
import Foreign (Ptr)
import Foreign.C (CInt)
import Foreign.Marshal.Array (allocaArray, peekArray)
import System.IO.Unsafe (unsafePerformIO)

type UChar32 = Word32

foreign import capi "icu.h value __hs_U_MAX_VERSION_LENGTH" maxVersionLength :: Int

foreign import ccall unsafe "icu.h __hs_u_getUnicodeVersion" u_getUnicodeVersion
:: Ptr Int8 -> IO ()

-- | ICU Unicode version
unicodeVersion :: Version
unicodeVersion
= makeVersion
. fmap fromIntegral
. unsafePerformIO
$ allocaArray
maxVersionLength
(\ptr -> u_getUnicodeVersion ptr *> peekArray maxVersionLength ptr)

foreign import ccall unsafe "icu.h __hs_u_charAge" u_charAge
:: UChar32 -> Ptr Int8 -> IO ()

-- | Character age
charAge :: Char -> Version
charAge c
= makeVersion
. fmap fromIntegral
. unsafePerformIO
$ allocaArray
maxVersionLength
(\ptr -> u_charAge cp ptr *> peekArray maxVersionLength ptr)
where
cp = fromIntegral (ord c)

foreign import ccall safe "icu.h __hs_u_charType" u_charType
:: UChar32 -> Int8

{#enum define UGeneralCategory {
U_UNASSIGNED as Unassigned,
U_UPPERCASE_LETTER as UppercaseLetter,
U_LOWERCASE_LETTER as LowercaseLetter,
U_TITLECASE_LETTER as TitlecaseLetter,
U_MODIFIER_LETTER as ModifierLetter,
U_OTHER_LETTER as OtherLetter,
U_NON_SPACING_MARK as NonSpacingMark,
U_ENCLOSING_MARK as EnclosingMark,
U_COMBINING_SPACING_MARK as CombiningSpacingMark,
U_DECIMAL_DIGIT_NUMBER as DecimalDigitNumber,
U_LETTER_NUMBER as LetterNumber,
U_OTHER_NUMBER as OtherNumber,
U_SPACE_SEPARATOR as SpaceSeparator,
U_LINE_SEPARATOR as LineSeparator,
U_PARAGRAPH_SEPARATOR as ParagraphSeparator,
U_CONTROL_CHAR as ControlChar,
U_FORMAT_CHAR as FormatChar,
U_PRIVATE_USE_CHAR as PrivateUseChar,
U_SURROGATE as Surrogate,
U_DASH_PUNCTUATION as DashPunctuation,
U_START_PUNCTUATION as StartPunctuation,
U_END_PUNCTUATION as EndPunctuation,
U_CONNECTOR_PUNCTUATION as ConnectorPunctuation,
U_OTHER_PUNCTUATION as OtherPunctuation,
U_MATH_SYMBOL as MathSymbol,
U_CURRENCY_SYMBOL as CurrencySymbol,
U_MODIFIER_SYMBOL as ModifierSymbol,
U_OTHER_SYMBOL as OtherSymbol,
U_INITIAL_PUNCTUATION as InitialPunctuation,
U_FINAL_PUNCTUATION as FinalPunctuation
}
deriving (Bounded, Eq, Ord, Show) #}

-- | General category
charType :: Char -> UGeneralCategory
charType = toEnum . fromIntegral . u_charType . fromIntegral . ord

toGeneralCategory :: UGeneralCategory -> Char.GeneralCategory
toGeneralCategory = \case
Unassigned -> Char.NotAssigned
UppercaseLetter -> Char.UppercaseLetter
LowercaseLetter -> Char.LowercaseLetter
TitlecaseLetter -> Char.TitlecaseLetter
ModifierLetter -> Char.ModifierLetter
OtherLetter -> Char.OtherLetter
NonSpacingMark -> Char.NonSpacingMark
EnclosingMark -> Char.EnclosingMark
CombiningSpacingMark -> Char.SpacingCombiningMark
DecimalDigitNumber -> Char.DecimalNumber
LetterNumber -> Char.LetterNumber
OtherNumber -> Char.OtherNumber
SpaceSeparator -> Char.Space
LineSeparator -> Char.LineSeparator
ParagraphSeparator -> Char.ParagraphSeparator
ControlChar -> Char.Control
FormatChar -> Char.Format
PrivateUseChar -> Char.PrivateUse
Surrogate -> Char.Surrogate
DashPunctuation -> Char.DashPunctuation
StartPunctuation -> Char.OpenPunctuation
EndPunctuation -> Char.ClosePunctuation
ConnectorPunctuation -> Char.ConnectorPunctuation
OtherPunctuation -> Char.OtherPunctuation
MathSymbol -> Char.MathSymbol
CurrencySymbol -> Char.CurrencySymbol
ModifierSymbol -> Char.ModifierSymbol
OtherSymbol -> Char.OtherSymbol
InitialPunctuation -> Char.InitialQuote
FinalPunctuation -> Char.FinalQuote
55 changes: 0 additions & 55 deletions experimental/icu/lib/ICU/Char.hsc

This file was deleted.

94 changes: 94 additions & 0 deletions unicode-data/test/ICU/CharSpec.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{-# LANGUAGE CPP, BlockArguments, GADTs #-}

module ICU.CharSpec
( spec
) where

import Control.Applicative (Alternative(..))
import Data.Foldable (traverse_)
import Data.Version (showVersion, versionBranch)
import Numeric (showHex)
import Test.Hspec
( describe
, expectationFailure
, it
, pendingWith
, Spec
, HasCallStack, SpecWith )

import qualified ICU.Char as ICU
import qualified Unicode.Char as U

spec :: Spec
spec = do
describe "General" do
checkAndGatherErrors
"charType"
(GeneralCategory . U.generalCategory)
(GeneralCategory . ICU.toGeneralCategory . ICU.charType)
-- TODO: other functions
where
ourUnicodeVersion = versionBranch U.unicodeVersion
theirUnicodeVersion = versionBranch ICU.unicodeVersion
showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c)

-- There is no feature to display warnings other than `trace`, so
-- hack our own:
-- 1. Compare given functions in pure code and gather warning & errors
-- 2. Create dummy spec that throw an expectation failure, if relevant.
-- 3. Create pending spec for each Char that raises a Unicode version
-- mismatch between ICU and unicode-data.
checkAndGatherErrors
:: forall a. (HasCallStack, Eq a, Show a)
=> String
-> (Char -> a)
-> (Char -> a)
-> SpecWith ()
checkAndGatherErrors label f fRef = do
it label (maybe (pure ()) expectationFailure err)
if null ws
then pure ()
else describe (label ++ " (Unicode version conflict)")
(traverse_ mkWarning ws)
where
Acc ws err = foldr check (Acc [] Nothing) [minBound..maxBound]
check c acc
-- Test passed
| n == nRef = acc
-- Unicode version mismatch: char is not mapped in one of the libs:
-- add warning.
| age' > ourUnicodeVersion || age' > theirUnicodeVersion
= acc{warnings=c : warnings acc}
-- Error
| otherwise =
let !msg = mconcat
[ showCodePoint c ": expected "
, show nRef
, ", got ", show n, "" ]
in acc{firstError = firstError acc <|> Just msg}
where
!n = f c
!nRef = fRef c
age = ICU.charAge c
age' = take 3 (versionBranch age)
mkWarning c = it (showCodePoint c "") . pendingWith $ mconcat
[ "Incompatible ICU Unicode version: expected "
, showVersion U.unicodeVersion
, ", got: "
, showVersion ICU.unicodeVersion
, " (ICU character age is: "
, showVersion (ICU.charAge c)
, ")" ]

-- | Helper to compare our GeneralCategory to 'Data.Char.GeneralCategory'.
data GeneralCategory = forall c. (Show c, Enum c) => GeneralCategory c

instance Show GeneralCategory where
show (GeneralCategory a) = show a

instance Eq GeneralCategory where
GeneralCategory a == GeneralCategory b = fromEnum a == fromEnum b

-- | Warning accumulator
data Acc = Acc { warnings :: ![Char], firstError :: !(Maybe String) }

11 changes: 10 additions & 1 deletion unicode-data/test/Main.hs
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
{-# LANGUAGE CPP #-}

module Main where

import Test.Hspec
import qualified Unicode.CharSpec
#ifdef HAS_ICU
import qualified ICU.CharSpec as ICU
#endif

main :: IO ()
main = hspec spec

spec :: Spec
spec = describe "Unicode.Char" Unicode.CharSpec.spec
spec = do
describe "Unicode.Char" Unicode.CharSpec.spec
#ifdef HAS_ICU
describe "ICU.Char" ICU.spec
#endif
12 changes: 12 additions & 0 deletions unicode-data/unicode-data.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ common compile-options
-fwarn-tabs
default-language: Haskell2010

flag dev-has-icu
description:
Use ICU for test and benchmark. Intended for development on the repository.
manual: True
default: False

library
import: default-extensions, compile-options
exposed-modules:
Expand Down Expand Up @@ -125,6 +131,12 @@ test-suite test
base >= 4.7 && < 4.21
, hspec >= 2.0 && < 2.12
, unicode-data
if flag(dev-has-icu)
cpp-options: -DHAS_ICU
other-modules:
ICU.CharSpec
build-depends:
icu

benchmark bench
import: default-extensions, compile-options
Expand Down

0 comments on commit a20ed73

Please sign in to comment.