composewell · wismill · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/experimental/icu/cbits/icu.c b/experimental/icu/cbits/icu.c
@@ -7,6 +7,14 @@ void __hs_u_getUnicodeVersion(UVersionInfo versionArray) {
     u_getUnicodeVersion(versionArray);
 }
 
+/*******************************************************************************
+ * Properties
+ ******************************************************************************/
+
+bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which) {
+    return u_hasBinaryProperty(c, which);
+}
+
 /*******************************************************************************
  * Names
  ******************************************************************************/

diff --git a/experimental/icu/cbits/icu.h b/experimental/icu/cbits/icu.h
@@ -7,6 +7,12 @@
 
 void __hs_u_getUnicodeVersion(UVersionInfo versionArray);
 
+/*******************************************************************************
+ * Properties
+ ******************************************************************************/
+
+bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which);
+
 /*******************************************************************************
  * Names
  ******************************************************************************/

diff --git a/experimental/icu/icu.cabal b/experimental/icu/icu.cabal
@@ -73,4 +73,4 @@ library
   pkgconfig-depends:
     icu-uc >= 72.1
   build-tool-depends:
-      c2hs:c2hs
+      c2hs:c2hs >= 0.28.8
diff --git a/experimental/icu/lib/ICU/Char.chs b/experimental/icu/lib/ICU/Char.chs
@@ -15,6 +15,7 @@ module ICU.Char
     , UGeneralCategory(..)
     , toGeneralCategory
     , charType
+    , isNoncharacter
     ) where
 
 #include <unicode/uchar.h>
@@ -134,3 +135,21 @@ toGeneralCategory = \case
     OtherSymbol -> Char.OtherSymbol
     InitialPunctuation -> Char.InitialQuote
     FinalPunctuation -> Char.FinalQuote
+
+{#enum define UProperty {
+    UCHAR_NONCHARACTER_CODE_POINT as NoncharacterCodePoint
+    }
+    deriving (Bounded, Eq, Ord, Show) #}
+
+foreign import ccall safe "icu.h __hs_u_hasBinaryProperty" u_hasBinaryProperty
+    :: UChar32 -> Int -> Bool
+
+-- hasBinaryProperty :: UChar32 -> Int -> Bool
+-- hasBinaryProperty = {#call pure u_hasBinaryProperty as __hs_u_hasBinaryProperty#}
+-- {#fun pure u_hasBinaryProperty as hasBinaryProperty
+--     {`UChar32', `Int'} -> `Bool' #}
+
+isNoncharacter :: Char -> Bool
+isNoncharacter c = u_hasBinaryProperty
+    (fromIntegral (ord c))
+    (fromEnum NoncharacterCodePoint)
diff --git a/unicode-data-names/Changelog.md b/unicode-data-names/Changelog.md
@@ -2,22 +2,23 @@
 
 ## 0.4.0 (July 2024)
 
-- Update to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
+- Updated to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
+- Added `label` and `nameOrLabel` to `Unicode.Char.General.Names`.
 
 ## 0.3.0 (July 2024)
 
-- Improve performance.
+- Improved performance.
 - Added opional support for `ByteString` API.
   Use the package flag `has-bytestring` to enable it.
 - Added opional support for `Text` API.
   Use the package flag `has-text` to enable it.
-- Add `unicodeVersion` to `Unicode.Char.General.Names`.
-- Fix the inlining of `Addr#` literals and reduce their size. This results in
+- Added `unicodeVersion` to `Unicode.Char.General.Names`.
+- Fixed the inlining of `Addr#` literals and reduce their size. This results in
   a sensible decrease of the executable size.
 
 ## 0.2.0 (September 2022)
 
-- Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).
+- Updated to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).
 
 ## 0.1.0 (June 2022)
 

diff --git a/unicode-data-names/bench/Main.hs b/unicode-data-names/bench/Main.hs
@@ -172,6 +172,11 @@ benchmarks charRange charFilter = bgroup "All"
                 ]
 #endif
             ]
+        , bgroup "nameOrLabel"
+            [ bgroup' "nameOrLabel" "String"
+                [ Bench "unicode-data" String.nameOrLabel
+                ]
+            ]
         , bgroup "nameAliasesByType"
             [ bgroup' "nameAliasesByType" "String"
                 [ Bench "unicode-data"
@@ -220,6 +225,11 @@ benchmarks charRange charFilter = bgroup "All"
                 ]
 #endif
             ]
+        , bgroup "label"
+            [ bgroup' "label" "String"
+                [ Bench "unicode-data"  String.label
+                ]
+            ]
         ]
     ]
   where

diff --git a/unicode-data-names/lib/Unicode/Char/General/Names.hs b/unicode-data-names/lib/Unicode/Char/General/Names.hs
@@ -16,29 +16,50 @@
 -- @since 0.1.0
 
 module Unicode.Char.General.Names
-    ( -- Unicode version
+    ( -- * Unicode version
       unicodeVersion
+
       -- * Name
     , name
     , nameOrAlias
+    , nameOrLabel
     , correctedName
+
       -- * Name Aliases
     , NameAliases.NameAliasType(..)
     , nameAliases
     , nameAliasesByType
     , nameAliasesWithTypes
+
+      -- * Label
+    , label
     ) where
 
 import Control.Applicative ((<|>))
-import GHC.Exts
-    ( Addr#, Char(..), Char#, Int#
-    , indexCharOffAddr#, plusAddr#, (+#), (-#), (<#), isTrue#, quotRemInt#
-    , dataToTag#, ord# )
+import Control.Monad ((>=>))
+import Foreign.C.String (peekCAStringLen)
+import GHC.Exts (
+    Addr#,
+    Char (..),
+    Char#,
+    Int#,
+    dataToTag#,
+    indexCharOffAddr#,
+    isTrue#,
+    ord#,
+    plusAddr#,
+    quotRemInt#,
+    (+#),
+    (-#),
+    (<#),
+ )
+import System.IO.Unsafe (unsafeDupablePerformIO)
 
 import Unicode.Internal.Bits.Names (unpackNBytes#)
+import qualified Unicode.Internal.Char.Label as Label
+import Unicode.Internal.Char.Names.Version (unicodeVersion)
 import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
 import qualified Unicode.Internal.Char.UnicodeData.NameAliases as NameAliases
-import Unicode.Internal.Char.Names.Version (unicodeVersion)
 
 -- | Name of a character, if defined.
 --
@@ -121,6 +142,15 @@ nameOrAlias c@(C# c#) = name c <|> case indexCharOffAddr# addr# 0# of
         0# -> go (t# +# 1#)
         i# -> unpackNBytes'# (addr# `plusAddr#` i#)
 
+-- | Returns a character’s 'name' if defined,
+-- otherwise returns its label between angle brackets.
+--
+-- @since 0.4.0
+nameOrLabel :: Char -> String
+nameOrLabel c = case name c of
+    Nothing -> '<' : label c ++ ">"
+    Just n  -> n
+
 -- | All name aliases of a character, if defined.
 -- The names are listed in the original order of the UCD.
 --
@@ -189,6 +219,24 @@ nameAliasesByType# addr# t = case indexCharOffAddr# (addr# `plusAddr#` t#) 0# of
     i#    -> unpackCStrings# (addr# `plusAddr#` ord# i#)
     where t# = dataToTag# t
 
+-- | Returns the label of a code point if it has no character name, otherwise
+-- returns @\"UNDEFINED\"@.
+--
+-- See subsection
+-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
+-- in section 4.8 “Name” of the Unicode Standard.
+--
+-- >>> label '\0'
+-- "control-0000"
+-- >>> label 'a'
+-- "UNDEFINED"
+-- >>> label '\xffff'
+-- "noncharacter-FFFF"
+--
+-- @since 0.4.0
+label :: Char -> String
+label = unsafeDupablePerformIO . (Label.label >=> peekCAStringLen)
+
 {-# INLINE unpackCStrings# #-}
 unpackCStrings# :: Addr# -> [String]
 unpackCStrings# = go

diff --git a/unicode-data-names/lib/Unicode/Internal/Char/Label.hs b/unicode-data-names/lib/Unicode/Internal/Char/Label.hs
@@ -0,0 +1,82 @@
+-- |
+-- Module      : Unicode.Char
+-- Copyright   : (c) 2024 Composewell Technologies and Contributors
+-- License     : Apache-2.0
+-- Maintainer  : [email protected]
+-- Stability   : experimental
+
+module Unicode.Internal.Char.Label
+    ( label
+    , addHexCodePoint
+    , intToDigiT
+    ) where
+
+import Data.Char (ord)
+import Data.Functor (($>))
+import Foreign.C.String (CString, CStringLen)
+import Foreign.C.Types (CChar (..))
+import Foreign.Marshal (allocaArray, copyArray)
+import Foreign.Storable (Storable (..))
+import GHC.Exts (Int (..), Int#, Ptr (..), isTrue#, quotRemInt#, (+#), (-#), (<=#))
+import Unicode.Char.General (CodePointType (..), codePointType)
+
+-- | Returns the label of a code point if it has no character name, otherwise
+-- returns @\"UNDEFINED\"@.
+--
+-- See subsection
+-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
+-- in section 4.8 “Name” of the Unicode Standard.
+--
+-- @since 0.4.0
+label :: Char -> IO CStringLen
+label c = case codePointType c of
+    ControlType      -> mkLabel 8#  "control-"#
+    PrivateUseType   -> mkLabel 12# "private-use-"#
+    SurrogateType    -> mkLabel 10# "surrogate-"#
+    NoncharacterType -> mkLabel 13# "noncharacter-"#
+    ReservedType     -> mkLabel 9#  "reserved-"#
+    _                -> pure (Ptr "UNDEFINED"#, 9)
+
+    where
+
+    mkLabel len s0 = allocaArray (I# len + 6) $ \s -> do
+        copyArray s (Ptr s0) (I# len)
+        len' <- addHexCodePoint s len len c
+        pure (s, len')
+
+-- | Appned the code point of a character using the Unicode Standard convention:
+-- hexadecimal codepoint padded with zeros if inferior to 4 characters.
+--
+-- It is the responsability of the caller to provide a 'CString' that can hold
+-- up to 6 characters from the provided index.
+addHexCodePoint
+    :: CString -- ^ Destination ASCII string
+    -> Int#    -- ^ String length
+    -> Int#    -- ^ Index
+    -> Char    -- ^ Character which code point will be added to the string
+    -> IO Int  -- ^ New size of the string
+addHexCodePoint s len i0 c
+    | isTrue# (cp# <=# 0x0000f#) = prependAt 3# <* pad0 0# <* pad0 1# <* pad0 2#
+    | isTrue# (cp# <=# 0x000ff#) = prependAt 3# <* pad0 0# <* pad0 1#
+    | isTrue# (cp# <=# 0x00fff#) = prependAt 3# <* pad0 0#
+    | isTrue# (cp# <=# 0x0ffff#) = prependAt 3#
+    | isTrue# (cp# <=# 0xfffff#) = prependAt 4#
+    | otherwise                  = prependAt 5#
+    where
+    !(I# cp#) = ord c
+    pad0 i = pokeElemOff s (I# (i0 +# i)) (CChar 0x30)
+    prependAt i = go (i0 +# i) (quotRemInt# cp# 16#) $> I# (len +# i +# 1#)
+    go i (# n#, d #) = do
+        pokeElemOff s (I# i) (intToDigiT d)
+        case n# of
+            0# -> pure ()
+            _  -> go (i -# 1#) (quotRemInt# n# 16#)
+
+-- | Convert an 'Int#' in the range 0..15 to the corresponding single digit
+-- 'CChar' in upper case.
+--
+-- Undefined for numbers outside the 0..15 range.
+intToDigiT :: Int# -> CChar
+intToDigiT i = if isTrue# (i <=# 9#)
+    then fromIntegral (I# (0x30# +# i))
+    else fromIntegral (I# (0x37# +# i))
diff --git a/unicode-data-names/test/Unicode/Char/General/NamesSpec.hs b/unicode-data-names/test/Unicode/Char/General/NamesSpec.hs
@@ -4,15 +4,25 @@ module Unicode.Char.General.NamesSpec
   ( spec
   ) where
 
-import GHC.Exts (Char(..), isTrue#, (<#), ord#, andI#)
-import Unicode.Char.General
-    ( generalCategory,
-      GeneralCategory(NotAssigned, Surrogate, PrivateUse) )
-import Unicode.Char.General.Names
-    ( NameAliasType (..), correctedName, name, nameOrAlias, nameAliasesWithTypes, nameAliases, nameAliasesByType )
-import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
 import Data.Foldable (traverse_)
-import Test.Hspec ( Spec, it, shouldBe, shouldSatisfy, describe )
+import GHC.Exts (Char (..), andI#, isTrue#, ord#, (<#))
+import Test.Hspec (Spec, describe, it, shouldBe, shouldSatisfy)
+import Unicode.Char.General (
+    GeneralCategory (NotAssigned, PrivateUse, Surrogate),
+    generalCategory,
+ )
+
+import Unicode.Char.General.Names (
+    NameAliasType (..),
+    correctedName,
+    label,
+    name,
+    nameAliases,
+    nameAliasesByType,
+    nameAliasesWithTypes,
+    nameOrAlias,
+ )
+import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
 
 spec :: Spec
 spec = do
@@ -132,3 +142,25 @@ spec = do
                             NotAssigned -> True
                             _           -> False
         traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]
+    describe "label" do
+        it "Some characters" do
+            label '\x0000'   `shouldBe` "control-0000"
+            label '\x009F'   `shouldBe` "control-009F"
+            label 'a'        `shouldBe` "UNDEFINED"
+            label '1'        `shouldBe` "UNDEFINED"
+            label '\x1D0C5'  `shouldBe` "UNDEFINED"
+            label '\x2F89F'  `shouldBe` "UNDEFINED"
+            label '\xE000'   `shouldBe` "private-use-E000"
+            label '\x10FFFD' `shouldBe` "private-use-10FFFD"
+            label '\xD800'   `shouldBe` "surrogate-D800"
+            label '\xDFFF'   `shouldBe` "surrogate-DFFF"
+            label '\xFDD0'   `shouldBe` "noncharacter-FDD0"
+            label '\x10FFFF' `shouldBe` "noncharacter-10FFFF"
+            label '\x0378'   `shouldBe` "reserved-0378"
+            label '\x1FFFD'  `shouldBe` "reserved-1FFFD"
+            label '\xEFFFD'  `shouldBe` "reserved-EFFFD"
+    it "Every character has either a name or a label" do
+        let checkName c = case name c of
+                        Just _  -> True
+                        Nothing -> label c /= "UNDEFINED"
+        traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]
diff --git a/unicode-data-names/unicode-data-names.cabal b/unicode-data-names/unicode-data-names.cabal
@@ -94,16 +94,18 @@ library
       -- Generated files
       -- This module structure is largely based on the UCD file names from which
       -- the properties are generated.
+      Unicode.Internal.Char.Label
+      Unicode.Internal.Char.Names.Version
       Unicode.Internal.Char.UnicodeData.DerivedName
       Unicode.Internal.Char.UnicodeData.NameAliases
-      Unicode.Internal.Char.Names.Version
   other-modules:
       -- Internal files
       Unicode.Internal.Bits.Names
 
   hs-source-dirs: lib
   build-depends:
       base             >= 4.7   && < 4.21,
+      unicode-data     >= 0.6   && < 0.7
   -- Support for raw string literals unpacking is included in base ≥ 4.15
   if impl(ghc < 9.0.0)
     build-depends: