Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Char label & misc. #133

Merged
merged 8 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions experimental/icu/cbits/icu.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ void __hs_u_getUnicodeVersion(UVersionInfo versionArray) {
u_getUnicodeVersion(versionArray);
}

/*******************************************************************************
* Properties
******************************************************************************/

bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which) {
return u_hasBinaryProperty(c, which);
}

/*******************************************************************************
* Names
******************************************************************************/
Expand Down
6 changes: 6 additions & 0 deletions experimental/icu/cbits/icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@

void __hs_u_getUnicodeVersion(UVersionInfo versionArray);

/*******************************************************************************
* Properties
******************************************************************************/

bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which);

/*******************************************************************************
* Names
******************************************************************************/
Expand Down
2 changes: 1 addition & 1 deletion experimental/icu/icu.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,4 @@ library
pkgconfig-depends:
icu-uc >= 72.1
build-tool-depends:
c2hs:c2hs
c2hs:c2hs >= 0.28.8
19 changes: 19 additions & 0 deletions experimental/icu/lib/ICU/Char.chs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ module ICU.Char
, UGeneralCategory(..)
, toGeneralCategory
, charType
, isNoncharacter
) where

#include <unicode/uchar.h>
Expand Down Expand Up @@ -134,3 +135,21 @@ toGeneralCategory = \case
OtherSymbol -> Char.OtherSymbol
InitialPunctuation -> Char.InitialQuote
FinalPunctuation -> Char.FinalQuote

{#enum define UProperty {
UCHAR_NONCHARACTER_CODE_POINT as NoncharacterCodePoint
}
deriving (Bounded, Eq, Ord, Show) #}

foreign import ccall safe "icu.h __hs_u_hasBinaryProperty" u_hasBinaryProperty
:: UChar32 -> Int -> Bool

-- hasBinaryProperty :: UChar32 -> Int -> Bool
-- hasBinaryProperty = {#call pure u_hasBinaryProperty as __hs_u_hasBinaryProperty#}
-- {#fun pure u_hasBinaryProperty as hasBinaryProperty
-- {`UChar32', `Int'} -> `Bool' #}

isNoncharacter :: Char -> Bool
isNoncharacter c = u_hasBinaryProperty
(fromIntegral (ord c))
(fromEnum NoncharacterCodePoint)
11 changes: 6 additions & 5 deletions unicode-data-names/Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,23 @@

## 0.4.0 (July 2024)

- Update to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
- Updated to [Unicode 15.1.0](https://www.unicode.org/versions/Unicode15.1.0/).
- Added `label` and `nameOrLabel` to `Unicode.Char.General.Names`.

## 0.3.0 (July 2024)

- Improve performance.
- Improved performance.
- Added opional support for `ByteString` API.
Use the package flag `has-bytestring` to enable it.
- Added opional support for `Text` API.
Use the package flag `has-text` to enable it.
- Add `unicodeVersion` to `Unicode.Char.General.Names`.
- Fix the inlining of `Addr#` literals and reduce their size. This results in
- Added `unicodeVersion` to `Unicode.Char.General.Names`.
- Fixed the inlining of `Addr#` literals and reduce their size. This results in
a sensible decrease of the executable size.

## 0.2.0 (September 2022)

- Update to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).
- Updated to [Unicode 15.0.0](https://www.unicode.org/versions/Unicode15.0.0/).

## 0.1.0 (June 2022)

Expand Down
10 changes: 10 additions & 0 deletions unicode-data-names/bench/Main.hs
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@ benchmarks charRange charFilter = bgroup "All"
]
#endif
]
, bgroup "nameOrLabel"
[ bgroup' "nameOrLabel" "String"
[ Bench "unicode-data" String.nameOrLabel
]
]
, bgroup "nameAliasesByType"
[ bgroup' "nameAliasesByType" "String"
[ Bench "unicode-data"
Expand Down Expand Up @@ -220,6 +225,11 @@ benchmarks charRange charFilter = bgroup "All"
]
#endif
]
, bgroup "label"
[ bgroup' "label" "String"
[ Bench "unicode-data" String.label
]
]
]
]
where
Expand Down
60 changes: 54 additions & 6 deletions unicode-data-names/lib/Unicode/Char/General/Names.hs
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,50 @@
-- @since 0.1.0

module Unicode.Char.General.Names
( -- Unicode version
( -- * Unicode version
unicodeVersion

-- * Name
, name
, nameOrAlias
, nameOrLabel
, correctedName

-- * Name Aliases
, NameAliases.NameAliasType(..)
, nameAliases
, nameAliasesByType
, nameAliasesWithTypes

-- * Label
, label
) where

import Control.Applicative ((<|>))
import GHC.Exts
( Addr#, Char(..), Char#, Int#
, indexCharOffAddr#, plusAddr#, (+#), (-#), (<#), isTrue#, quotRemInt#
, dataToTag#, ord# )
import Control.Monad ((>=>))
import Foreign.C.String (peekCAStringLen)
import GHC.Exts (
Addr#,
Char (..),
Char#,
Int#,
dataToTag#,
indexCharOffAddr#,
isTrue#,
ord#,
plusAddr#,
quotRemInt#,
(+#),
(-#),
(<#),
)
import System.IO.Unsafe (unsafeDupablePerformIO)

import Unicode.Internal.Bits.Names (unpackNBytes#)
import qualified Unicode.Internal.Char.Label as Label
import Unicode.Internal.Char.Names.Version (unicodeVersion)
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
import qualified Unicode.Internal.Char.UnicodeData.NameAliases as NameAliases
import Unicode.Internal.Char.Names.Version (unicodeVersion)

-- | Name of a character, if defined.
--
Expand Down Expand Up @@ -121,6 +142,15 @@ nameOrAlias c@(C# c#) = name c <|> case indexCharOffAddr# addr# 0# of
0# -> go (t# +# 1#)
i# -> unpackNBytes'# (addr# `plusAddr#` i#)

-- | Returns a character’s 'name' if defined,
-- otherwise returns its label between angle brackets.
--
-- @since 0.4.0
nameOrLabel :: Char -> String
nameOrLabel c = case name c of
Nothing -> '<' : label c ++ ">"
Just n -> n

-- | All name aliases of a character, if defined.
-- The names are listed in the original order of the UCD.
--
Expand Down Expand Up @@ -189,6 +219,24 @@ nameAliasesByType# addr# t = case indexCharOffAddr# (addr# `plusAddr#` t#) 0# of
i# -> unpackCStrings# (addr# `plusAddr#` ord# i#)
where t# = dataToTag# t

-- | Returns the label of a code point if it has no character name, otherwise
-- returns @\"UNDEFINED\"@.
--
-- See subsection
-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
-- in section 4.8 “Name” of the Unicode Standard.
--
-- >>> label '\0'
-- "control-0000"
-- >>> label 'a'
-- "UNDEFINED"
-- >>> label '\xffff'
-- "noncharacter-FFFF"
--
-- @since 0.4.0
label :: Char -> String
label = unsafeDupablePerformIO . (Label.label >=> peekCAStringLen)

{-# INLINE unpackCStrings# #-}
unpackCStrings# :: Addr# -> [String]
unpackCStrings# = go
Expand Down
82 changes: 82 additions & 0 deletions unicode-data-names/lib/Unicode/Internal/Char/Label.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
-- |
-- Module : Unicode.Char
-- Copyright : (c) 2024 Composewell Technologies and Contributors
-- License : Apache-2.0
-- Maintainer : [email protected]
-- Stability : experimental

module Unicode.Internal.Char.Label
( label
, addHexCodePoint
, intToDigiT
) where

import Data.Char (ord)
import Data.Functor (($>))
import Foreign.C.String (CString, CStringLen)
import Foreign.C.Types (CChar (..))
import Foreign.Marshal (allocaArray, copyArray)
import Foreign.Storable (Storable (..))
import GHC.Exts (Int (..), Int#, Ptr (..), isTrue#, quotRemInt#, (+#), (-#), (<=#))
import Unicode.Char.General (CodePointType (..), codePointType)

-- | Returns the label of a code point if it has no character name, otherwise
-- returns @\"UNDEFINED\"@.
--
-- See subsection
-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248)
-- in section 4.8 “Name” of the Unicode Standard.
--
-- @since 0.4.0
label :: Char -> IO CStringLen
label c = case codePointType c of
ControlType -> mkLabel 8# "control-"#
PrivateUseType -> mkLabel 12# "private-use-"#
SurrogateType -> mkLabel 10# "surrogate-"#
NoncharacterType -> mkLabel 13# "noncharacter-"#
ReservedType -> mkLabel 9# "reserved-"#
_ -> pure (Ptr "UNDEFINED"#, 9)

where

mkLabel len s0 = allocaArray (I# len + 6) $ \s -> do
copyArray s (Ptr s0) (I# len)
len' <- addHexCodePoint s len len c
pure (s, len')

-- | Appned the code point of a character using the Unicode Standard convention:
-- hexadecimal codepoint padded with zeros if inferior to 4 characters.
--
-- It is the responsability of the caller to provide a 'CString' that can hold
-- up to 6 characters from the provided index.
addHexCodePoint
:: CString -- ^ Destination ASCII string
-> Int# -- ^ String length
-> Int# -- ^ Index
-> Char -- ^ Character which code point will be added to the string
-> IO Int -- ^ New size of the string
addHexCodePoint s len i0 c
| isTrue# (cp# <=# 0x0000f#) = prependAt 3# <* pad0 0# <* pad0 1# <* pad0 2#
| isTrue# (cp# <=# 0x000ff#) = prependAt 3# <* pad0 0# <* pad0 1#
| isTrue# (cp# <=# 0x00fff#) = prependAt 3# <* pad0 0#
| isTrue# (cp# <=# 0x0ffff#) = prependAt 3#
| isTrue# (cp# <=# 0xfffff#) = prependAt 4#
| otherwise = prependAt 5#
where
!(I# cp#) = ord c
pad0 i = pokeElemOff s (I# (i0 +# i)) (CChar 0x30)
prependAt i = go (i0 +# i) (quotRemInt# cp# 16#) $> I# (len +# i +# 1#)
go i (# n#, d #) = do
pokeElemOff s (I# i) (intToDigiT d)
case n# of
0# -> pure ()
_ -> go (i -# 1#) (quotRemInt# n# 16#)

-- | Convert an 'Int#' in the range 0..15 to the corresponding single digit
-- 'CChar' in upper case.
--
-- Undefined for numbers outside the 0..15 range.
intToDigiT :: Int# -> CChar
intToDigiT i = if isTrue# (i <=# 9#)
then fromIntegral (I# (0x30# +# i))
else fromIntegral (I# (0x37# +# i))
48 changes: 40 additions & 8 deletions unicode-data-names/test/Unicode/Char/General/NamesSpec.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,25 @@ module Unicode.Char.General.NamesSpec
( spec
) where

import GHC.Exts (Char(..), isTrue#, (<#), ord#, andI#)
import Unicode.Char.General
( generalCategory,
GeneralCategory(NotAssigned, Surrogate, PrivateUse) )
import Unicode.Char.General.Names
( NameAliasType (..), correctedName, name, nameOrAlias, nameAliasesWithTypes, nameAliases, nameAliasesByType )
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName
import Data.Foldable (traverse_)
import Test.Hspec ( Spec, it, shouldBe, shouldSatisfy, describe )
import GHC.Exts (Char (..), andI#, isTrue#, ord#, (<#))
import Test.Hspec (Spec, describe, it, shouldBe, shouldSatisfy)
import Unicode.Char.General (
GeneralCategory (NotAssigned, PrivateUse, Surrogate),
generalCategory,
)

import Unicode.Char.General.Names (
NameAliasType (..),
correctedName,
label,
name,
nameAliases,
nameAliasesByType,
nameAliasesWithTypes,
nameOrAlias,
)
import qualified Unicode.Internal.Char.UnicodeData.DerivedName as DerivedName

spec :: Spec
spec = do
Expand Down Expand Up @@ -132,3 +142,25 @@ spec = do
NotAssigned -> True
_ -> False
traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]
describe "label" do
it "Some characters" do
label '\x0000' `shouldBe` "control-0000"
label '\x009F' `shouldBe` "control-009F"
label 'a' `shouldBe` "UNDEFINED"
label '1' `shouldBe` "UNDEFINED"
label '\x1D0C5' `shouldBe` "UNDEFINED"
label '\x2F89F' `shouldBe` "UNDEFINED"
label '\xE000' `shouldBe` "private-use-E000"
label '\x10FFFD' `shouldBe` "private-use-10FFFD"
label '\xD800' `shouldBe` "surrogate-D800"
label '\xDFFF' `shouldBe` "surrogate-DFFF"
label '\xFDD0' `shouldBe` "noncharacter-FDD0"
label '\x10FFFF' `shouldBe` "noncharacter-10FFFF"
label '\x0378' `shouldBe` "reserved-0378"
label '\x1FFFD' `shouldBe` "reserved-1FFFD"
label '\xEFFFD' `shouldBe` "reserved-EFFFD"
it "Every character has either a name or a label" do
let checkName c = case name c of
Just _ -> True
Nothing -> label c /= "UNDEFINED"
traverse_ (`shouldSatisfy` checkName) [minBound..maxBound]
4 changes: 3 additions & 1 deletion unicode-data-names/unicode-data-names.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -94,16 +94,18 @@ library
-- Generated files
-- This module structure is largely based on the UCD file names from which
-- the properties are generated.
Unicode.Internal.Char.Label
Unicode.Internal.Char.Names.Version
Unicode.Internal.Char.UnicodeData.DerivedName
Unicode.Internal.Char.UnicodeData.NameAliases
Unicode.Internal.Char.Names.Version
other-modules:
-- Internal files
Unicode.Internal.Bits.Names

hs-source-dirs: lib
build-depends:
base >= 4.7 && < 4.21,
unicode-data >= 0.6 && < 0.7
-- Support for raw string literals unpacking is included in base ≥ 4.15
if impl(ghc < 9.0.0)
build-depends:
Expand Down
Loading
Loading