From bf8bb533650119b3ce196aa95b009c6f4ebc1f04 Mon Sep 17 00:00:00 2001 From: Wismill Date: Wed, 28 Sep 2022 10:41:29 +0200 Subject: [PATCH] Last fixes before release (#94) * Remove allBlockRanges * When downloading files, create a separate directory for each Unicode version. This facilitate the work on various Unicode versions. * Improve docs --- README.md | 2 +- .../lib/Unicode/Text/Case.hs | 7 ++++ .../unicode-data-text/unicode-data-text.cabal | 4 +-- ucd.sh | 4 +-- unicode-data/bench/Main.hs | 3 -- unicode-data/exe/Parser/Text.hs | 10 +----- unicode-data/lib/Unicode/Char.hs | 2 +- unicode-data/lib/Unicode/Char/Case.hs | 34 ++++++++++++++++--- unicode-data/lib/Unicode/Char/Case/Compat.hs | 11 +++++- .../lib/Unicode/Char/General/Blocks.hs | 11 +----- unicode-data/lib/Unicode/Char/Numeric.hs | 6 ++-- .../lib/Unicode/Char/Numeric/Compat.hs | 1 + .../lib/Unicode/Internal/Char/Blocks.hs | 10 +----- 13 files changed, 60 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 329010b7..b63dcb26 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ See `unicode-data`’s [guide](unicode-data/README.md#unicode-database-version-u ## Licensing `unicode-data*` packages are an [open source](https://github.com/composewell/unicode-data) -project available under a liberal [Apache-2.0 license](LICENSE). +project available under a liberal [Apache-2.0 license](unicode-data/LICENSE). ## Contributing diff --git a/experimental/unicode-data-text/lib/Unicode/Text/Case.hs b/experimental/unicode-data-text/lib/Unicode/Text/Case.hs index e62b5c2a..090b4ed0 100644 --- a/experimental/unicode-data-text/lib/Unicode/Text/Case.hs +++ b/experimental/unicode-data-text/lib/Unicode/Text/Case.hs @@ -55,14 +55,17 @@ streamUnfold (C.Unfold step inject) = \case caseConvertStream :: C.Unfold Char Char -> T.Text -> T.Text caseConvertStream u t = TF.unstream (streamUnfold u (TF.stream t)) +-- | Convert to full upper case using 'T.Text' fusion. {-# INLINE toUpperFusion #-} toUpperFusion :: T.Text -> T.Text toUpperFusion = caseConvertStream C.upperCaseMapping +-- | Convert to full lower case using 'T.Text' fusion. {-# INLINE toLowerFusion #-} toLowerFusion :: T.Text -> T.Text toLowerFusion = caseConvertStream C.lowerCaseMapping +-- | Convert to full case fold using 'T.Text' fusion. {-# INLINE toCaseFoldFusion #-} toCaseFoldFusion :: T.Text -> T.Text toCaseFoldFusion = caseConvertStream C.caseFoldMapping @@ -105,6 +108,7 @@ streamUnfoldToTitle = case C.lowerCaseMapping of C.Yield c st' -> TF.Yield c (CC2 s st') {-# INLINE [0] streamUnfoldToTitle #-} +-- | Convert to full title case using 'T.Text' fusion. {-# INLINE toTitleFusion #-} toTitleFusion :: T.Text -> T.Text toTitleFusion = TF.unstream . streamUnfoldToTitle . TF.stream @@ -189,18 +193,21 @@ caseConvertText ascii (C.Unfold (step :: u -> C.Step u Char) inject) (T.Text src writeMapping (step st) (dstOff + d) {-# INLINE caseConvertText #-} +-- | Convert to full upper case /without/ 'T.Text' fusion. {-# INLINE toUpper #-} toUpper :: T.Text -> T.Text toUpper = caseConvertText (\w -> if w - 97 <= 25 then w - 32 else w) C.upperCaseMapping +-- | Convert to full lower case /without/ 'T.Text' fusion. {-# INLINE toLower #-} toLower :: T.Text -> T.Text toLower = caseConvertText (\w -> if w - 65 <= 25 then w + 32 else w) C.lowerCaseMapping +-- | Convert to full case fold /without/ 'T.Text' fusion. {-# INLINE toCaseFold #-} toCaseFold :: T.Text -> T.Text toCaseFold = caseConvertText diff --git a/experimental/unicode-data-text/unicode-data-text.cabal b/experimental/unicode-data-text/unicode-data-text.cabal index 30f20c38..5ba7bde0 100644 --- a/experimental/unicode-data-text/unicode-data-text.cabal +++ b/experimental/unicode-data-text/unicode-data-text.cabal @@ -4,8 +4,8 @@ version: 0.1.0 synopsis: Unicode features for “text” package description: @unicode-data-text@ provides Unicode features from - package - for the package. + package + for the package. homepage: http://github.com/composewell/unicode-data bug-reports: https://github.com/composewell/unicode-data/issues license: Apache-2.0 diff --git a/ucd.sh b/ucd.sh index 71ef41fc..8a428f26 100755 --- a/ucd.sh +++ b/ucd.sh @@ -44,7 +44,7 @@ SECURITY_FILES="\ # and verify the $checksum if $VERIFY_CHECKSUM is enabled # $1 = file:checksum download_file() { - local directory="data/$1" + local directory="data/$VERSION/$1" local url="$2" local pair="$3" local file="$(echo "$pair" | cut -f1 -d':')" @@ -81,7 +81,7 @@ download_files() { run_generator() { # Compile and run ucd2haskell cabal run --flag ucd2haskell ucd2haskell -- \ - --input ./data/ \ + --input "./data/$VERSION" \ --output-core ./unicode-data/lib/ \ --output-names ./unicode-data-names/lib/ \ --output-scripts ./unicode-data-scripts/lib/ \ diff --git a/unicode-data/bench/Main.hs b/unicode-data/bench/Main.hs index 8e1b0cba..35df7c93 100644 --- a/unicode-data/bench/Main.hs +++ b/unicode-data/bench/Main.hs @@ -134,9 +134,6 @@ main = defaultMain , bgroup "blockDefinition" [ benchRangeNF "unicode-data" (show . B.blockDefinition) ] - , bgroup "allBlockRanges" - [ benchChars "unicode-data" (const B.allBlockRanges) - ] ] , bgroup "Unicode.Char.General.Compat" [ bgroup' "isAlpha" diff --git a/unicode-data/exe/Parser/Text.hs b/unicode-data/exe/Parser/Text.hs index 75dfd98f..0836c599 100644 --- a/unicode-data/exe/Parser/Text.hs +++ b/unicode-data/exe/Parser/Text.hs @@ -315,7 +315,7 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial , "{-# OPTIONS_HADDOCK hide #-}" , "" , "module " <> moduleName - , "(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges)" + , "(Block(..), BlockDefinition(..), block, blockDefinition)" , "where" , "" , "import Data.Ix (Ix)" @@ -344,14 +344,6 @@ genBlocksModule moduleName = done <$> Fold.foldl' step initial , "blockDefinition :: Block -> BlockDefinition" , "blockDefinition b = case b of" , mconcat (reverse defs) - , "-- | All the block ranges, in ascending order." - , "--" - , "-- @since 0.3.1" - , "{-# INLINE allBlockRanges #-}" - , "allBlockRanges :: [(Int, Int)]" - , "allBlockRanges =" - , " " <> show ranges' - , "" , "-- | Character block, if defined." , "--" , "-- @since 0.3.1" diff --git a/unicode-data/lib/Unicode/Char.hs b/unicode-data/lib/Unicode/Char.hs index 3c633cdd..d213bf37 100644 --- a/unicode-data/lib/Unicode/Char.hs +++ b/unicode-data/lib/Unicode/Char.hs @@ -35,7 +35,7 @@ module Unicode.Char , module Unicode.Char.Identifiers , unicodeVersion - -- * Re-export + -- * Re-export from @base@ , ord , chr ) diff --git a/unicode-data/lib/Unicode/Char/Case.hs b/unicode-data/lib/Unicode/Char/Case.hs index 421918d0..c23b3719 100644 --- a/unicode-data/lib/Unicode/Char/Case.hs +++ b/unicode-data/lib/Unicode/Char/Case.hs @@ -9,6 +9,12 @@ -- -- Case and case mapping related functions. -- +-- This module provides /full/ predicates and mappings that are /not/ compatible +-- with those in "Data.Char", which rely on simple properties. +-- See "Unicode.Char.Case.Compat" for a drop-in replacement of the functions in +-- "Data.Char". +-- + module Unicode.Char.Case ( -- * Predicates isLowerCase @@ -48,14 +54,20 @@ import qualified Unicode.Internal.Char.SpecialCasing.LowerCaseMapping as C import qualified Unicode.Internal.Char.SpecialCasing.TitleCaseMapping as C import qualified Unicode.Internal.Char.SpecialCasing.UpperCaseMapping as C --- | Returns 'True' for lower-case letters. +-- | Returns 'True' for lower-case characters. +-- +-- It uses the character property +-- . -- -- @since 0.3.0 {-# INLINE isLowerCase #-} isLowerCase :: Char -> Bool isLowerCase = P.isLowercase --- | Returns 'True' for lower-case letters. +-- | Returns 'True' for lower-case characters. +-- +-- It uses the character property +-- . -- -- @since 0.1.0 {-# INLINE isLower #-} @@ -63,14 +75,28 @@ isLowerCase = P.isLowercase isLower :: Char -> Bool isLower = P.isLowercase --- | Returns 'True' for upper-case letters. +-- | Returns 'True' for upper-case characters. +-- +-- It uses the character property +-- . +-- +-- Note: it does /not/ match title-cased letters. Those are matched using: +-- @'Unicode.Char.General.generalCategory' c == +-- 'Unicode.Char.General.TitlecaseLetter'@. -- -- @since 0.3.0 {-# INLINE isUpperCase #-} isUpperCase :: Char -> Bool isUpperCase = P.isUppercase --- | Returns 'True' for upper-case letters. +-- | Returns 'True' for upper-case characters. +-- +-- It uses the character property +-- . +-- +-- Note: it does /not/ match title-cased letters. Those are matched using: +-- @'Unicode.Char.General.generalCategory' c == +-- 'Unicode.Char.General.TitlecaseLetter'@. -- -- @since 0.1.0 {-# INLINE isUpper #-} diff --git a/unicode-data/lib/Unicode/Char/Case/Compat.hs b/unicode-data/lib/Unicode/Char/Case/Compat.hs index f17c3011..74666c15 100644 --- a/unicode-data/lib/Unicode/Char/Case/Compat.hs +++ b/unicode-data/lib/Unicode/Char/Case/Compat.hs @@ -5,7 +5,7 @@ -- Maintainer : streamly@composewell.com -- Stability : experimental -- --- Compatibility module for case and case mapping related functions.. +-- Compatibility module for case and case mapping related functions. -- -- The functions of this module are drop-in replacement for those in "Data.Char". -- They are similar but not identical to some functions in "Unicode.Char.Case", @@ -30,6 +30,11 @@ import qualified Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping as C -- Title case is used by a small number of letter ligatures like the -- single-character form of /Lj/. -- +-- It matches characters with general category 'UppercaseLetter' and +-- 'TitlecaseLetter'. +-- +-- See: 'Unicode.Char.Case.isUpperCase' for the /full upper/ case predicate. +-- -- prop> isUpper c == Data.Char.isUpper c -- -- @since 0.3.0 @@ -41,6 +46,10 @@ isUpper c = case generalCategory c of -- | Selects lower-case alphabetic Unicode characters (letters). -- +-- It matches characters with general category 'LowercaseLetter'. +-- +-- See: 'Unicode.Char.Case.isLowerCase' for the /full/ lower case predicate. +-- -- prop> isLower c == Data.Char.isLower c -- -- @since 0.3.0 diff --git a/unicode-data/lib/Unicode/Char/General/Blocks.hs b/unicode-data/lib/Unicode/Char/General/Blocks.hs index eb77e7db..51ec7855 100644 --- a/unicode-data/lib/Unicode/Char/General/Blocks.hs +++ b/unicode-data/lib/Unicode/Char/General/Blocks.hs @@ -1,5 +1,5 @@ -- | --- Module : Unicode.Char.General +-- Module : Unicode.Char.General.Blocks -- Copyright : (c) 2020 Composewell Technologies and Contributors -- License : Apache-2.0 -- Maintainer : streamly@composewell.com @@ -14,7 +14,6 @@ module Unicode.Char.General.Blocks , B.BlockDefinition(..) , block , B.blockDefinition - , allBlockRanges ) where @@ -27,11 +26,3 @@ import qualified Unicode.Internal.Char.Blocks as B {-# INLINE block #-} block :: Char -> Maybe B.Block block = fmap toEnum . B.block - --- | All the [block](https://www.unicode.org/glossary/#block) ranges, --- in ascending order. --- --- @since 0.3.1 -{-# INLINE allBlockRanges #-} -allBlockRanges :: [(Int, Int)] -allBlockRanges = B.allBlockRanges diff --git a/unicode-data/lib/Unicode/Char/Numeric.hs b/unicode-data/lib/Unicode/Char/Numeric.hs index 02186334..770337fa 100644 --- a/unicode-data/lib/Unicode/Char/Numeric.hs +++ b/unicode-data/lib/Unicode/Char/Numeric.hs @@ -10,14 +10,14 @@ -- @since 0.3.0 module Unicode.Char.Numeric ( -- * Predicates - isNumber - , isNumeric + isNumeric + , isNumber -- * Numeric values , numericValue , integerValue - -- * Re-export + -- * Re-export from @base@ , isDigit , isOctDigit , isHexDigit diff --git a/unicode-data/lib/Unicode/Char/Numeric/Compat.hs b/unicode-data/lib/Unicode/Char/Numeric/Compat.hs index f3074d73..22f893b5 100644 --- a/unicode-data/lib/Unicode/Char/Numeric/Compat.hs +++ b/unicode-data/lib/Unicode/Char/Numeric/Compat.hs @@ -35,6 +35,7 @@ import Unicode.Char.General (GeneralCategory(..), generalCategory) -- prop> isNumber c == Data.Char.isNumber c -- -- @since 0.3.1 moved to Compat module. +-- -- @since 0.3.0 isNumber :: Char -> Bool isNumber c = case generalCategory c of diff --git a/unicode-data/lib/Unicode/Internal/Char/Blocks.hs b/unicode-data/lib/Unicode/Internal/Char/Blocks.hs index 77441062..a128546f 100644 --- a/unicode-data/lib/Unicode/Internal/Char/Blocks.hs +++ b/unicode-data/lib/Unicode/Internal/Char/Blocks.hs @@ -10,7 +10,7 @@ {-# OPTIONS_HADDOCK hide #-} module Unicode.Internal.Char.Blocks -(Block(..), BlockDefinition(..), block, blockDefinition, allBlockRanges) +(Block(..), BlockDefinition(..), block, blockDefinition) where import Data.Ix (Ix) @@ -678,14 +678,6 @@ blockDefinition b = case b of SupplementaryPrivateUseAreaA -> BlockDefinition (0xf0000, 0xfffff) "Supplementary Private Use Area-A" SupplementaryPrivateUseAreaB -> BlockDefinition (0x100000, 0x10ffff) "Supplementary Private Use Area-B" --- | All the block ranges, in ascending order. --- --- @since 0.3.1 -{-# INLINE allBlockRanges #-} -allBlockRanges :: [(Int, Int)] -allBlockRanges = - [(0,127),(128,255),(256,383),(384,591),(592,687),(688,767),(768,879),(880,1023),(1024,1279),(1280,1327),(1328,1423),(1424,1535),(1536,1791),(1792,1871),(1872,1919),(1920,1983),(1984,2047),(2048,2111),(2112,2143),(2144,2159),(2160,2207),(2208,2303),(2304,2431),(2432,2559),(2560,2687),(2688,2815),(2816,2943),(2944,3071),(3072,3199),(3200,3327),(3328,3455),(3456,3583),(3584,3711),(3712,3839),(3840,4095),(4096,4255),(4256,4351),(4352,4607),(4608,4991),(4992,5023),(5024,5119),(5120,5759),(5760,5791),(5792,5887),(5888,5919),(5920,5951),(5952,5983),(5984,6015),(6016,6143),(6144,6319),(6320,6399),(6400,6479),(6480,6527),(6528,6623),(6624,6655),(6656,6687),(6688,6831),(6832,6911),(6912,7039),(7040,7103),(7104,7167),(7168,7247),(7248,7295),(7296,7311),(7312,7359),(7360,7375),(7376,7423),(7424,7551),(7552,7615),(7616,7679),(7680,7935),(7936,8191),(8192,8303),(8304,8351),(8352,8399),(8400,8447),(8448,8527),(8528,8591),(8592,8703),(8704,8959),(8960,9215),(9216,9279),(9280,9311),(9312,9471),(9472,9599),(9600,9631),(9632,9727),(9728,9983),(9984,10175),(10176,10223),(10224,10239),(10240,10495),(10496,10623),(10624,10751),(10752,11007),(11008,11263),(11264,11359),(11360,11391),(11392,11519),(11520,11567),(11568,11647),(11648,11743),(11744,11775),(11776,11903),(11904,12031),(12032,12255),(12272,12287),(12288,12351),(12352,12447),(12448,12543),(12544,12591),(12592,12687),(12688,12703),(12704,12735),(12736,12783),(12784,12799),(12800,13055),(13056,13311),(13312,19903),(19904,19967),(19968,40959),(40960,42127),(42128,42191),(42192,42239),(42240,42559),(42560,42655),(42656,42751),(42752,42783),(42784,43007),(43008,43055),(43056,43071),(43072,43135),(43136,43231),(43232,43263),(43264,43311),(43312,43359),(43360,43391),(43392,43487),(43488,43519),(43520,43615),(43616,43647),(43648,43743),(43744,43775),(43776,43823),(43824,43887),(43888,43967),(43968,44031),(44032,55215),(55216,55295),(55296,56191),(56192,56319),(56320,57343),(57344,63743),(63744,64255),(64256,64335),(64336,65023),(65024,65039),(65040,65055),(65056,65071),(65072,65103),(65104,65135),(65136,65279),(65280,65519),(65520,65535),(65536,65663),(65664,65791),(65792,65855),(65856,65935),(65936,65999),(66000,66047),(66176,66207),(66208,66271),(66272,66303),(66304,66351),(66352,66383),(66384,66431),(66432,66463),(66464,66527),(66560,66639),(66640,66687),(66688,66735),(66736,66815),(66816,66863),(66864,66927),(66928,67007),(67072,67455),(67456,67519),(67584,67647),(67648,67679),(67680,67711),(67712,67759),(67808,67839),(67840,67871),(67872,67903),(67968,67999),(68000,68095),(68096,68191),(68192,68223),(68224,68255),(68288,68351),(68352,68415),(68416,68447),(68448,68479),(68480,68527),(68608,68687),(68736,68863),(68864,68927),(69216,69247),(69248,69311),(69376,69423),(69424,69487),(69488,69551),(69552,69599),(69600,69631),(69632,69759),(69760,69839),(69840,69887),(69888,69967),(69968,70015),(70016,70111),(70112,70143),(70144,70223),(70272,70319),(70320,70399),(70400,70527),(70656,70783),(70784,70879),(71040,71167),(71168,71263),(71264,71295),(71296,71375),(71424,71503),(71680,71759),(71840,71935),(71936,72031),(72096,72191),(72192,72271),(72272,72367),(72368,72383),(72384,72447),(72704,72815),(72816,72895),(72960,73055),(73056,73135),(73440,73471),(73648,73663),(73664,73727),(73728,74751),(74752,74879),(74880,75087),(77712,77823),(77824,78895),(78896,78911),(82944,83583),(92160,92735),(92736,92783),(92784,92879),(92880,92927),(92928,93071),(93760,93855),(93952,94111),(94176,94207),(94208,100351),(100352,101119),(101120,101631),(101632,101759),(110576,110591),(110592,110847),(110848,110895),(110896,110959),(110960,111359),(113664,113823),(113824,113839),(118528,118735),(118784,119039),(119040,119295),(119296,119375),(119520,119551),(119552,119647),(119648,119679),(119808,120831),(120832,121519),(122624,122879),(122880,122927),(123136,123215),(123536,123583),(123584,123647),(124896,124927),(124928,125151),(125184,125279),(126064,126143),(126208,126287),(126464,126719),(126976,127023),(127024,127135),(127136,127231),(127232,127487),(127488,127743),(127744,128511),(128512,128591),(128592,128639),(128640,128767),(128768,128895),(128896,129023),(129024,129279),(129280,129535),(129536,129647),(129648,129791),(129792,130047),(131072,173791),(173824,177983),(177984,178207),(178208,183983),(183984,191471),(194560,195103),(196608,201551),(917504,917631),(917760,917999),(983040,1048575),(1048576,1114111)] - -- | Character block, if defined. -- -- @since 0.3.1