Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce General Category blob #124

Merged
merged 8 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/haskell.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ jobs:
- name: 8.0.2
ghc_version: 8.0.2
runner: ubuntu-latest
cabal_version: 3.2
cabal_version: latest
pack_options: DISABLE_TEST=y
ignore_error: false

Expand Down
4 changes: 4 additions & 0 deletions experimental/icu/cbits/icu.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ int32_t __hs_u_charName( UChar32 codepoint
void __hs_u_charAge( UChar32 c, UVersionInfo versionArray ) {
u_charAge(c, versionArray);
}

int8_t __hs_u_charType(UChar32 c) {
return u_charType(c);
}
7 changes: 4 additions & 3 deletions experimental/icu/cbits/icu.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@ int32_t __hs_u_charName( UChar32 codepoint
, UCharNameChoice nameChoice
, char * buffer
, int32_t bufferLength );
static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME;
static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS;

// typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
void __hs_u_charAge( UChar32 c, UVersionInfo versionArray );

static const int __hs_U_MAX_VERSION_LENGTH = U_MAX_VERSION_LENGTH;
static const int __hs_U_UNICODE_CHAR_NAME = U_UNICODE_CHAR_NAME;
static const int __hs_U_CHAR_NAME_ALIAS = U_CHAR_NAME_ALIAS;

int8_t __hs_u_charType(UChar32 c);

#endif
2 changes: 2 additions & 0 deletions experimental/icu/icu.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,5 @@ library
extra-libraries: icuuc
pkgconfig-depends:
icu-uc >= 72.1
build-tool-depends:
c2hs:c2hs
136 changes: 136 additions & 0 deletions experimental/icu/lib/ICU/Char.chs
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
-- |
-- Module : ICU.Char
-- Copyright : (c) 2023 Pierre Le Marre
-- License : Apache-2.0
-- Maintainer : [email protected]
-- Stability : experimental
--
-- Unicode character general properties
--
-- @since 0.3.0

module ICU.Char
( unicodeVersion
, charAge
, UGeneralCategory(..)
, toGeneralCategory
, charType
) where

#include <unicode/uchar.h>

import Data.Char (ord)
import qualified Data.Char as Char
import Data.Int (Int8)
import Data.Version (Version, makeVersion)
import Data.Word (Word32)
import Foreign (Ptr)
import Foreign.C (CInt)
import Foreign.Marshal.Array (allocaArray, peekArray)
import System.IO.Unsafe (unsafePerformIO)

type UChar32 = Word32

foreign import capi "icu.h value __hs_U_MAX_VERSION_LENGTH" maxVersionLength :: Int

foreign import ccall unsafe "icu.h __hs_u_getUnicodeVersion" u_getUnicodeVersion
:: Ptr Int8 -> IO ()

-- | ICU Unicode version
unicodeVersion :: Version
unicodeVersion
= makeVersion
. fmap fromIntegral
. unsafePerformIO
$ allocaArray
maxVersionLength
(\ptr -> u_getUnicodeVersion ptr *> peekArray maxVersionLength ptr)

foreign import ccall unsafe "icu.h __hs_u_charAge" u_charAge
:: UChar32 -> Ptr Int8 -> IO ()

-- | Character age
charAge :: Char -> Version
charAge c
= makeVersion
. fmap fromIntegral
. unsafePerformIO
$ allocaArray
maxVersionLength
(\ptr -> u_charAge cp ptr *> peekArray maxVersionLength ptr)
where
cp = fromIntegral (ord c)

foreign import ccall safe "icu.h __hs_u_charType" u_charType
:: UChar32 -> Int8

{#enum define UGeneralCategory {
U_UNASSIGNED as Unassigned,
U_UPPERCASE_LETTER as UppercaseLetter,
U_LOWERCASE_LETTER as LowercaseLetter,
U_TITLECASE_LETTER as TitlecaseLetter,
U_MODIFIER_LETTER as ModifierLetter,
U_OTHER_LETTER as OtherLetter,
U_NON_SPACING_MARK as NonSpacingMark,
U_ENCLOSING_MARK as EnclosingMark,
U_COMBINING_SPACING_MARK as CombiningSpacingMark,
U_DECIMAL_DIGIT_NUMBER as DecimalDigitNumber,
U_LETTER_NUMBER as LetterNumber,
U_OTHER_NUMBER as OtherNumber,
U_SPACE_SEPARATOR as SpaceSeparator,
U_LINE_SEPARATOR as LineSeparator,
U_PARAGRAPH_SEPARATOR as ParagraphSeparator,
U_CONTROL_CHAR as ControlChar,
U_FORMAT_CHAR as FormatChar,
U_PRIVATE_USE_CHAR as PrivateUseChar,
U_SURROGATE as Surrogate,
U_DASH_PUNCTUATION as DashPunctuation,
U_START_PUNCTUATION as StartPunctuation,
U_END_PUNCTUATION as EndPunctuation,
U_CONNECTOR_PUNCTUATION as ConnectorPunctuation,
U_OTHER_PUNCTUATION as OtherPunctuation,
U_MATH_SYMBOL as MathSymbol,
U_CURRENCY_SYMBOL as CurrencySymbol,
U_MODIFIER_SYMBOL as ModifierSymbol,
U_OTHER_SYMBOL as OtherSymbol,
U_INITIAL_PUNCTUATION as InitialPunctuation,
U_FINAL_PUNCTUATION as FinalPunctuation
}
deriving (Bounded, Eq, Ord, Show) #}

-- | General category
charType :: Char -> UGeneralCategory
charType = toEnum . fromIntegral . u_charType . fromIntegral . ord

toGeneralCategory :: UGeneralCategory -> Char.GeneralCategory
toGeneralCategory = \case
Unassigned -> Char.NotAssigned
UppercaseLetter -> Char.UppercaseLetter
LowercaseLetter -> Char.LowercaseLetter
TitlecaseLetter -> Char.TitlecaseLetter
ModifierLetter -> Char.ModifierLetter
OtherLetter -> Char.OtherLetter
NonSpacingMark -> Char.NonSpacingMark
EnclosingMark -> Char.EnclosingMark
CombiningSpacingMark -> Char.SpacingCombiningMark
DecimalDigitNumber -> Char.DecimalNumber
LetterNumber -> Char.LetterNumber
OtherNumber -> Char.OtherNumber
SpaceSeparator -> Char.Space
LineSeparator -> Char.LineSeparator
ParagraphSeparator -> Char.ParagraphSeparator
ControlChar -> Char.Control
FormatChar -> Char.Format
PrivateUseChar -> Char.PrivateUse
Surrogate -> Char.Surrogate
DashPunctuation -> Char.DashPunctuation
StartPunctuation -> Char.OpenPunctuation
EndPunctuation -> Char.ClosePunctuation
ConnectorPunctuation -> Char.ConnectorPunctuation
OtherPunctuation -> Char.OtherPunctuation
MathSymbol -> Char.MathSymbol
CurrencySymbol -> Char.CurrencySymbol
ModifierSymbol -> Char.ModifierSymbol
OtherSymbol -> Char.OtherSymbol
InitialPunctuation -> Char.InitialQuote
FinalPunctuation -> Char.FinalQuote
55 changes: 0 additions & 55 deletions experimental/icu/lib/ICU/Char.hsc

This file was deleted.

67 changes: 46 additions & 21 deletions ucd.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/sh
# shellcheck disable=SC3043
# shellcheck disable=SC3043,SC3010,SC3030,SC3054

# When reproducing the Haskell files we want to to be sure that the files that
# we used to generate them earlier are exactly the same as the ones we are
Expand Down Expand Up @@ -47,8 +47,11 @@ download_file() {
local directory="data/$VERSION/$1"
local url="$2"
local pair="$3"
local file="$(echo "$pair" | cut -f1 -d':')"
local checksum="$(echo "$pair" | cut -f2 -d':')"
local file
local checksum

file="$(echo "$pair" | cut -f1 -d':')"
checksum="$(echo "$pair" | cut -f2 -d':')"

if test ! -e "$directory/$file"
then
Expand Down Expand Up @@ -79,24 +82,42 @@ download_files() {

# Generate the Haskell files.
run_generator() {
# Get remaining arguments to pass to Cabal and ucd2haskell.
# Split them on “--” and store in arrays to avoid issues with empty strings.
local cabal_options=()
local cabal_options_end=false
local ucd2haskell_opts=()
for opt in "$@"
do
if [ "$cabal_options_end" = true ]; then
ucd2haskell_opts+=("$opt")
elif [ "$opt" = "--" ]; then
cabal_options_end=true
else
cabal_options+=("$opt")
fi
done

# Compile and run ucd2haskell
cabal run --flag ucd2haskell ucd2haskell:ucd2haskell -- \
--input "./data/$VERSION" \
--output-core ./unicode-data/lib/ \
--output-names ./unicode-data-names/lib/ \
--output-scripts ./unicode-data-scripts/lib/ \
--output-security ./unicode-data-security/lib/ \
--core-prop Uppercase \
--core-prop Lowercase \
--core-prop Alphabetic \
--core-prop White_Space \
--core-prop ID_Start \
--core-prop ID_Continue \
--core-prop XID_Start \
--core-prop XID_Continue \
--core-prop Pattern_Syntax \
--core-prop Pattern_White_Space \
--unicode-version "$VERSION"
cabal run --flag ucd2haskell "${cabal_options[@]}" \
ucd2haskell:ucd2haskell -- \
--input "./data/$VERSION" \
--output-core ./unicode-data/lib/ \
--output-names ./unicode-data-names/lib/ \
--output-scripts ./unicode-data-scripts/lib/ \
--output-security ./unicode-data-security/lib/ \
--core-prop Uppercase \
--core-prop Lowercase \
--core-prop Alphabetic \
--core-prop White_Space \
--core-prop ID_Start \
--core-prop ID_Continue \
--core-prop XID_Start \
--core-prop XID_Continue \
--core-prop Pattern_Syntax \
--core-prop Pattern_White_Space \
--unicode-version "$VERSION" \
"${ucd2haskell_opts[@]}"
}

# Print help text
Expand All @@ -109,6 +130,10 @@ print_help() {
echo
echo "Example:"
echo "$ ./ucd.sh download && ./ucd.sh generate"
echo
echo "Further arguments will be passed to cabal."
echo "The following compiles ucd2haskell with '-O2' and then displays its help."
echo "$ ./ucd.sh generate -O2 -- --help"
}

# Main program
Expand All @@ -122,6 +147,6 @@ case $1 in
download)
download_files "ucd" "$UCD_URL" "$UCD_FILES";
download_files "security" "$SECURITY_URL" "$SECURITY_FILES";;
generate) run_generator;;
generate) run_generator "${@:2}";;
*) echo "Unknown argument"; print_help;;
esac
24 changes: 16 additions & 8 deletions ucd2haskell/exe/UCD2Haskell.hs
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ module Main where

import GHC.Generics (Generic)
import System.FilePath ((</>))
import WithCli (HasArguments(..), withCli)
import WithCli (HasArguments (..), Modifier (..), withCliModified)

import qualified UCD2Haskell.Generator.Core as Core
import qualified UCD2Haskell.Generator.Names as Names
import qualified UCD2Haskell.Generator.Scripts as Scripts
import qualified UCD2Haskell.Generator.Security as Security
import UCD2Haskell.Common (Version (..))
import UCD2Haskell.Generator (printCpuTime)

data CLIOptions =
CLIOptions
Expand All @@ -31,20 +32,27 @@ data CLIOptions =
-- ^ Path to `unicode-data-security` lib directory
, core_prop :: [String]
-- ^ Core properties to select
, patterns :: [String]
-- ^ Simple patterns to filter modules to generate.
-- Leave empty to generate all modules.
, unicode_version :: Version
-- ^ Unicode version
}
deriving (Show, Generic, HasArguments)

cliClient :: CLIOptions -> IO ()
cliClient opts
= Core.generateModules version (inDir "ucd") (output_core opts) (core_prop opts)
*> Names.generateModules version (inDir "ucd") (output_names opts)
*> Scripts.generateModules version (inDir "ucd") (output_scripts opts)
*> Security.generateModules version (inDir "security") (output_security opts)
where
cliClient opts = do
Core.generateModules version (inDir "ucd") (output_core opts) ps (core_prop opts)
Names.generateModules version (inDir "ucd") (output_names opts) ps
Scripts.generateModules version (inDir "ucd") (output_scripts opts) ps
Security.generateModules version (inDir "security") (output_security opts) ps
putChar '[' *> printCpuTime *> putStrLn "s] Finished"
where
version = unVersion (unicode_version opts)
inDir = (input opts </>)
ps = patterns opts

main :: IO ()
main = withCli cliClient
main = withCliModified
[ AddShortOption "patterns" 'p']
cliClient
Loading
Loading