Skip to content

Commit

Permalink
Add operator-to-precedence table
Browse files Browse the repository at this point in the history
version.py: make sure black doesn't reformat
characters.py: tolerate an empty characters.json for now.
  • Loading branch information
rocky committed Jun 26, 2021
1 parent 42c0179 commit 411abf1
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 56 deletions.
26 changes: 15 additions & 11 deletions mathics_scanner/characters.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
ROOT_DIR = pkg_resources.resource_filename("mathics_scanner", "")

# Load the conversion tables from disk
with open(os.path.join(ROOT_DIR, "data", "characters.json"), "r") as f:
_data = ujson.load(f)
characters_path = os.path.join(ROOT_DIR, "data", "characters.json")
if os.path.exists(characters_path):
with open(characters_path, "r") as f:
_data = ujson.load(f)
else:
_data = {}

# Character ranges of letters
_letters = "a-zA-Z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u0103\u0106\u0107\
Expand All @@ -33,25 +37,25 @@
\uf793-\uf79a\uf79c-\uf7a2\uf7a4-\uf7bd\uf800-\uf833\ufb01\ufb02"

# Character ranges of letterlikes
_letterlikes = _data["letterlikes"]
_letterlikes = _data.get("letterlikes", {})

# Conversion from WL to the fully qualified names
_wl_to_ascii = _data["wl-to-ascii-dict"]
_wl_to_ascii_re = re.compile(_data["wl-to-ascii-re"])
_wl_to_ascii = _data.get("wl-to-ascii-dict", {})
_wl_to_ascii_re = re.compile(_data.get("wl-to-ascii-re", ""))

# Conversion from WL to unicode
_wl_to_unicode = _data["wl-to-unicode-dict"]
_wl_to_unicode_re = re.compile(_data["wl-to-unicode-re"])
_wl_to_unicode = _data.get("wl-to-unicode-dict", {})
_wl_to_unicode_re = re.compile(_data.get("wl-to-unicode-re", ""))

# Conversion from unicode to WL
_unicode_to_wl = _data["unicode-to-wl-dict"]
_unicode_to_wl_re = re.compile(_data["unicode-to-wl-re"])
_unicode_to_wl = _data.get("unicode-to-wl-dict", {})
_unicode_to_wl_re = re.compile(_data.get("unicode-to-wl-re", ""))

# All supported named characters
named_characters = _data["named-characters"]
named_characters = _data.get("named-characters", {})

# ESC sequence aliases
aliased_characters = _data["aliased-characters"]
aliased_characters = _data.get("aliased-characters", {})


def replace_wl_with_plain_text(wl_input: str, use_unicode=True) -> str:
Expand Down
2 changes: 1 addition & 1 deletion mathics_scanner/data/characters.json

Large diffs are not rendered by default.

92 changes: 48 additions & 44 deletions mathics_scanner/generate/build_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import os.path as osp
from pathlib import Path

from mathics_scanner.version import __version__


def get_srcdir():
filename = osp.normcase(osp.dirname(osp.abspath(__file__)))
Expand All @@ -21,16 +23,6 @@ def read(*rnames):
return open(osp.join(get_srcdir(), *rnames)).read()


# stores __version__ in the current namespace
exec(
compile(
open(Path(get_srcdir()) / ".." / "version.py").read(),
"mathics_scanner/version.py",
"exec",
)
)


def re_from_keys(d: dict) -> str:
"""
Takes dictionary whose keys are all strings and returns a regex that
Expand Down Expand Up @@ -73,8 +65,9 @@ def get_plain_text(char_name: str, char_data: dict, use_unicode: bool) -> str:

def compile_tables(data: dict) -> dict:
"""
Compiles the general table into the tables used internally by the library
for fast access
Compiles the general table into the tables used internally by the library.
This gives clients fast access to the information they need.
"""

# Multiple entries in the YAML table are redundant in the following sense:
Expand All @@ -91,26 +84,26 @@ def compile_tables(data: dict) -> dict:
# characters that have a unicode inverse are included in
# `unicode_to_wl_dict`

# Conversion from WL to the fully qualified names
wl_to_ascii_dict = {
v["wl-unicode"]: get_plain_text(k, v, use_unicode=False)
# ESC sequence aliases dictionary entry
aliased_characters = {
v["esc-alias"]: v["wl-unicode"] for v in data.values() if "esc-alias" in v
}

# operator-to-precedence dictionary entry
operator_to_precedence = {
v["operator-name"]: v["precedence"]
for k, v in data.items()
if "wl-unicode" in v
if "operator-name" in v and "precedence" in v
}
wl_to_ascii_dict = {k: v for k, v in wl_to_ascii_dict.items() if k != v}
wl_to_ascii_re = re_from_keys(wl_to_ascii_dict)

# Conversion from wl to unicode
# We filter the dictionary after it's first created to remove redundant entries
wl_to_unicode_dict = {
v["wl-unicode"]: get_plain_text(k, v, use_unicode=True)
# operator-to-unicode dictionary entry
operator_to_unicode = {
v["operator-name"]: v["unicode-equivalent"]
for k, v in data.items()
if "wl-unicode" in v
if "operator-name" in v and "unicode-equivalent" in v
}
wl_to_unicode_dict = {k: v for k, v in wl_to_unicode_dict.items() if k != v}
wl_to_unicode_re = re_from_keys(wl_to_unicode_dict)

# Conversion from unicode to wl
# Conversion from unicode to wl dictionary entry.
# We filter the dictionary after it's first created to remove redundant entries
unicode_to_wl_dict = {
v["unicode-equivalent"]: v["wl-unicode"]
Expand All @@ -120,32 +113,20 @@ def compile_tables(data: dict) -> dict:
unicode_to_wl_dict = {k: v for k, v in unicode_to_wl_dict.items() if k != v}
unicode_to_wl_re = re_from_keys(unicode_to_wl_dict)

# Unicode string containing all letterlikes values
# Unicode string containing all letterlikes values dictionary entry
letterlikes = "".join(v["wl-unicode"] for v in data.values() if v["is-letter-like"])

# All supported named characters
# All supported named characters dictionary entry
named_characters = {
k: v["wl-unicode"] for k, v in data.items() if "wl-unicode" in v
}

# Operators with ASCII sequences
# Operators with ASCII sequences list entry
ascii_operators = sorted(
[v["ascii"] for v in data.values() if "operator-name" in v and "ascii" in v]
)

# ESC sequence aliases
aliased_characters = {
v["esc-alias"]: v["wl-unicode"] for v in data.values() if "esc-alias" in v
}

# operator-to-unicode dictionary
operator_to_unicode = {
v["operator-name"]: v["unicode-equivalent"]
for k, v in data.items()
if "operator-name" in v and "unicode-equivalent" in v
}

# ESC sequence aliases
# unicode-equivalent list entry
unicode_operators = sorted(
[
v["unicode-equivalent"]
Expand All @@ -154,19 +135,40 @@ def compile_tables(data: dict) -> dict:
]
)

# operator-to-unicode dictionary
# unicode-to-operator dictionary entry
unicode_to_operator = {
v["unicode-equivalent"]: v["operator-name"]
for k, v in data.items()
if "operator-name" in v and "unicode-equivalent" in v
}
# Conversion from WL to the fully qualified names dictionary entry
wl_to_ascii_dict = {
v["wl-unicode"]: get_plain_text(k, v, use_unicode=False)
for k, v in data.items()
if "wl-unicode" in v
}
wl_to_ascii_dict = {k: v for k, v in wl_to_ascii_dict.items() if k != v}
wl_to_ascii_re = re_from_keys(wl_to_ascii_dict)

# Conversion from wl to unicode dictionary entry
# We filter the dictionary after it's first created to remove redundant entries
wl_to_unicode_dict = {
v["wl-unicode"]: get_plain_text(k, v, use_unicode=True)
for k, v in data.items()
if "wl-unicode" in v
}
wl_to_unicode_dict = {k: v for k, v in wl_to_unicode_dict.items() if k != v}
wl_to_unicode_re = re_from_keys(wl_to_unicode_dict)

return {
"aliased-characters": aliased_characters,
"ascii-operators": ascii_operators,
"letterlikes": letterlikes,
"named-characters": named_characters,
"operator-to-precedence": operator_to_precedence,
"operator-to-unicode": operator_to_unicode,
"unicode-operators": unicode_operators,
"unicode-equivalent": unicode_operators,
"unicode-operators": unicode_to_operator,
"unicode-to-operator": unicode_to_operator,
"unicode-to-wl-dict": unicode_to_wl_dict,
"unicode-to-wl-re": unicode_to_wl_re,
Expand All @@ -184,7 +186,9 @@ def compile_tables(data: dict) -> dict:
"ascii-operators",
"letterlikes",
"named-characters",
"operator-to-precedence",
"operator-to-unicode",
"unicode-equivalent",
"unicode-operators",
"unicode-to-operator",
"unicode-to-wl-dict",
Expand Down
1 change: 1 addition & 0 deletions mathics_scanner/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
# This file is suitable for sourcing inside POSIX shell as
# well as importing into Python. That's why there is no
# space around "=" below.
# fmt: off
__version__ = "1.2.1.dev0" # noqa

0 comments on commit 411abf1

Please sign in to comment.