[MetaTools] add a script to download and parse UCD data files
The script currently only parses the Scripts.txt file and generates a new python module `fontTools.unicodedata.scripts` containing a `SCRIPT_RANGES` list of tuples, each containing the range and the corresponding script name.
This commit is contained in:
parent
fdab63f0b6
commit
26db10b5ab
114
MetaTools/buildUCD.py
Executable file
114
MetaTools/buildUCD.py
Executable file
@ -0,0 +1,114 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Tools to parse data files from the Unicode Character Database.
|
||||
"""
|
||||
|
||||
from __future__ import print_function, absolute_import, division
|
||||
from __future__ import unicode_literals
|
||||
from fontTools.misc.py23 import *
|
||||
|
||||
try:
|
||||
from urllib.request import urlopen
|
||||
except ImportError:
|
||||
from urllib2 import urlopen
|
||||
from contextlib import closing, contextmanager
|
||||
import re
|
||||
from codecs import iterdecode
|
||||
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
||||
|
||||
|
||||
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
|
||||
|
||||
# by default save output files to ../Lib/fontTools/unicodedata/
|
||||
UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
|
||||
"Lib", "fontTools", "unicodedata") + sep
|
||||
|
||||
SRC_ENCODING = "# -*- coding: utf-8 -*-\n"
|
||||
|
||||
NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"
|
||||
|
||||
|
||||
@contextmanager
|
||||
def open_unidata_file(filename):
|
||||
"""Open a text file from https://unicode.org/Public/UNIDATA/"""
|
||||
url = UNIDATA_URL + filename
|
||||
with closing(urlopen(url)) as response:
|
||||
yield iterdecode(response, encoding="utf-8")
|
||||
|
||||
|
||||
def parse_unidata_header(infile):
|
||||
"""Read the top header of data files, until the first line
|
||||
that does not start with '#'.
|
||||
"""
|
||||
header = []
|
||||
line = next(infile)
|
||||
while line.startswith("#"):
|
||||
header.append(line)
|
||||
line = next(infile)
|
||||
return "".join(header)
|
||||
|
||||
|
||||
def parse_range_properties(infile):
|
||||
"""Parse a Unicode data file containing a column with one character or
|
||||
a range of characters, and another column containing a property value
|
||||
separated by a semicolon. Comments after '#' are ignored.
|
||||
"""
|
||||
ranges = []
|
||||
line_regex = re.compile(
|
||||
r"^"
|
||||
r"([0-9A-F]{4,6})" # first character code
|
||||
r"(?:\.\.([0-9A-F]{4,6}))?" # optional second character code
|
||||
r"\s*;\s*"
|
||||
r"([^#]+)") # everything up to the potential comment
|
||||
for line in infile:
|
||||
match = line_regex.match(line)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
first, last, data = match.groups()
|
||||
if last is None:
|
||||
last = first
|
||||
|
||||
first = int(first, 16)
|
||||
last = int(last, 16)
|
||||
data = data.rstrip()
|
||||
|
||||
ranges.append((first, last, data))
|
||||
|
||||
return ranges
|
||||
|
||||
|
||||
def build_scripts(output_path=None):
|
||||
"""Fetch "Scripts.txt" data file, parse the script ranges and write
|
||||
them as a list of Python tuples to 'fontTools.unicodedata.scripts'.
|
||||
"""
|
||||
filename = "Scripts.txt"
|
||||
with open_unidata_file(filename) as f:
|
||||
header = parse_unidata_header(f)
|
||||
script_ranges = parse_range_properties(f)
|
||||
|
||||
if not output_path:
|
||||
output_path = UNIDATA_PATH + "scripts.py"
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(SRC_ENCODING)
|
||||
f.write("#\n")
|
||||
f.write(NOTICE)
|
||||
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
|
||||
f.write("#\n")
|
||||
f.write(header+"\n\n")
|
||||
|
||||
f.write("SCRIPT_RANGES = [\n")
|
||||
for first, last, script_name in sorted(script_ranges):
|
||||
f.write(" (0x{:X}, 0x{:X}, '{}'),\n".format(
|
||||
first, last, tostr(script_name)))
|
||||
f.write("]\n")
|
||||
|
||||
|
||||
def main():
|
||||
build_scripts()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
sys.exit(main())
|
Loading…
x
Reference in New Issue
Block a user