[MetaTools] add a script to download and parse UCD data files

The script currently only parses the Scripts.txt file and generates a new Python module `fontTools.unicodedata.scripts` containing a `SCRIPT_RANGES` list of tuples, each containing the first and last code point of a range and the corresponding script name.
2017-11-17 19:10:35 +00:00 · 2017-11-17 19:10:35 +00:00 · 26db10b5ab
commit 26db10b5ab
parent fdab63f0b6
1 changed files with 114 additions and 0 deletions
--- a/MetaTools/buildUCD.py
+++ b/MetaTools/buildUCD.py
@ -0,0 +1,114 @@
+#!/usr/bin/env python
+"""
+Tools to parse data files from the Unicode Character Database.
+"""
+
+from __future__ import print_function, absolute_import, division
+from __future__ import unicode_literals
+from fontTools.misc.py23 import *
+
+try:
+    from urllib.request import urlopen
+except ImportError:
+    from urllib2 import urlopen
+from contextlib import closing, contextmanager
+import re
+from codecs import iterdecode
+from os.path import abspath, dirname, join as pjoin, pardir, sep
+
+
+# Base URL of the Unicode Character Database data files; a file name such
+# as "Scripts.txt" is appended to it directly to form the download URL.
+UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
+
+# by default save output files to ../Lib/fontTools/unicodedata/
+# (the trailing path separator lets a file name be appended directly)
+UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
+                     "Lib", "fontTools", "unicodedata") + sep
+
+# Source-encoding declaration written as the first line of generated modules.
+SRC_ENCODING = "# -*- coding: utf-8 -*-\n"
+
+# Comment line written to generated modules to flag them as auto-generated.
+NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"
+
+
+@contextmanager
+def open_unidata_file(filename):
+    """Context manager giving access to a file below UNIDATA_URL.
+
+    The URL is formed by appending `filename` to UNIDATA_URL. The value
+    yielded is an iterator that decodes the HTTP response as UTF-8 text.
+    The response is closed when the 'with' block is left, whether
+    normally or through an exception.
+    """
+    response = urlopen(UNIDATA_URL + filename)
+    try:
+        yield iterdecode(response, encoding="utf-8")
+    finally:
+        # always release the connection, mirroring contextlib.closing()
+        response.close()
+
+
+def parse_unidata_header(infile):
+    """Read the top header of data files, until the first line
+    that does not start with '#'.
+
+    `infile` is an iterable of text lines. When it is an iterator, the
+    first line that does not start with '#' is consumed from it but is
+    not part of the result, so a following parser continues with the
+    line after it.
+
+    Returns the header lines joined into a single string. If the input
+    is empty, or runs out while still inside the header, the lines
+    collected so far are returned (possibly the empty string) instead
+    of letting StopIteration escape to the caller.
+    """
+    header = []
+    for line in infile:
+        if not line.startswith("#"):
+            # end of the header; this terminating line is dropped
+            break
+        header.append(line)
+    return "".join(header)
+
+
+def parse_range_properties(infile):
+    """Extract (first, last, value) records from a Unicode data file.
+
+    Each relevant line holds either a single hexadecimal character code
+    or a 'first..last' range of codes, followed by a semicolon and a
+    property value. Anything from a '#' onwards is treated as a comment
+    and dropped, and lines that do not have this shape are skipped.
+
+    Returns a list of tuples of two integers and the property value with
+    trailing whitespace removed; for a single code, first equals last.
+    """
+    pattern = re.compile(
+        r"^"
+        r"([0-9A-F]{4,6})"  # first character code
+        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
+        r"\s*;\s*"
+        r"([^#]+)")  # everything up to the potential comment
+
+    records = []
+    # filter(None, ...) discards the lines the pattern did not match
+    for found in filter(None, map(pattern.match, infile)):
+        start_hex, end_hex, value = found.groups()
+        start = int(start_hex, 16)
+        # a line without '..' describes a single character
+        end = start if end_hex is None else int(end_hex, 16)
+        records.append((start, end, value.rstrip()))
+
+    return records
+
+
+def build_scripts(output_path=None):
+    """Generate the script-ranges module from the UCD "Scripts.txt" file.
+
+    Downloads "Scripts.txt", reads its leading comment header and its
+    (first, last, script name) ranges, then writes a Python source file
+    defining ``SCRIPT_RANGES`` as a sorted list of tuples. The header of
+    the data file is copied into the generated module as comments.
+
+    When `output_path` is None or empty, the module is written to
+    "scripts.py" inside UNIDATA_PATH.
+    """
+    filename = "Scripts.txt"
+    destination = output_path or (UNIDATA_PATH + "scripts.py")
+
+    with open_unidata_file(filename) as lines:
+        # the header has to be read first: the range parser then carries
+        # on from wherever the header reader stopped
+        header = parse_unidata_header(lines)
+        script_ranges = parse_range_properties(lines)
+    script_ranges.sort()
+
+    preamble = [
+        SRC_ENCODING,
+        "#\n",
+        NOTICE,
+        "# Source: {}{}\n".format(UNIDATA_URL, filename),
+        "#\n",
+        header+"\n\n",
+    ]
+
+    with open(destination, "w", encoding="utf-8") as out:
+        for chunk in preamble:
+            out.write(chunk)
+
+        out.write("SCRIPT_RANGES = [\n")
+        for first, last, script_name in script_ranges:
+            entry = "    (0x{:X}, 0x{:X}, '{}'),\n".format(
+                first, last, tostr(script_name))
+            out.write(entry)
+        out.write("]\n")
+
+
+def main():
+    """Script entry point: build the generated data modules.
+
+    Currently this only calls build_scripts() with its default output
+    path. Nothing is returned.
+    """
+    build_scripts()
+
+
+if __name__ == "__main__":
+    import sys
+    # main() has no return statement, so sys.exit() receives None and the
+    # process exits with status 0 unless an exception propagates
+    sys.exit(main())