From 26db10b5abecab415b9e3cef0bd3aeac1d3e38c9 Mon Sep 17 00:00:00 2001 From: Cosimo Lupo Date: Fri, 17 Nov 2017 19:10:35 +0000 Subject: [PATCH] [MetaTools] add a script to download and parse UCD data files The script currently only parses the Scripts.txt file and generates a new python module `fontTools.unicodedata.scripts` containing a `SCRIPT_RANGES` list of tuples, each containing the range and the corresponding script name. --- MetaTools/buildUCD.py | 114 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100755 MetaTools/buildUCD.py diff --git a/MetaTools/buildUCD.py b/MetaTools/buildUCD.py new file mode 100755 index 000000000..994bd1e02 --- /dev/null +++ b/MetaTools/buildUCD.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +""" +Tools to parse data files from the Unicode Character Database. +""" + +from __future__ import print_function, absolute_import, division +from __future__ import unicode_literals +from fontTools.misc.py23 import * + +try: + from urllib.request import urlopen +except ImportError: + from urllib2 import urlopen +from contextlib import closing, contextmanager +import re +from codecs import iterdecode +from os.path import abspath, dirname, join as pjoin, pardir, sep + + +UNIDATA_URL = "https://unicode.org/Public/UNIDATA/" + +# by default save output files to ../Lib/fontTools/unicodedata/ +UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir, + "Lib", "fontTools", "unicodedata") + sep + +SRC_ENCODING = "# -*- coding: utf-8 -*-\n" + +NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n" + + +@contextmanager +def open_unidata_file(filename): + """Open a text file from https://unicode.org/Public/UNIDATA/""" + url = UNIDATA_URL + filename + with closing(urlopen(url)) as response: + yield iterdecode(response, encoding="utf-8") + + +def parse_unidata_header(infile): + """Read the top header of data files, until the first line + that does not start with '#'. + """ + header = [] + line = next(infile) + while line.startswith("#"): + header.append(line) + line = next(infile) + return "".join(header) + + +def parse_range_properties(infile): + """Parse a Unicode data file containing a column with one character or + a range of characters, and another column containing a property value + separated by a semicolon. Comments after '#' are ignored. + """ + ranges = [] + line_regex = re.compile( + r"^" + r"([0-9A-F]{4,6})" # first character code + r"(?:\.\.([0-9A-F]{4,6}))?" # optional second character code + r"\s*;\s*" + r"([^#]+)") # everything up to the potential comment + for line in infile: + match = line_regex.match(line) + if not match: + continue + + first, last, data = match.groups() + if last is None: + last = first + + first = int(first, 16) + last = int(last, 16) + data = data.rstrip() + + ranges.append((first, last, data)) + + return ranges + + +def build_scripts(output_path=None): + """Fetch "Scripts.txt" data file, parse the script ranges and write + them as a list of Python tuples to 'fontTools.unicodedata.scripts'. + """ + filename = "Scripts.txt" + with open_unidata_file(filename) as f: + header = parse_unidata_header(f) + script_ranges = parse_range_properties(f) + + if not output_path: + output_path = UNIDATA_PATH + "scripts.py" + + with open(output_path, "w", encoding="utf-8") as f: + f.write(SRC_ENCODING) + f.write("#\n") + f.write(NOTICE) + f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) + f.write("#\n") + f.write(header+"\n\n") + + f.write("SCRIPT_RANGES = [\n") + for first, last, script_name in sorted(script_ranges): + f.write(" (0x{:X}, 0x{:X}, '{}'),\n".format( + first, last, tostr(script_name))) + f.write("]\n") + + +def main(): + build_scripts() + + +if __name__ == "__main__": + import sys + sys.exit(main())