diff --git a/MetaTools/buildUCD.py b/MetaTools/buildUCD.py index 994bd1e02..acd6cf187 100755 --- a/MetaTools/buildUCD.py +++ b/MetaTools/buildUCD.py @@ -14,6 +14,8 @@ except ImportError: from contextlib import closing, contextmanager import re from codecs import iterdecode +import logging +import os from os.path import abspath, dirname, join as pjoin, pardir, sep @@ -27,6 +29,10 @@ SRC_ENCODING = "# -*- coding: utf-8 -*-\n" NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n" +MAX_UNICODE = 0x10FFFF + +log = logging.getLogger() + @contextmanager def open_unidata_file(filename): @@ -48,10 +54,15 @@ def parse_unidata_header(infile): return "".join(header) -def parse_range_properties(infile): +def parse_range_properties(infile, default="Unknown"): """Parse a Unicode data file containing a column with one character or a range of characters, and another column containing a property value separated by a semicolon. Comments after '#' are ignored. + + If the ranges defined in the data file are not continuous, assign the + 'default' property to the unassigned codepoints. + + Return a list of (start, end, property_name) tuples. """ ranges = [] line_regex = re.compile( @@ -75,21 +86,66 @@ def parse_range_properties(infile): ranges.append((first, last, data)) - return ranges + ranges.sort() + + # fill the gaps between explicitly defined ranges + last_start, last_end = -1, -1 + full_ranges = [] + for start, end, name in ranges: + assert last_end < start + assert start <= end + if start - last_end > 1: + full_ranges.append((last_end+1, start-1, default)) + full_ranges.append((start, end, name)) + last_start, last_end = start, end + if last_end != MAX_UNICODE: + full_ranges.append((last_end+1, MAX_UNICODE, default)) + + # reduce total number of ranges by combining continuous ones + last_start, last_end, last_name = full_ranges.pop(0) + merged_ranges = [] + for start, end, name in full_ranges: + if name == last_name: + continue + else: + merged_ranges.append((last_start, start-1, last_name)) + last_start, line_end, last_name = start, end, name + merged_ranges.append((last_start, MAX_UNICODE, last_name)) + + # make sure that the ranges cover the full unicode repertoire + assert merged_ranges[0][0] == 0 + for (cs, ce, cn), (ns, ne, nn) in zip(merged_ranges, merged_ranges[1:]): + assert ce+1 == ns + assert merged_ranges[-1][1] == MAX_UNICODE + + return merged_ranges -def build_scripts(output_path=None): - """Fetch "Scripts.txt" data file, parse the script ranges and write - them as a list of Python tuples to 'fontTools.unicodedata.scripts'. +def build_scripts(local_ucd=None, output_path=None): + """Fetch "Scripts.txt" data file from Unicode official website, parse + the script ranges and write them as a list of Python tuples to + 'fontTools.unicodedata.scripts'. + + To load "Scripts.txt" from a local directory, you can use the + 'local_ucd' argument. """ - filename = "Scripts.txt" - with open_unidata_file(filename) as f: - header = parse_unidata_header(f) - script_ranges = parse_range_properties(f) - if not output_path: output_path = UNIDATA_PATH + "scripts.py" + filename = "Scripts.txt" + if local_ucd: + log.info("loading %r from local directory %r", filename, local_ucd) + cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") + else: + log.info("downloading %r from %r", filename, UNIDATA_URL) + cm = open_unidata_file(filename) + + with cm as f: + header = parse_unidata_header(f) + ranges = parse_range_properties(f) + + max_name_length = max(len(n) for _, _, n in ranges) + with open(output_path, "w", encoding="utf-8") as f: f.write(SRC_ENCODING) f.write("#\n") @@ -99,14 +155,36 @@ def build_scripts(output_path=None): f.write(header+"\n\n") f.write("SCRIPT_RANGES = [\n") - for first, last, script_name in sorted(script_ranges): - f.write(" (0x{:X}, 0x{:X}, '{}'),\n".format( + for first, last, script_name in ranges: + f.write(" 0x{:0>4X}, # .. 0x{:0>4X} ; {}\n".format( first, last, tostr(script_name))) f.write("]\n") + f.write("\n") + f.write("SCRIPT_NAMES = [\n") + for first, last, script_name in ranges: + script_name = "'{}',".format(script_name) + f.write(" {} # {:0>4X}..{:0>4X}\n".format( + script_name.ljust(max_name_length+3), first, last)) + f.write("]\n") + + log.info("saved new file: %r", os.path.normpath(output_path)) + def main(): - build_scripts() + import argparse + + parser = argparse.ArgumentParser( + description="Generate fontTools.unicodedata from UCD data files") + parser.add_argument( + '--ucd-path', help="Path to local folder containing UCD data files") + parser.add_argument('-q', '--quiet', action="store_true") + options = parser.parse_args() + + level = "WARNING" if options.quiet else "INFO" + logging.basicConfig(level=level, format="%(message)s") + + build_scripts(local_ucd=options.ucd_path) if __name__ == "__main__":