fonttools/MetaTools/buildUCD.py

#!/usr/bin/env python
"""
Tools to parse data files from the Unicode Character Database.
"""

from __future__ import print_function, absolute_import, division
from __future__ import unicode_literals
from fontTools.misc.py23 import *

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
from os.path import abspath, dirname, join as pjoin, pardir, sep


UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = pjoin(abspath(dirname(__file__)), pardir,
                     "Lib", "fontTools", "unicodedata") + sep

SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"


@contextmanager
def open_unidata_file(filename):
    """Open a text file from https://unicode.org/Public/UNIDATA/"""
    url = UNIDATA_URL + filename
    with closing(urlopen(url)) as response:
        yield iterdecode(response, encoding="utf-8")


def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.
    """
    header = []
    line = next(infile)
    while line.startswith("#"):
        header.append(line)
        line = next(infile)
    return "".join(header)


def parse_range_properties(infile):
    """Parse a Unicode data file containing a column with one character or
    a range of characters, and another column containing a property value
    separated by a semicolon. Comments after '#' are ignored.
    """
    ranges = []
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)")  # everything up to the potential comment
    for line in infile:
        match = line_regex.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        if last is None:
            last = first

        first = int(first, 16)
        last = int(last, 16)
        data = data.rstrip()

        ranges.append((first, last, data))

    return ranges


def build_scripts(output_path=None):
    """Fetch "Scripts.txt" data file, parse the script ranges and write
    them as a list of Python tuples to 'fontTools.unicodedata.scripts'.
    """
    filename = "Scripts.txt"
    with open_unidata_file(filename) as f:
        header = parse_unidata_header(f)
        script_ranges = parse_range_properties(f)

    if not output_path:
        output_path = UNIDATA_PATH + "scripts.py"

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(SRC_ENCODING)
        f.write("#\n")
        f.write(NOTICE)
        f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        f.write("#\n")
        f.write(header+"\n\n")

        f.write("SCRIPT_RANGES = [\n")
        for first, last, script_name in sorted(script_ranges):
            f.write("    (0x{:X}, 0x{:X}, '{}'),\n".format(
                first, last, tostr(script_name)))
        f.write("]\n")


def main():
    build_scripts()


if __name__ == "__main__":
    import sys
    sys.exit(main())