diff --git a/MetaTools/buildUCD.py b/MetaTools/buildUCD.py index f7117dafb..898e3759c 100755 --- a/MetaTools/buildUCD.py +++ b/MetaTools/buildUCD.py @@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database. """ -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen +from urllib.request import urlopen import re import logging import os -from io import open from os.path import abspath, dirname, join as pjoin, pardir, sep from typing import List -try: # pragma: no cover - unicode -except NameError: - unicode = str - - UNIDATA_URL = "https://unicode.org/Public/UNIDATA/" UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License" @@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF log = logging.getLogger() -def open_unidata_file(filename) -> List[str]: - """Open a text file from https://unicode.org/Public/UNIDATA/""" - url = UNIDATA_URL + filename - return urlopen(url).read().decode("utf-8").splitlines() +def read_unidata_file(filename, local_ucd_path=None) -> List[str]: + """Read a UCD file from https://unicode.org or optionally from a local directory. + + Return the list of lines. + """ + if local_ucd_path is not None: + with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f: + return f.readlines() + else: + url = UNIDATA_URL + filename + with urlopen(url) as response: + return response.read().decode("utf-8").splitlines(keepends=True) def parse_unidata_header(file_lines: List[str]): @@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]): header.append(line) else: break - return "\n".join(header) + return "".join(header) def parse_range_properties(infile: List[str], default=None, is_set=False): @@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False): ranges.sort() - if isinstance(default, unicode): - default = str(default) - # fill the gaps between explicitly defined ranges last_start, last_end = -1, -1 full_ranges = [] @@ -178,13 +173,10 @@ def build_ranges( if local_ucd: log.info("loading '%s' from local directory '%s'", filename, local_ucd) - file_lines = [ - l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8") - ] else: log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) - file_lines = open_unidata_file(filename) + file_lines = read_unidata_file(filename, local_ucd) header = parse_unidata_header(file_lines) ranges = parse_range_properties(file_lines, default=default, is_set=is_set) @@ -201,7 +193,7 @@ def build_ranges( f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) f.write("# License: {}\n".format(UNIDATA_LICENSE_URL)) f.write("#\n") - f.write(header + "\n\n") + f.write(header + "\n") f.write("RANGES = [\n") for first, last, value in ranges: @@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None): filename = "PropertyValueAliases.txt" if local_ucd: log.info("loading '%s' from local directory '%s'", filename, local_ucd) - file_lines = [ - l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8") - ] else: log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) - file_lines = open_unidata_file(filename) + + file_lines = read_unidata_file(filename, local_ucd) header = parse_unidata_header(file_lines) data = parse_semicolon_separated_data(file_lines)