buildUCD.py: minor refactorings, remove py2-isms

2025-01-29 17:48:00 +00:00 · 2025-01-29 17:48:00 +00:00 · 4957cd7aba
commit 4957cd7aba
parent eece3c1b62
1 changed file with 18 additions and 28 deletions
--- a/MetaTools/buildUCD.py
+++ b/MetaTools/buildUCD.py
@@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
 """
-try:
+from urllib.request import urlopen
    from urllib.request import urlopen
 except ImportError:
    from urllib2 import urlopen
 import re
 import logging
 import os
 from io import open
 from os.path import abspath, dirname, join as pjoin, pardir, sep
 from typing import List
 try:  # pragma: no cover
    unicode
 except NameError:
    unicode = str
 UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
 UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
@@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
 log = logging.getLogger()
-def open_unidata_file(filename) -> List[str]:
+def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
-    """Open a text file from https://unicode.org/Public/UNIDATA/"""
+    """Read a UCD file from https://unicode.org or optionally from a local directory.
-    url = UNIDATA_URL + filename
+
-    return urlopen(url).read().decode("utf-8").splitlines()
+    Return the list of lines.
    """
    if local_ucd_path is not None:
        with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
            return f.readlines()
    else:
        url = UNIDATA_URL + filename
        with urlopen(url) as response:
            return response.read().decode("utf-8").splitlines(keepends=True)
 def parse_unidata_header(file_lines: List[str]):
@@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
            header.append(line)
        else:
            break
-    return "\n".join(header)
+    return "".join(header)
 def parse_range_properties(infile: List[str], default=None, is_set=False):
@@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
    ranges.sort()
    if isinstance(default, unicode):
        default = str(default)
    # fill the gaps between explicitly defined ranges
    last_start, last_end = -1, -1
    full_ranges = []
@@ -178,13 +173,10 @@ def build_ranges(
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        file_lines = [
            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
        ]
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        file_lines = open_unidata_file(filename)
    file_lines = read_unidata_file(filename, local_ucd)
    header = parse_unidata_header(file_lines)
    ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
@@ -201,7 +193,7 @@ def build_ranges(
        f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        f.write("#\n")
-        f.write(header + "\n\n")
+        f.write(header + "\n")
        f.write("RANGES = [\n")
        for first, last, value in ranges:
@@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        file_lines = [
            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
        ]
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        file_lines = open_unidata_file(filename)
+
    file_lines = read_unidata_file(filename, local_ucd)
    header = parse_unidata_header(file_lines)
    data = parse_semicolon_separated_data(file_lines)