buildUCD.py: minor refactorings, remove py2-isms

2025-01-29 17:48:00 +00:00 · 2025-01-29 17:48:00 +00:00 · 4957cd7aba
commit 4957cd7aba
parent eece3c1b62
1 changed files with 18 additions and 28 deletions
--- a/MetaTools/buildUCD.py
+++ b/MetaTools/buildUCD.py
@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
 """


-try:
-    from urllib.request import urlopen
-except ImportError:
-    from urllib2 import urlopen
+from urllib.request import urlopen
 import re
 import logging
 import os
-from io import open
 from os.path import abspath, dirname, join as pjoin, pardir, sep
 from typing import List


-try:  # pragma: no cover
-    unicode
-except NameError:
-    unicode = str
-
-
 UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
 UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
 log = logging.getLogger()


-def open_unidata_file(filename) -> List[str]:
-    """Open a text file from https://unicode.org/Public/UNIDATA/"""
-    url = UNIDATA_URL + filename
-    return urlopen(url).read().decode("utf-8").splitlines()
+def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
+    """Read a UCD file from https://unicode.org or optionally from a local directory.
+
+    Args:
+        filename: Name of the data file; it is appended to ``UNIDATA_URL`` or
+            joined onto ``local_ucd_path``.
+        local_ucd_path: Directory containing a local copy of the file. When it
+            is None (the default) the file is downloaded from ``UNIDATA_URL``.
+
+    Returns:
+        The file content decoded as UTF-8 and split into lines. Each line keeps
+        its terminator, normalized to a single line feed, regardless of which
+        source the file came from.
+
+    NOTE(review): the callers visible in this change pick their log message
+    with ``if local_ucd:`` while this function tests ``is not None``; an empty
+    string would be logged as a download but read from disk -- confirm intent.
+    """
+    if local_ucd_path is not None:
+        with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
+            return f.readlines()
+
+    # Imported here so that this function does not depend on a new
+    # module-level import.
+    import io
+
+    with urlopen(UNIDATA_URL + filename) as response:
+        text = response.read().decode("utf-8")
+    # str.splitlines() would also break lines on form feed, U+2028 and similar
+    # characters and would leave CR LF untranslated. Going through a
+    # universal-newlines text stream splits exactly like the local branch.
+    return io.StringIO(text, newline=None).readlines()


 def parse_unidata_header(file_lines: List[str]):
@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
            header.append(line)
        else:
            break
-    return "\n".join(header)
+    return "".join(header)


 def parse_range_properties(infile: List[str], default=None, is_set=False):
@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):

    ranges.sort()

-    if isinstance(default, unicode):
-        default = str(default)
-
    # fill the gaps between explicitly defined ranges
    last_start, last_end = -1, -1
    full_ranges = []
@ -178,13 +173,10 @@ def build_ranges(

    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        file_lines = [
-            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
-        ]
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        file_lines = open_unidata_file(filename)

+    file_lines = read_unidata_file(filename, local_ucd)
    header = parse_unidata_header(file_lines)
    ranges = parse_range_properties(file_lines, default=default, is_set=is_set)

@ -201,7 +193,7 @@ def build_ranges(
        f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        f.write("#\n")
-        f.write(header + "\n\n")
+        f.write(header + "\n")

        f.write("RANGES = [\n")
        for first, last, value in ranges:
@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        file_lines = [
-            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
-        ]
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        file_lines = open_unidata_file(filename)
+
+    file_lines = read_unidata_file(filename, local_ucd)
    header = parse_unidata_header(file_lines)
    data = parse_semicolon_separated_data(file_lines)