buildUCD.py: minor refactorings, remove py2-isms

This commit is contained in:
Cosimo Lupo 2025-01-29 17:48:00 +00:00
parent eece3c1b62
commit 4957cd7aba

View File

@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
"""
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
import re
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep
from typing import List
try: # pragma: no cover
unicode
except NameError:
unicode = str
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
log = logging.getLogger()
def open_unidata_file(filename) -> List[str]:
"""Open a text file from https://unicode.org/Public/UNIDATA/"""
def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
"""Read a UCD file from https://unicode.org or optionally from a local directory.
Return the list of lines.
"""
if local_ucd_path is not None:
with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
return f.readlines()
else:
url = UNIDATA_URL + filename
return urlopen(url).read().decode("utf-8").splitlines()
with urlopen(url) as response:
return response.read().decode("utf-8").splitlines(keepends=True)
def parse_unidata_header(file_lines: List[str]):
@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
header.append(line)
else:
break
return "\n".join(header)
return "".join(header)
def parse_range_properties(infile: List[str], default=None, is_set=False):
@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
ranges.sort()
if isinstance(default, unicode):
default = str(default)
# fill the gaps between explicitly defined ranges
last_start, last_end = -1, -1
full_ranges = []
@ -178,13 +173,10 @@ def build_ranges(
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines)
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
@ -201,7 +193,7 @@ def build_ranges(
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
f.write("#\n")
f.write(header + "\n\n")
f.write(header + "\n")
f.write("RANGES = [\n")
for first, last, value in ranges:
@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
filename = "PropertyValueAliases.txt"
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines)
data = parse_semicolon_separated_data(file_lines)