buildUCD.py: minor refactorings, remove py2-isms

This commit is contained in:
Cosimo Lupo 2025-01-29 17:48:00 +00:00
parent eece3c1b62
commit 4957cd7aba

View File

@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
"""
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from urllib.request import urlopen
import re
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep
from typing import List
try: # pragma: no cover
unicode
except NameError:
unicode = str
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
log = logging.getLogger()
def open_unidata_file(filename) -> List[str]:
"""Open a text file from https://unicode.org/Public/UNIDATA/"""
url = UNIDATA_URL + filename
return urlopen(url).read().decode("utf-8").splitlines()
def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
"""Read a UCD file from https://unicode.org or optionally from a local directory.
Return the list of lines.
"""
if local_ucd_path is not None:
with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
return f.readlines()
else:
url = UNIDATA_URL + filename
with urlopen(url) as response:
return response.read().decode("utf-8").splitlines(keepends=True)
def parse_unidata_header(file_lines: List[str]):
@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
header.append(line)
else:
break
return "\n".join(header)
return "".join(header)
def parse_range_properties(infile: List[str], default=None, is_set=False):
@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
ranges.sort()
if isinstance(default, unicode):
default = str(default)
# fill the gaps between explicitly defined ranges
last_start, last_end = -1, -1
full_ranges = []
@ -178,13 +173,10 @@ def build_ranges(
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines)
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
@ -201,7 +193,7 @@ def build_ranges(
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
f.write("#\n")
f.write(header + "\n\n")
f.write(header + "\n")
f.write("RANGES = [\n")
for first, last, value in ranges:
@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
filename = "PropertyValueAliases.txt"
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines)
data = parse_semicolon_separated_data(file_lines)