buildUCD.py: minor refactorings, remove py2-isms

This commit is contained in:
Cosimo Lupo 2025-01-29 17:48:00 +00:00
parent eece3c1b62
commit 4957cd7aba

View File

@@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
""" """
try: from urllib.request import urlopen
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
import re import re
import logging import logging
import os import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep from os.path import abspath, dirname, join as pjoin, pardir, sep
from typing import List from typing import List
try: # pragma: no cover
unicode
except NameError:
unicode = str
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/" UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License" UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
@@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
log = logging.getLogger() log = logging.getLogger()
def open_unidata_file(filename) -> List[str]: def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
"""Open a text file from https://unicode.org/Public/UNIDATA/""" """Read a UCD file from https://unicode.org or optionally from a local directory.
url = UNIDATA_URL + filename
return urlopen(url).read().decode("utf-8").splitlines() Return the list of lines.
"""
if local_ucd_path is not None:
with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
return f.readlines()
else:
url = UNIDATA_URL + filename
with urlopen(url) as response:
return response.read().decode("utf-8").splitlines(keepends=True)
def parse_unidata_header(file_lines: List[str]): def parse_unidata_header(file_lines: List[str]):
@@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
header.append(line) header.append(line)
else: else:
break break
return "\n".join(header) return "".join(header)
def parse_range_properties(infile: List[str], default=None, is_set=False): def parse_range_properties(infile: List[str], default=None, is_set=False):
@@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
ranges.sort() ranges.sort()
if isinstance(default, unicode):
default = str(default)
# fill the gaps between explicitly defined ranges # fill the gaps between explicitly defined ranges
last_start, last_end = -1, -1 last_start, last_end = -1, -1
full_ranges = [] full_ranges = []
@@ -178,13 +173,10 @@ def build_ranges(
if local_ucd: if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd) log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else: else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines) header = parse_unidata_header(file_lines)
ranges = parse_range_properties(file_lines, default=default, is_set=is_set) ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
@@ -201,7 +193,7 @@ def build_ranges(
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL)) f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
f.write("#\n") f.write("#\n")
f.write(header + "\n\n") f.write(header + "\n")
f.write("RANGES = [\n") f.write("RANGES = [\n")
for first, last, value in ranges: for first, last, value in ranges:
@@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
filename = "PropertyValueAliases.txt" filename = "PropertyValueAliases.txt"
if local_ucd: if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd) log.info("loading '%s' from local directory '%s'", filename, local_ucd)
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else: else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
file_lines = open_unidata_file(filename)
file_lines = read_unidata_file(filename, local_ucd)
header = parse_unidata_header(file_lines) header = parse_unidata_header(file_lines)
data = parse_semicolon_separated_data(file_lines) data = parse_semicolon_separated_data(file_lines)