buildUCD.py: minor refactorings, remove py2-isms
This commit is contained in:
parent
eece3c1b62
commit
4957cd7aba
@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
|
||||
"""
|
||||
|
||||
|
||||
try:
|
||||
from urllib.request import urlopen
|
||||
except ImportError:
|
||||
from urllib2 import urlopen
|
||||
from urllib.request import urlopen
|
||||
import re
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
||||
from typing import List
|
||||
|
||||
|
||||
try: # pragma: no cover
|
||||
unicode
|
||||
except NameError:
|
||||
unicode = str
|
||||
|
||||
|
||||
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
|
||||
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
|
||||
|
||||
@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
|
||||
log = logging.getLogger()
|
||||
|
||||
|
||||
def open_unidata_file(filename) -> List[str]:
|
||||
"""Open a text file from https://unicode.org/Public/UNIDATA/"""
|
||||
url = UNIDATA_URL + filename
|
||||
return urlopen(url).read().decode("utf-8").splitlines()
|
||||
def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
|
||||
"""Read a UCD file from https://unicode.org or optionally from a local directory.
|
||||
|
||||
Return the list of lines.
|
||||
"""
|
||||
if local_ucd_path is not None:
|
||||
with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
|
||||
return f.readlines()
|
||||
else:
|
||||
url = UNIDATA_URL + filename
|
||||
with urlopen(url) as response:
|
||||
return response.read().decode("utf-8").splitlines(keepends=True)
|
||||
|
||||
|
||||
def parse_unidata_header(file_lines: List[str]):
|
||||
@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
|
||||
header.append(line)
|
||||
else:
|
||||
break
|
||||
return "\n".join(header)
|
||||
return "".join(header)
|
||||
|
||||
|
||||
def parse_range_properties(infile: List[str], default=None, is_set=False):
|
||||
@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
|
||||
|
||||
ranges.sort()
|
||||
|
||||
if isinstance(default, unicode):
|
||||
default = str(default)
|
||||
|
||||
# fill the gaps between explicitly defined ranges
|
||||
last_start, last_end = -1, -1
|
||||
full_ranges = []
|
||||
@ -178,13 +173,10 @@ def build_ranges(
|
||||
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
file_lines = open_unidata_file(filename)
|
||||
|
||||
file_lines = read_unidata_file(filename, local_ucd)
|
||||
header = parse_unidata_header(file_lines)
|
||||
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
|
||||
|
||||
@ -201,7 +193,7 @@ def build_ranges(
|
||||
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
|
||||
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
|
||||
f.write("#\n")
|
||||
f.write(header + "\n\n")
|
||||
f.write(header + "\n")
|
||||
|
||||
f.write("RANGES = [\n")
|
||||
for first, last, value in ranges:
|
||||
@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
|
||||
filename = "PropertyValueAliases.txt"
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
file_lines = open_unidata_file(filename)
|
||||
|
||||
file_lines = read_unidata_file(filename, local_ucd)
|
||||
header = parse_unidata_header(file_lines)
|
||||
data = parse_semicolon_separated_data(file_lines)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user