buildUCD.py: minor refactorings, remove py2-isms
This commit is contained in:
parent
eece3c1b62
commit
4957cd7aba
@ -4,24 +4,14 @@ Tools to parse data files from the Unicode Character Database.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
try:
|
from urllib.request import urlopen
|
||||||
from urllib.request import urlopen
|
|
||||||
except ImportError:
|
|
||||||
from urllib2 import urlopen
|
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from io import open
|
|
||||||
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
try: # pragma: no cover
|
|
||||||
unicode
|
|
||||||
except NameError:
|
|
||||||
unicode = str
|
|
||||||
|
|
||||||
|
|
||||||
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
|
UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
|
||||||
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
|
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"
|
||||||
|
|
||||||
@ -39,10 +29,18 @@ MAX_UNICODE = 0x10FFFF
|
|||||||
log = logging.getLogger()
|
log = logging.getLogger()
|
||||||
|
|
||||||
|
|
||||||
def open_unidata_file(filename) -> List[str]:
|
def read_unidata_file(filename, local_ucd_path=None) -> List[str]:
|
||||||
"""Open a text file from https://unicode.org/Public/UNIDATA/"""
|
"""Read a UCD file from https://unicode.org or optionally from a local directory.
|
||||||
url = UNIDATA_URL + filename
|
|
||||||
return urlopen(url).read().decode("utf-8").splitlines()
|
Return the list of lines.
|
||||||
|
"""
|
||||||
|
if local_ucd_path is not None:
|
||||||
|
with open(pjoin(local_ucd_path, filename), "r", encoding="utf-8") as f:
|
||||||
|
return f.readlines()
|
||||||
|
else:
|
||||||
|
url = UNIDATA_URL + filename
|
||||||
|
with urlopen(url) as response:
|
||||||
|
return response.read().decode("utf-8").splitlines(keepends=True)
|
||||||
|
|
||||||
|
|
||||||
def parse_unidata_header(file_lines: List[str]):
|
def parse_unidata_header(file_lines: List[str]):
|
||||||
@ -55,7 +53,7 @@ def parse_unidata_header(file_lines: List[str]):
|
|||||||
header.append(line)
|
header.append(line)
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
return "\n".join(header)
|
return "".join(header)
|
||||||
|
|
||||||
|
|
||||||
def parse_range_properties(infile: List[str], default=None, is_set=False):
|
def parse_range_properties(infile: List[str], default=None, is_set=False):
|
||||||
@ -93,9 +91,6 @@ def parse_range_properties(infile: List[str], default=None, is_set=False):
|
|||||||
|
|
||||||
ranges.sort()
|
ranges.sort()
|
||||||
|
|
||||||
if isinstance(default, unicode):
|
|
||||||
default = str(default)
|
|
||||||
|
|
||||||
# fill the gaps between explicitly defined ranges
|
# fill the gaps between explicitly defined ranges
|
||||||
last_start, last_end = -1, -1
|
last_start, last_end = -1, -1
|
||||||
full_ranges = []
|
full_ranges = []
|
||||||
@ -178,13 +173,10 @@ def build_ranges(
|
|||||||
|
|
||||||
if local_ucd:
|
if local_ucd:
|
||||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||||
file_lines = [
|
|
||||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||||
file_lines = open_unidata_file(filename)
|
|
||||||
|
|
||||||
|
file_lines = read_unidata_file(filename, local_ucd)
|
||||||
header = parse_unidata_header(file_lines)
|
header = parse_unidata_header(file_lines)
|
||||||
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
|
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
|
||||||
|
|
||||||
@ -201,7 +193,7 @@ def build_ranges(
|
|||||||
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
|
f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
|
||||||
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
|
f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
|
||||||
f.write("#\n")
|
f.write("#\n")
|
||||||
f.write(header + "\n\n")
|
f.write(header + "\n")
|
||||||
|
|
||||||
f.write("RANGES = [\n")
|
f.write("RANGES = [\n")
|
||||||
for first, last, value in ranges:
|
for first, last, value in ranges:
|
||||||
@ -259,12 +251,10 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
|
|||||||
filename = "PropertyValueAliases.txt"
|
filename = "PropertyValueAliases.txt"
|
||||||
if local_ucd:
|
if local_ucd:
|
||||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||||
file_lines = [
|
|
||||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
|
||||||
]
|
|
||||||
else:
|
else:
|
||||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||||
file_lines = open_unidata_file(filename)
|
|
||||||
|
file_lines = read_unidata_file(filename, local_ucd)
|
||||||
header = parse_unidata_header(file_lines)
|
header = parse_unidata_header(file_lines)
|
||||||
data = parse_semicolon_separated_data(file_lines)
|
data = parse_semicolon_separated_data(file_lines)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user