[unicodedata] Fix bug in buildUCD.py
The method that was being used to read the header from the input was inadvertently dropping the first non-header line. Although this happens to be harmless in some cases (where there is an empty line after the header), with newer versions of the ScriptExtensions.txt file it caused the generated code to be missing the first entry, for U+00B7 (MIDDLE DOT).
This commit is contained in:
parent
6bc5d7f887
commit
b26271cc4d
@ -16,7 +16,6 @@
|
||||
# Format:
|
||||
# Start Code..End Code; Block Name
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x007F ; Basic Latin
|
||||
0x0080, # .. 0x00FF ; Latin-1 Supplement
|
||||
|
@ -32,9 +32,10 @@
|
||||
#
|
||||
# @missing: 0000..10FFFF; <script>
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x02BB ; None
|
||||
0x0000, # .. 0x00B6 ; None
|
||||
0x00B7, # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
|
||||
0x00B8, # .. 0x02BB ; None
|
||||
0x02BC, # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
|
||||
0x02BD, # .. 0x02C6 ; None
|
||||
0x02C7, # .. 0x02C7 ; {'Bopo', 'Latn'}
|
||||
@ -316,7 +317,26 @@ RANGES = [
|
||||
]
|
||||
|
||||
VALUES = [
|
||||
None, # 0000..02BB
|
||||
None, # 0000..00B6
|
||||
{
|
||||
"Avst",
|
||||
"Cari",
|
||||
"Copt",
|
||||
"Dupl",
|
||||
"Elba",
|
||||
"Geor",
|
||||
"Glag",
|
||||
"Gong",
|
||||
"Goth",
|
||||
"Grek",
|
||||
"Hani",
|
||||
"Latn",
|
||||
"Lydi",
|
||||
"Mahj",
|
||||
"Perm",
|
||||
"Shaw",
|
||||
}, # 00B7..00B7
|
||||
None, # 00B8..02BB
|
||||
{"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}, # 02BC..02BC
|
||||
None, # 02BD..02C6
|
||||
{"Bopo", "Latn"}, # 02C7..02C7
|
||||
|
@ -19,7 +19,6 @@
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x0040 ; Common
|
||||
0x0041, # .. 0x005A ; Latin
|
||||
|
@ -8,13 +8,12 @@ try:
|
||||
from urllib.request import urlopen
|
||||
except ImportError:
|
||||
from urllib2 import urlopen
|
||||
from contextlib import closing, contextmanager
|
||||
import re
|
||||
from codecs import iterdecode
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
||||
from typing import List
|
||||
|
||||
|
||||
try: # pragma: no cover
|
||||
@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF
|
||||
log = logging.getLogger()
|
||||
|
||||
|
||||
def open_unidata_file(filename) -> List[str]:
    """Fetch a text file from https://unicode.org/Public/UNIDATA/.

    Downloads the named file, decodes it as UTF-8, and returns its
    content split into a list of lines (without line terminators).
    """
    raw = urlopen(UNIDATA_URL + filename).read()
    return raw.decode("utf-8").splitlines()
|
||||
|
||||
|
||||
def parse_unidata_header(file_lines: List[str]):
    """Return the top comment header of a Unicode data file.

    Collects the consecutive leading lines that start with '#' and
    joins them with newlines.  Iteration stops at the first
    non-comment line WITHOUT consuming it, so the caller may pass the
    same full line list to a subsequent parser (this is the fix for
    the bug where the first data line was dropped on the floor).

    Args:
        file_lines: the file content as a list of lines.  Lines may or
            may not carry trailing newline characters: the remote path
            uses str.splitlines() (no terminators) while the local
            path iterates an open file (terminators kept).

    Returns:
        The header as a single string, one '#' line per row.
    """
    header = []
    for line in file_lines:
        if not line.startswith("#"):
            break
        # Strip trailing line endings so both input flavors (with or
        # without "\n") produce an identical header string; otherwise
        # the local-file path would yield doubled blank lines.
        header.append(line.rstrip("\r\n"))
    return "\n".join(header)
|
||||
|
||||
|
||||
def parse_range_properties(infile, default=None, is_set=False):
|
||||
def parse_range_properties(infile: List[str], default=None, is_set=False):
|
||||
"""Parse a Unicode data file containing a column with one character or
|
||||
a range of characters, and another column containing a property value
|
||||
separated by a semicolon. Comments after '#' are ignored.
|
||||
@ -180,14 +178,15 @@ def build_ranges(
|
||||
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
cm = open_unidata_file(filename)
|
||||
file_lines = open_unidata_file(filename)
|
||||
|
||||
with cm as f:
|
||||
header = parse_unidata_header(f)
|
||||
ranges = parse_range_properties(f, default=default, is_set=is_set)
|
||||
header = parse_unidata_header(file_lines)
|
||||
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
|
||||
|
||||
if aliases:
|
||||
reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
|
||||
@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
|
||||
filename = "PropertyValueAliases.txt"
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
cm = open_unidata_file(filename)
|
||||
|
||||
with cm as f:
|
||||
header = parse_unidata_header(f)
|
||||
data = parse_semicolon_separated_data(f)
|
||||
file_lines = open_unidata_file(filename)
|
||||
header = parse_unidata_header(file_lines)
|
||||
data = parse_semicolon_separated_data(file_lines)
|
||||
|
||||
aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}
|
||||
|
||||
|
@ -155,6 +155,33 @@ def test_script():
|
||||
|
||||
|
||||
def test_script_extension():
    # U+00B7 MIDDLE DOT: the entry that was previously dropped by the
    # header-reading bug; its full Script_Extensions set must round-trip.
    middle_dot_scripts = {
        "Avst", "Cari", "Copt", "Dupl",
        "Elba", "Geor", "Glag", "Gong",
        "Goth", "Grek", "Hani", "Latn",
        "Lydi", "Mahj", "Perm", "Shaw",
    }
    assert unicodedata.script_extension("\u00B7") == middle_dot_scripts

    apostrophe_scripts = {"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}
    assert unicodedata.script_extension("\u02BC") == apostrophe_scripts

    # Plain ASCII letter, an unassigned default (Common), and an
    # unassigned code point (Unknown).
    assert unicodedata.script_extension("a") == {"Latn"}
    assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
    assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}
|
||||
|
Loading…
x
Reference in New Issue
Block a user