[unicodedata] Fix bug in buildUCD.py
The method used to read the header from the input file was inadvertently dropping the first non-header line on the floor. This happens to be harmless when an empty line follows the header, but with newer versions of ScriptExtensions.txt it caused the generated code to be missing its first entry, for U+00B7 (MIDDLE DOT).
commit b26271cc4d
parent 6bc5d7f887
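Why the old reader lost a line: it pulled lines off a shared iterator, so the first non-'#' line was consumed by the header loop and never reached the data parser. A minimal sketch of that behavior (parse_header_old mirrors the code removed below; the sample lines are invented):

def parse_header_old(infile):
    header = []
    line = next(infile)
    while line.startswith("#"):
        header.append(line)
        line = next(infile)  # reads one line past the header...
    return "".join(header)   # ...which is discarded when the loop exits

lines = iter([
    "# a header line\n",              # invented sample content
    "00B7          ; Avst Cari Copt\n",  # first data line
    "02BC          ; Beng Cyrl Deva\n",
])
parse_header_old(lines)
print(next(lines))  # -> "02BC ..." : the U+00B7 entry was silently dropped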
Lib/fontTools/unicodedata/Blocks.py

@@ -16,7 +16,6 @@
 # Format:
 # Start Code..End Code; Block Name
 
-
 RANGES = [
     0x0000,  # .. 0x007F ; Basic Latin
     0x0080,  # .. 0x00FF ; Latin-1 Supplement
Lib/fontTools/unicodedata/ScriptExtensions.py

@@ -32,9 +32,10 @@
 #
 # @missing: 0000..10FFFF; <script>
 
-
 RANGES = [
-    0x0000,  # .. 0x02BB ; None
+    0x0000,  # .. 0x00B6 ; None
+    0x00B7,  # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
+    0x00B8,  # .. 0x02BB ; None
     0x02BC,  # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
     0x02BD,  # .. 0x02C6 ; None
     0x02C7,  # .. 0x02C7 ; {'Bopo', 'Latn'}

@@ -316,7 +317,26 @@ RANGES = [
 ]
 
 VALUES = [
-    None,  # 0000..02BB
+    None,  # 0000..00B6
+    {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    },  # 00B7..00B7
+    None,  # 00B8..02BB
     {"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"},  # 02BC..02BC
     None,  # 02BD..02C6
     {"Bopo", "Latn"},  # 02C7..02C7
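For reference, these generated tables are parallel arrays: RANGES holds the start code point of each range and VALUES the script-extension set (or None) at the same index. A sketch of the bisect-based lookup this layout is designed for (the helper name is illustrative, not fontTools API):

from bisect import bisect_right

def lookup(ranges, values, code_point):
    # The rightmost start point <= code_point selects the range; its
    # parallel VALUES entry applies up to the next start point.
    i = bisect_right(ranges, code_point)
    return values[i - 1]

With the regenerated tables, lookup(RANGES, VALUES, 0x00B7) returns the 16-script set; previously U+00B7 fell inside the 0000..02BB None range.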
Lib/fontTools/unicodedata/Scripts.py

@@ -19,7 +19,6 @@
 # https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
 #
 
-
 RANGES = [
     0x0000,  # .. 0x0040 ; Common
     0x0041,  # .. 0x005A ; Latin
MetaTools/buildUCD.py

@@ -8,13 +8,12 @@ try:
     from urllib.request import urlopen
 except ImportError:
     from urllib2 import urlopen
-from contextlib import closing, contextmanager
 import re
-from codecs import iterdecode
 import logging
 import os
 from io import open
 from os.path import abspath, dirname, join as pjoin, pardir, sep
+from typing import List
 
 
 try:  # pragma: no cover

@@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF
 log = logging.getLogger()
 
 
-@contextmanager
-def open_unidata_file(filename):
+def open_unidata_file(filename) -> List[str]:
     """Open a text file from https://unicode.org/Public/UNIDATA/"""
     url = UNIDATA_URL + filename
-    with closing(urlopen(url)) as response:
-        yield iterdecode(response, encoding="utf-8")
+    return urlopen(url).read().decode("utf-8").splitlines()
 
 
-def parse_unidata_header(infile):
+def parse_unidata_header(file_lines: List[str]):
     """Read the top header of data files, until the first line
     that does not start with '#'.
     """
     header = []
-    line = next(infile)
-    while line.startswith("#"):
-        header.append(line)
-        line = next(infile)
-    return "".join(header)
+    for line in file_lines:
+        if line.startswith("#"):
+            header.append(line)
+        else:
+            break
+    return "\n".join(header)
 
 
-def parse_range_properties(infile, default=None, is_set=False):
+def parse_range_properties(infile: List[str], default=None, is_set=False):
     """Parse a Unicode data file containing a column with one character or
     a range of characters, and another column containing a property value
     separated by a semicolon. Comments after '#' are ignored.

@@ -180,14 +178,15 @@ def build_ranges(
 
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
+        file_lines = open_unidata_file(filename)
 
-    with cm as f:
-        header = parse_unidata_header(f)
-        ranges = parse_range_properties(f, default=default, is_set=is_set)
+    header = parse_unidata_header(file_lines)
+    ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
 
     if aliases:
         reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}

@@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
     filename = "PropertyValueAliases.txt"
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
-
-    with cm as f:
-        header = parse_unidata_header(f)
-        data = parse_semicolon_separated_data(f)
+        file_lines = open_unidata_file(filename)
+    header = parse_unidata_header(file_lines)
+    data = parse_semicolon_separated_data(file_lines)
 
     aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}
 
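Net effect of the buildUCD.py changes: each data file is now read fully into a list of lines, so parse_unidata_header can stop at the first data line without consuming it, and parse_range_properties re-scans from the top, skipping the '#' header lines as comments per its docstring. A small sketch of the fixed flow, with invented sample lines:

file_lines = [
    "# header line",        # invented sample
    "00B7  ; Avst Cari",    # first data line
    "02BC  ; Beng Cyrl",
]
header = parse_unidata_header(file_lines)  # -> "# header line"
assert file_lines[1].startswith("00B7")    # nothing was consumed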
Tests/unicodedata/unicodedata_test.py

@@ -155,6 +155,33 @@ def test_script():
 
 
 def test_script_extension():
+    assert unicodedata.script_extension("\u00B7") == {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    }
+    assert unicodedata.script_extension("\u02BC") == {
+        "Beng",
+        "Cyrl",
+        "Deva",
+        "Latn",
+        "Lisu",
+        "Thai",
+        "Toto",
+    }
     assert unicodedata.script_extension("a") == {"Latn"}
     assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
     assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}
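A quick way to see the user-visible effect once the tables are regenerated (hedged: the pre-fix {'Zyyy'} value assumes script_extension falls back to the character's own script when no extension is listed, as the chr(0) test above suggests):

from fontTools import unicodedata

unicodedata.script_extension("\u00B7")
# after the fix: the 16-script set including 'Latn', 'Grek', 'Copt', ...
# before the fix: {'Zyyy'}, because U+00B7 fell into the default None range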