[unicodedata] Fix bug in buildUCD.py

The method that was being used to read the header from the input was
inadvertently dropping the first non-header line on the floor. Although
this happens to be okay in some cases (where there is an empty line
after the header), in the case of newer versions of the
ScriptExtensions.txt file it was causing the generated code to be
missing the first entry, for U+00B7 (MIDDLE DOT).
This commit is contained in:
Colin Rofls 2025-01-28 19:03:43 -05:00
parent 6bc5d7f887
commit b26271cc4d
5 changed files with 73 additions and 29 deletions

View File

@ -16,7 +16,6 @@
# Format:
# Start Code..End Code; Block Name
RANGES = [
0x0000, # .. 0x007F ; Basic Latin
0x0080, # .. 0x00FF ; Latin-1 Supplement

View File

@ -32,9 +32,10 @@
#
# @missing: 0000..10FFFF; <script>
RANGES = [
0x0000, # .. 0x02BB ; None
0x0000, # .. 0x00B6 ; None
0x00B7, # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
0x00B8, # .. 0x02BB ; None
0x02BC, # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
0x02BD, # .. 0x02C6 ; None
0x02C7, # .. 0x02C7 ; {'Bopo', 'Latn'}
@ -316,7 +317,26 @@ RANGES = [
]
VALUES = [
None, # 0000..02BB
None, # 0000..00B6
{
"Avst",
"Cari",
"Copt",
"Dupl",
"Elba",
"Geor",
"Glag",
"Gong",
"Goth",
"Grek",
"Hani",
"Latn",
"Lydi",
"Mahj",
"Perm",
"Shaw",
}, # 00B7..00B7
None, # 00B8..02BB
{"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}, # 02BC..02BC
None, # 02BD..02C6
{"Bopo", "Latn"}, # 02C7..02C7

View File

@ -19,7 +19,6 @@
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
#
RANGES = [
0x0000, # .. 0x0040 ; Common
0x0041, # .. 0x005A ; Latin

View File

@ -8,13 +8,12 @@ try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep
from typing import List
try: # pragma: no cover
@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF
log = logging.getLogger()
def open_unidata_file(filename) -> List[str]:
    """Download a text file from https://unicode.org/Public/UNIDATA/.

    Returns the decoded contents as a list of lines (without trailing
    newlines), so callers can scan the data more than once — unlike the
    previous generator-based version, which consumed lines irrevocably.
    """
    url = UNIDATA_URL + filename
    # Close the HTTP response deterministically rather than relying on
    # garbage collection to release the socket.
    with closing(urlopen(url)) as response:
        return response.read().decode("utf-8").splitlines()
def parse_unidata_header(file_lines: List[str]):
    """Collect the leading '#' comment lines of a Unicode data file.

    Scans ``file_lines`` from the top and gathers every consecutive line
    that starts with '#', stopping at the first line that does not. The
    input list is not modified, so callers can re-scan it afterwards
    (the first data line is NOT consumed).
    """
    header_lines = []
    for candidate in file_lines:
        # First non-comment line marks the end of the header.
        if not candidate.startswith("#"):
            break
        header_lines.append(candidate)
    return "\n".join(header_lines)
def parse_range_properties(infile, default=None, is_set=False):
def parse_range_properties(infile: List[str], default=None, is_set=False):
"""Parse a Unicode data file containing a column with one character or
a range of characters, and another column containing a property value
separated by a semicolon. Comments after '#' are ignored.
@ -180,14 +178,15 @@ def build_ranges(
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
cm = open_unidata_file(filename)
file_lines = open_unidata_file(filename)
with cm as f:
header = parse_unidata_header(f)
ranges = parse_range_properties(f, default=default, is_set=is_set)
header = parse_unidata_header(file_lines)
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
if aliases:
reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
filename = "PropertyValueAliases.txt"
if local_ucd:
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
file_lines = [
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
]
else:
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
cm = open_unidata_file(filename)
with cm as f:
header = parse_unidata_header(f)
data = parse_semicolon_separated_data(f)
file_lines = open_unidata_file(filename)
header = parse_unidata_header(file_lines)
data = parse_semicolon_separated_data(file_lines)
aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}

View File

@ -155,6 +155,33 @@ def test_script():
def test_script_extension():
    # U+00B7 (MIDDLE DOT) is the first entry in newer versions of
    # ScriptExtensions.txt; it was previously dropped because the header
    # parser also consumed the first non-header line. This assertion
    # guards against that regression.
    # NOTE(review): the expected sets mirror ScriptExtensions.txt and
    # must be updated alongside the bundled UCD data.
    assert unicodedata.script_extension("\u00B7") == {
        "Avst",
        "Cari",
        "Copt",
        "Dupl",
        "Elba",
        "Geor",
        "Glag",
        "Gong",
        "Goth",
        "Grek",
        "Hani",
        "Latn",
        "Lydi",
        "Mahj",
        "Perm",
        "Shaw",
    }
    assert unicodedata.script_extension("\u02BC") == {
        "Beng",
        "Cyrl",
        "Deva",
        "Latn",
        "Lisu",
        "Thai",
        "Toto",
    }
    assert unicodedata.script_extension("a") == {"Latn"}
    # Characters with no specific extension fall back to their script:
    # NUL is Common ("Zyyy"); an unassigned code point is Unknown ("Zzzz").
    assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
    assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}