Merge pull request #3756 from cmyr/unicode-data-fixup

[unicodedata] Fix bug in buildUCD.py
Cosimo Lupo, 2025-01-29 17:35:33 +00:00, committed by GitHub
commit eece3c1b62
5 changed files with 73 additions and 29 deletions
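
The one-line message undersells the bug. In the old buildUCD.py (diffed below), parse_unidata_header() pulled lines off the same iterator that the data parser read next, so the line that terminated the header loop (the first line not starting with '#') was consumed and silently dropped. For ScriptExtensions.txt that lost line is evidently the record assigning U+00B7 MIDDLE DOT its Script_Extensions set, which is why the regenerated tables below change around 00B7. A minimal sketch of the failure mode, with hypothetical lines standing in for a real UCD file:

    # Hypothetical sample data; the real files come from unicode.org/Public/UNIDATA/.
    lines = iter(
        [
            "# header line 1",
            "# header line 2",
            "00B7 ; Avst Cari Copt",  # first data record
            "02BC ; Beng Cyrl Deva",
        ]
    )

    def parse_header_buggy(infile):
        """The pre-fix logic: header parsing shares the caller's iterator."""
        header = []
        line = next(infile)
        while line.startswith("#"):
            header.append(line)
            line = next(infile)  # stops the loop by reading the first record...
        return "".join(header)  # ...which is never handed back to the caller

    parse_header_buggy(lines)
    print(list(lines))  # ['02BC ; Beng Cyrl Deva'], the 00B7 record is gone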

Lib/fontTools/unicodedata/Blocks.py

@@ -16,7 +16,6 @@
 # Format:
 # Start Code..End Code; Block Name
-
 RANGES = [
     0x0000,  # .. 0x007F ; Basic Latin
     0x0080,  # .. 0x00FF ; Latin-1 Supplement

Lib/fontTools/unicodedata/ScriptExtensions.py

@@ -32,9 +32,10 @@
 #
 # @missing: 0000..10FFFF; <script>
-
 RANGES = [
-    0x0000,  # .. 0x02BB ; None
+    0x0000,  # .. 0x00B6 ; None
+    0x00B7,  # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
+    0x00B8,  # .. 0x02BB ; None
     0x02BC,  # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
     0x02BD,  # .. 0x02C6 ; None
     0x02C7,  # .. 0x02C7 ; {'Bopo', 'Latn'}
@@ -316,7 +317,26 @@ RANGES = [
 ]
 VALUES = [
-    None,  # 0000..02BB
+    None,  # 0000..00B6
+    {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    },  # 00B7..00B7
+    None,  # 00B8..02BB
     {"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"},  # 02BC..02BC
     None,  # 02BD..02C6
     {"Bopo", "Latn"},  # 02C7..02C7

Lib/fontTools/unicodedata/Scripts.py

@@ -19,7 +19,6 @@
 # https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
 #
-
 RANGES = [
     0x0000,  # .. 0x0040 ; Common
     0x0041,  # .. 0x005A ; Latin

MetaTools/buildUCD.py

@@ -8,13 +8,12 @@ try:
     from urllib.request import urlopen
 except ImportError:
     from urllib2 import urlopen
-from contextlib import closing, contextmanager
 import re
-from codecs import iterdecode
 import logging
 import os
 from io import open
 from os.path import abspath, dirname, join as pjoin, pardir, sep
+from typing import List

 try:  # pragma: no cover
@@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF

 log = logging.getLogger()


-@contextmanager
-def open_unidata_file(filename):
+def open_unidata_file(filename) -> List[str]:
     """Open a text file from https://unicode.org/Public/UNIDATA/"""
     url = UNIDATA_URL + filename
-    with closing(urlopen(url)) as response:
-        yield iterdecode(response, encoding="utf-8")
+    return urlopen(url).read().decode("utf-8").splitlines()


-def parse_unidata_header(infile):
+def parse_unidata_header(file_lines: List[str]):
     """Read the top header of data files, until the first line
     that does not start with '#'.
     """
     header = []
-    line = next(infile)
-    while line.startswith("#"):
-        header.append(line)
-        line = next(infile)
-    return "".join(header)
+    for line in file_lines:
+        if line.startswith("#"):
+            header.append(line)
+        else:
+            break
+    return "\n".join(header)


-def parse_range_properties(infile, default=None, is_set=False):
+def parse_range_properties(infile: List[str], default=None, is_set=False):
     """Parse a Unicode data file containing a column with one character or
     a range of characters, and another column containing a property value
     separated by a semicolon. Comments after '#' are ignored.
@@ -180,14 +178,15 @@ def build_ranges(
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
+        file_lines = open_unidata_file(filename)

-    with cm as f:
-        header = parse_unidata_header(f)
-        ranges = parse_range_properties(f, default=default, is_set=is_set)
+    header = parse_unidata_header(file_lines)
+    ranges = parse_range_properties(file_lines, default=default, is_set=is_set)

     if aliases:
         reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
@@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
     filename = "PropertyValueAliases.txt"
     if local_ucd:
         log.info("loading '%s' from local directory '%s'", filename, local_ucd)
-        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        file_lines = [
+            l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
+        ]
     else:
         log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
-        cm = open_unidata_file(filename)
-    with cm as f:
-        header = parse_unidata_header(f)
-        data = parse_semicolon_separated_data(f)
+        file_lines = open_unidata_file(filename)
+    header = parse_unidata_header(file_lines)
+    data = parse_semicolon_separated_data(file_lines)

     aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}
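
The shape of the fix: both loading paths now materialize the file into a list of lines up front, so parsing the header consumes nothing and the data parser, run over the same list afterwards, still sees every record. Re-running the earlier sketch against the patched logic (same hypothetical data):

    def parse_unidata_header(file_lines):
        """The patched logic from the diff above, with type hints omitted."""
        header = []
        for line in file_lines:
            if line.startswith("#"):
                header.append(line)
            else:
                break
        return "\n".join(header)

    file_lines = [
        "# header line",  # hypothetical data again, not a real UCD file
        "00B7 ; Avst Cari Copt",
        "02BC ; Beng Cyrl Deva",
    ]

    assert parse_unidata_header(file_lines) == "# header line"
    assert file_lines[1] == "00B7 ; Avst Cari Copt"  # nothing was consumed

Holding a whole UNIDATA file in memory is a fair trade; the files are at most a few megabytes, and it removes the closing/contextmanager plumbing entirely.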

Tests/unicodedata/unicodedata_test.py

@@ -155,6 +155,33 @@ def test_script():
 def test_script_extension():
+    assert unicodedata.script_extension("\u00B7") == {
+        "Avst",
+        "Cari",
+        "Copt",
+        "Dupl",
+        "Elba",
+        "Geor",
+        "Glag",
+        "Gong",
+        "Goth",
+        "Grek",
+        "Hani",
+        "Latn",
+        "Lydi",
+        "Mahj",
+        "Perm",
+        "Shaw",
+    }
+    assert unicodedata.script_extension("\u02BC") == {
+        "Beng",
+        "Cyrl",
+        "Deva",
+        "Latn",
+        "Lisu",
+        "Thai",
+        "Toto",
+    }
     assert unicodedata.script_extension("a") == {"Latn"}
     assert unicodedata.script_extension(chr(0)) == {"Zyyy"}
     assert unicodedata.script_extension(chr(0x0378)) == {"Zzzz"}