Merge pull request #3756 from cmyr/unicode-data-fixup
[unicodedata] Fix bug in buildUCD.py
This commit is contained in:
commit
eece3c1b62
@ -16,7 +16,6 @@
|
||||
# Format:
|
||||
# Start Code..End Code; Block Name
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x007F ; Basic Latin
|
||||
0x0080, # .. 0x00FF ; Latin-1 Supplement
|
||||
|
@ -32,9 +32,10 @@
|
||||
#
|
||||
# @missing: 0000..10FFFF; <script>
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x02BB ; None
|
||||
0x0000, # .. 0x00B6 ; None
|
||||
0x00B7, # .. 0x00B7 ; {'Avst', 'Cari', 'Copt', 'Dupl', 'Elba', 'Geor', 'Glag', 'Gong', 'Goth', 'Grek', 'Hani', 'Latn', 'Lydi', 'Mahj', 'Perm', 'Shaw'}
|
||||
0x00B8, # .. 0x02BB ; None
|
||||
0x02BC, # .. 0x02BC ; {'Beng', 'Cyrl', 'Deva', 'Latn', 'Lisu', 'Thai', 'Toto'}
|
||||
0x02BD, # .. 0x02C6 ; None
|
||||
0x02C7, # .. 0x02C7 ; {'Bopo', 'Latn'}
|
||||
@ -316,7 +317,26 @@ RANGES = [
|
||||
]
|
||||
|
||||
VALUES = [
|
||||
None, # 0000..02BB
|
||||
None, # 0000..00B6
|
||||
{
|
||||
"Avst",
|
||||
"Cari",
|
||||
"Copt",
|
||||
"Dupl",
|
||||
"Elba",
|
||||
"Geor",
|
||||
"Glag",
|
||||
"Gong",
|
||||
"Goth",
|
||||
"Grek",
|
||||
"Hani",
|
||||
"Latn",
|
||||
"Lydi",
|
||||
"Mahj",
|
||||
"Perm",
|
||||
"Shaw",
|
||||
}, # 00B7..00B7
|
||||
None, # 00B8..02BB
|
||||
{"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}, # 02BC..02BC
|
||||
None, # 02BD..02C6
|
||||
{"Bopo", "Latn"}, # 02C7..02C7
|
||||
|
@ -19,7 +19,6 @@
|
||||
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
|
||||
#
|
||||
|
||||
|
||||
RANGES = [
|
||||
0x0000, # .. 0x0040 ; Common
|
||||
0x0041, # .. 0x005A ; Latin
|
||||
|
@ -8,13 +8,12 @@ try:
|
||||
from urllib.request import urlopen
|
||||
except ImportError:
|
||||
from urllib2 import urlopen
|
||||
from contextlib import closing, contextmanager
|
||||
import re
|
||||
from codecs import iterdecode
|
||||
import logging
|
||||
import os
|
||||
from io import open
|
||||
from os.path import abspath, dirname, join as pjoin, pardir, sep
|
||||
from typing import List
|
||||
|
||||
|
||||
try: # pragma: no cover
|
||||
@ -40,27 +39,26 @@ MAX_UNICODE = 0x10FFFF
|
||||
log = logging.getLogger()
|
||||
|
||||
|
||||
def open_unidata_file(filename) -> List[str]:
    """Open a text file from https://unicode.org/Public/UNIDATA/

    Downloads ``UNIDATA_URL + filename``, decodes the payload as UTF-8 and
    returns it as a list of lines with the line terminators removed.
    """
    url = UNIDATA_URL + filename
    # Use the response as a context manager so the connection is closed as
    # soon as the payload has been read, even if read() or decode() raises.
    with urlopen(url) as response:
        return response.read().decode("utf-8").splitlines()
|
||||
|
||||
|
||||
def parse_unidata_header(file_lines: List[str]):
    """Read the top header of data files, until the first line
    that does not start with '#'.

    ``file_lines`` is an iterable of text lines. The lines may or may not
    carry their trailing line terminator: lines read from a local file keep
    it, whereas downloaded lines produced by ``str.splitlines()`` do not.
    Terminators are stripped here so that both sources give the same result.

    Returns the header lines joined with a single newline, without a
    trailing newline; an empty string if the first line is not a comment.
    """
    header = []
    for line in file_lines:
        if not line.startswith("#"):
            break
        # Normalise away "\n" / "\r\n" so the join below never doubles them.
        header.append(line.rstrip("\r\n"))
    return "\n".join(header)
|
||||
|
||||
|
||||
def parse_range_properties(infile, default=None, is_set=False):
|
||||
def parse_range_properties(infile: List[str], default=None, is_set=False):
|
||||
"""Parse a Unicode data file containing a column with one character or
|
||||
a range of characters, and another column containing a property value
|
||||
separated by a semicolon. Comments after '#' are ignored.
|
||||
@ -180,14 +178,15 @@ def build_ranges(
|
||||
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
cm = open_unidata_file(filename)
|
||||
file_lines = open_unidata_file(filename)
|
||||
|
||||
with cm as f:
|
||||
header = parse_unidata_header(f)
|
||||
ranges = parse_range_properties(f, default=default, is_set=is_set)
|
||||
header = parse_unidata_header(file_lines)
|
||||
ranges = parse_range_properties(file_lines, default=default, is_set=is_set)
|
||||
|
||||
if aliases:
|
||||
reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
|
||||
@ -260,14 +259,14 @@ def parse_property_value_aliases(property_tag, local_ucd=None):
|
||||
filename = "PropertyValueAliases.txt"
|
||||
if local_ucd:
|
||||
log.info("loading '%s' from local directory '%s'", filename, local_ucd)
|
||||
cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
file_lines = [
|
||||
l for l in open(pjoin(local_ucd, filename), "r", encoding="utf-8")
|
||||
]
|
||||
else:
|
||||
log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
|
||||
cm = open_unidata_file(filename)
|
||||
|
||||
with cm as f:
|
||||
header = parse_unidata_header(f)
|
||||
data = parse_semicolon_separated_data(f)
|
||||
file_lines = open_unidata_file(filename)
|
||||
header = parse_unidata_header(file_lines)
|
||||
data = parse_semicolon_separated_data(file_lines)
|
||||
|
||||
aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}
|
||||
|
||||
|
@ -155,6 +155,33 @@ def test_script():
|
||||
|
||||
|
||||
def test_script_extension():
    """Check unicodedata.script_extension() against known expected sets."""
    # Code points expected to report several script extensions at once.
    expected_00b7 = {
        "Avst",
        "Cari",
        "Copt",
        "Dupl",
        "Elba",
        "Geor",
        "Glag",
        "Gong",
        "Goth",
        "Grek",
        "Hani",
        "Latn",
        "Lydi",
        "Mahj",
        "Perm",
        "Shaw",
    }
    expected_02bc = {"Beng", "Cyrl", "Deva", "Latn", "Lisu", "Thai", "Toto"}

    assert unicodedata.script_extension("\u00B7") == expected_00b7
    assert unicodedata.script_extension("\u02BC") == expected_02bc

    # Code points expected to report a single-element set.
    latin_letter = unicodedata.script_extension("a")
    assert latin_letter == {"Latn"}

    nul_char = unicodedata.script_extension(chr(0))
    assert nul_char == {"Zyyy"}

    cp_0378 = unicodedata.script_extension(chr(0x0378))
    assert cp_0378 == {"Zzzz"}
|
||||
|
Loading…
x
Reference in New Issue
Block a user