972 lines
29 KiB
Python
972 lines
29 KiB
Python
# -*- coding: utf-8 -*-
|
||
from fontTools.misc.py23 import *
|
||
from fontTools.misc import sstruct
|
||
from fontTools.misc.textTools import safeEval
|
||
from fontTools.misc.encodingTools import getEncoding
|
||
from fontTools.ttLib import newTable
|
||
from . import DefaultTable
|
||
import struct
|
||
import logging
|
||
|
||
|
||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# sstruct description of one name record: six fields, each declared as 'H',
# in big-endian byte order.
nameRecordFormat = """
> # big endian
platformID: H
platEncID: H
langID: H
nameID: H
length: H
offset: H
"""

# Size of one packed name record, as computed by sstruct from the format above.
nameRecordSize = sstruct.calcsize(nameRecordFormat)
|
||
|
||
|
||
class table__n_a_m_e(DefaultTable.DefaultTable):
    """The 'name' table, held as a list of NameRecord objects in ``self.names``.

    ``self.names`` is created by decompile(), fromXML(), compile() or any of
    the setName()/addName()/addMultilingualName() helpers.
    """

    dependencies = ["ltag"]

    def decompile(self, data, ttFont):
        """Parse the binary table in 'data' and populate ``self.names``.

        A wrong string-storage offset in the header is logged but the stored
        value is still used. Records that are truncated, or whose string range
        lies outside the string storage, are logged and skipped.
        """
        format, n, stringOffset = struct.unpack(b">HHH", data[:6])
        expectedStringOffset = 6 + n * nameRecordSize
        if stringOffset != expectedStringOffset:
            log.error(
                "'name' table stringOffset incorrect. Expected: %s; Actual: %s",
                expectedStringOffset, stringOffset)
        stringData = data[stringOffset:]
        data = data[6:]
        self.names = []
        for i in range(n):
            if len(data) < 12:
                log.error('skipping malformed name record #%d', i)
                continue
            name, data = sstruct.unpack2(nameRecordFormat, data, NameRecord())
            name.string = stringData[name.offset:name.offset+name.length]
            if name.offset + name.length > len(stringData):
                log.error('skipping malformed name record #%d', i)
                continue
            assert len(name.string) == name.length
            # offset/length only describe the binary layout; drop them so the
            # record carries just its IDs and its string.
            del name.offset, name.length
            self.names.append(name)

    def compile(self, ttFont):
        """Return the binary representation of the table.

        ``self.names`` is sorted in place (see NameRecord.__lt__()). Records
        whose encoded strings are identical share one entry in the string
        storage. As a side effect every record gets 'offset' and 'length'
        attributes.
        """
        if not hasattr(self, "names"):
            # only happens when there are NO name table entries read
            # from the TTX file
            self.names = []
        names = self.names
        names.sort()  # sort according to the spec; see NameRecord.__lt__()
        n = len(names)
        stringOffset = 6 + n * nameRecordSize
        # Header: format (always 0 here), record count, string storage offset.
        chunks = [struct.pack(b">HHH", 0, n, stringOffset)]
        strings = []
        stringDataSize = 0
        done = {}  # remember the data so we can reuse the "pointers"
        for name in names:
            string = name.toBytes()
            if string in done:
                name.offset, name.length = done[string]
            else:
                name.offset, name.length = done[string] = stringDataSize, len(string)
                strings.append(string)
                stringDataSize += len(string)
            chunks.append(sstruct.pack(nameRecordFormat, name))
        # Join once at the end instead of growing a bytes object per record.
        return bytesjoin(chunks + strings)

    def toXML(self, writer, ttFont):
        """Write every record in ``self.names`` to 'writer'."""
        for name in self.names:
            name.toXML(writer, ttFont)

    def fromXML(self, name, attrs, content, ttFont):
        """Append a NameRecord built from a <namerecord> element.

        Elements with any other tag name are ignored.
        """
        if name != "namerecord":
            return  # ignore unknown tags
        if not hasattr(self, "names"):
            self.names = []
        # Use a separate local so the tag name in 'name' is what gets passed on.
        record = NameRecord()
        self.names.append(record)
        record.fromXML(name, attrs, content, ttFont)

    def getName(self, nameID, platformID, platEncID, langID=None):
        """Return the first record matching the given IDs, or None.

        When 'langID' is None, any language matches.
        """
        for namerecord in self.names:
            if (namerecord.nameID == nameID and
                    namerecord.platformID == platformID and
                    namerecord.platEncID == platEncID):
                if langID is None or namerecord.langID == langID:
                    return namerecord
        return None  # not found

    def getDebugName(self, nameID):
        """Return a decoded string for 'nameID', preferring an English record.

        Records that cannot be decoded are skipped. The search stops at the
        first record with (platformID, langID) equal to (1, 0) or (3, 0x409);
        otherwise the last decodable string seen is used. Returns None when
        nothing usable (or only empty strings) was found.
        """
        englishName = someName = None
        for name in self.names:
            if name.nameID != nameID:
                continue
            try:
                unistr = name.toUnicode()
            except UnicodeDecodeError:
                continue

            someName = unistr
            if (name.platformID, name.langID) in ((1, 0), (3, 0x409)):
                englishName = unistr
                break
        return englishName or someName or None

    def setName(self, string, nameID, platformID, platEncID, langID):
        """ Set the 'string' for the name record identified by 'nameID', 'platformID',
        'platEncID' and 'langID'. If a record with that nameID doesn't exist, create it
        and append to the name table.

        'string' can be of type `str` (`unicode` in PY2) or `bytes`. In the latter case,
        it is assumed to be already encoded with the correct plaform-specific encoding
        identified by the (platformID, platEncID, langID) triplet. A warning is issued
        to prevent unexpected results.

        Raises TypeError if 'string' is neither unicode nor bytes.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, unicode):
            if isinstance(string, bytes):
                log.warning(
                    "name string is bytes, ensure it's correctly encoded: %r", string)
            else:
                raise TypeError(
                    "expected unicode or bytes, found %s: %r" % (
                        type(string).__name__, string))
        namerecord = self.getName(nameID, platformID, platEncID, langID)
        if namerecord:
            namerecord.string = string
        else:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))

    def removeNames(self, nameID=None, platformID=None, platEncID=None, langID=None):
        """Remove any name records identified by the given combination of 'nameID',
        'platformID', 'platEncID' and 'langID'.

        Arguments left as None are not used for matching; when all of them are
        None, nothing is removed.
        """
        args = {
            argName: argValue
            for argName, argValue in (
                ("nameID", nameID),
                ("platformID", platformID),
                ("platEncID", platEncID),
                ("langID", langID),
            )
            if argValue is not None
        }
        if not args:
            # no arguments, nothing to do
            return
        # Keep a record as soon as at least one requested attribute differs.
        self.names = [
            rec for rec in self.names
            if any(
                argValue != getattr(rec, argName)
                for argName, argValue in args.items()
            )
        ]

    def _findUnusedNameID(self, minNameID=256):
        """Finds an unused name id.

        The nameID is assigned in the range between 'minNameID' and 32767 (inclusive),
        following the last nameID in the name table.

        Raises ValueError when the resulting nameID would exceed 32767.
        """
        names = getattr(self, 'names', [])
        nameID = 1 + max([n.nameID for n in names] + [minNameID - 1])
        if nameID > 32767:
            raise ValueError("nameID must be less than 32768")
        return nameID

    def addMultilingualName(self, names, ttFont=None, nameID=None,
                            windows=True, mac=True):
        """Add a multilingual name, returning its name ID

        'names' is a dictionary with the name in multiple languages,
        such as {'en': 'Pale', 'de': 'Blaß', 'de-CH': 'Blass'}.
        The keys can be arbitrary IETF BCP 47 language codes;
        the values are Unicode strings.

        'ttFont' is the TTFont to which the names are added, or None.
        If present, the font's 'ltag' table can get populated
        to store exotic language codes, which allows encoding
        names that otherwise cannot get encoded at all.

        'nameID' is the name ID to be used, or None to let the library
        pick an unused name ID.

        If 'windows' is True, a platformID=3 name record will be added.
        If 'mac' is True, a platformID=1 name record will be added.
        A language for which no Windows record can be made gets a Mac
        record as a fallback, even when 'mac' is False.
        """
        if not hasattr(self, 'names'):
            self.names = []
        if nameID is None:
            nameID = self._findUnusedNameID()
        # TODO: Should minimize BCP 47 language codes.
        # https://github.com/fonttools/fonttools/issues/930
        for lang, name in sorted(names.items()):
            # Decide per language; the fallback for one language must not
            # switch Mac names on for all the languages that follow it.
            addMac = mac
            if windows:
                windowsName = _makeWindowsName(name, nameID, lang)
                if windowsName is not None:
                    self.names.append(windowsName)
                else:
                    # We cannot not make a Windows name: make sure we add a
                    # Mac name as a fallback. This can happen for exotic
                    # BCP47 language tags that have no Windows language code.
                    addMac = True
            if addMac:
                macName = _makeMacName(name, nameID, lang, ttFont)
                if macName is not None:
                    self.names.append(macName)
        return nameID

    def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255):
        """ Add a new name record containing 'string' for each (platformID, platEncID,
        langID) tuple specified in the 'platforms' list.

        The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive),
        following the last nameID in the name table.
        If no 'platforms' are specified, two English name records are added, one for the
        Macintosh (platformID=1), and one for the Windows platform (3).

        The 'string' must be a Unicode string, so it can be encoded with different,
        platform-specific encodings.

        Return the new nameID.
        """
        assert len(platforms) > 0, \
            "'platforms' must contain at least one (platformID, platEncID, langID) tuple"
        if not hasattr(self, 'names'):
            self.names = []
        if not isinstance(string, unicode):
            raise TypeError(
                "expected %s, found %s: %r" % (
                    unicode.__name__, type(string).__name__, string))
        nameID = self._findUnusedNameID(minNameID + 1)
        for platformID, platEncID, langID in platforms:
            self.names.append(makeName(string, nameID, platformID, platEncID, langID))
        return nameID
|
||
|
||
|
||
def makeName(string, nameID, platformID, platEncID, langID):
    """Return a new NameRecord carrying the given string and identifiers."""
    record = NameRecord()
    record.string = string
    record.nameID = nameID
    record.platformID = platformID
    record.platEncID = platEncID
    record.langID = langID
    return record
|
||
|
||
|
||
def _makeWindowsName(name, nameID, language):
    """Create a NameRecord for the Microsoft Windows platform

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. It is looked up
    case-insensitively in _WINDOWS_LANGUAGE_CODES; when it has no entry
    there, a warning is logged and the result is None.
    Future versions of fonttools might return a NameRecord for the
    OpenType 'name' table format 1, but this is not implemented yet.
    """
    windowsLangID = _WINDOWS_LANGUAGE_CODES.get(language.lower())
    if windowsLangID is None:
        log.warning("cannot add Windows name in language %s "
                    "because fonttools does not yet support "
                    "name table format 1" % language)
        return None
    # platformID 3 with platEncID 1 is what this module uses for Windows names.
    return makeName(name, nameID, 3, 1, windowsLangID)
|
||
|
||
|
||
def _makeMacName(name, nameID, language, font=None):
    """Create a NameRecord for Apple platforms

    'language' is an arbitrary IETF BCP 47 language identifier such
    as 'en', 'de-CH', 'de-AT-1901', or 'fa-Latn'. When possible, we
    create a Macintosh NameRecord that is understood by old applications
    (platform ID 1 and an old-style Macintosh language enum). If this
    is not possible, we create a Unicode NameRecord (platform ID 0)
    whose language points to the font’s 'ltag' table. The latter
    can encode any string in any language, but legacy applications
    might not recognize the format (in which case they will ignore
    those names).

    'font' should be the TTFont for which you want to create a name.
    If 'font' is None, we only return NameRecords for legacy Macintosh;
    in that case, the result will be None for names that need to
    be encoded with an 'ltag' table.

    See the section “The language identifier” in Apple’s specification:
    https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
    """
    macLang = _MAC_LANGUAGE_CODES.get(language.lower())
    macScript = _MAC_LANGUAGE_TO_SCRIPT.get(macLang)
    if not (macLang is None or macScript is None):
        encoding = getEncoding(1, macScript, macLang, default="ascii")
        # Check if we can actually encode this name. If we can't,
        # for example because we have no support for the legacy
        # encoding, or because the name string contains Unicode
        # characters that the legacy encoding cannot represent,
        # we fall back to encoding the name in Unicode and put
        # the language tag into the ltag table.
        try:
            tobytes(name, encoding, errors="strict")
        except UnicodeEncodeError:
            pass
        else:
            return makeName(name, nameID, 1, macScript, macLang)

    if font is None:
        log.warning("cannot store language %s into 'ltag' table "
                    "without having access to the TTFont object" %
                    language)
        return None

    ltag = font.tables.get("ltag")
    if ltag is None:
        ltag = font["ltag"] = newTable("ltag")
    # 0 = Unicode; 4 = “Unicode 2.0 or later semantics (non-BMP characters allowed)”
    # “The preferred platform-specific code for Unicode would be 3 or 4.”
    # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
    return makeName(name, nameID, 0, 4, ltag.addTag(language))
|
||
|
||
|
||
class NameRecord(object):
    """One entry of the 'name' table.

    Instances carry 'nameID', 'platformID', 'platEncID', 'langID' and
    'string' attributes; 'string' may be bytes or a Unicode string.
    """

    def getEncoding(self, default='ascii'):
        """Returns the Python encoding name for this name entry based on its platformID,
        platEncID, and langID.  If encoding for these values is not known, by default
        'ascii' is returned.  That can be overriden by passing a value to the default
        argument.
        """
        return getEncoding(self.platformID, self.platEncID, self.langID, default)

    def encodingIsUnicodeCompatible(self):
        """Return True if this record's encoding is one of utf_16_be, ucs2be,
        ascii or latin1; an unknown encoding counts as not compatible."""
        return self.getEncoding(None) in ['utf_16_be', 'ucs2be', 'ascii', 'latin1']

    def __str__(self):
        return self.toStr(errors='backslashreplace')

    def isUnicode(self):
        """Return True for platformID 0, or platformID 3 with platEncID 0, 1 or 10."""
        return (self.platformID == 0 or
                (self.platformID == 3 and self.platEncID in [0, 1, 10]))

    def toUnicode(self, errors='strict'):
        """
        If self.string is a Unicode string, return it; otherwise try decoding the
        bytes in self.string to a Unicode string using the encoding of this
        entry as returned by self.getEncoding(); Note that  self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        Certain heuristics are performed to recover data from bytes that are
        ill-formed in the chosen encoding, or that otherwise look misencoded
        (mostly around bad UTF-16BE encoded bytes, or bytes that look like UTF-16BE
        but marked otherwise).  If the bytes are ill-formed and the heuristics fail,
        the error is handled according to the errors parameter to this function, which is
        passed to the underlying decode() function; by default it throws a
        UnicodeDecodeError exception.

        Note: The mentioned heuristics mean that roundtripping a font to XML and back
        to binary might recover some misencoded data whereas just loading the font
        and saving it back will not change them.
        """
        def isascii(b):
            return (b >= 0x20 and b <= 0x7E) or b in [0x09, 0x0A, 0x0D]

        encoding = self.getEncoding()
        string = self.string

        # The byte-level repairs below only make sense for (and only work on)
        # bytes; a string that is already Unicode is left alone here.
        if (isinstance(string, bytes)
                and encoding == 'utf_16_be' and len(string) % 2 == 1):
            # Recover badly encoded UTF-16 strings that have an odd number of bytes:
            # - If the last byte is zero, drop it.  Otherwise,
            # - If all the odd bytes are zero and all the even bytes are ASCII,
            #   prepend one zero byte.  Otherwise,
            # - If first byte is zero and all other bytes are ASCII, insert zero
            #   bytes between consecutive ASCII bytes.
            #
            # (Yes, I've seen all of these in the wild... sigh)
            if byteord(string[-1]) == 0:
                string = string[:-1]
            elif all(byteord(b) == 0 if i % 2 else isascii(byteord(b))
                     for i, b in enumerate(string)):
                string = b'\0' + string
            elif byteord(string[0]) == 0 and all(isascii(byteord(b)) for b in string[1:]):
                string = bytesjoin(b'\0'+bytechr(byteord(b)) for b in string[1:])

        string = tounicode(string, encoding=encoding, errors=errors)

        # If decoded strings still looks like UTF-16BE, it suggests a double-encoding.
        # Fix it up.
        if all(ord(c) == 0 if i % 2 == 0 else isascii(ord(c))
               for i, c in enumerate(string)):
            # If string claims to be Mac encoding, but looks like UTF-16BE with ASCII text,
            # narrow it down.
            string = string[1::2]

        return string

    def toBytes(self, errors='strict'):
        """ If self.string is a bytes object, return it; otherwise try encoding
        the Unicode string in self.string to bytes using the encoding of this
        entry as returned by self.getEncoding(); Note that self.getEncoding()
        returns 'ascii' if the encoding is unknown to the library.

        If the Unicode string cannot be encoded to bytes in the chosen encoding,
        the error is handled according to the errors parameter to this function,
        which is passed to the underlying encode() function; by default it throws a
        UnicodeEncodeError exception.
        """
        return tobytes(self.string, encoding=self.getEncoding(), errors=errors)

    toStr = toUnicode

    def toXML(self, writer, ttFont):
        """Write this record as a <namerecord> element.

        The string is written decoded when toUnicode() succeeds, otherwise
        through writer.write8bit(). A 'unicode' attribute is added when
        decoding failed or the encoding is not Unicode-compatible, so that
        fromXML() knows how to encode the content again.
        """
        try:
            unistr = self.toUnicode()
        except UnicodeDecodeError:
            unistr = None
        attrs = [
            ("nameID", self.nameID),
            ("platformID", self.platformID),
            ("platEncID", self.platEncID),
            ("langID", hex(self.langID)),
        ]

        if unistr is None or not self.encodingIsUnicodeCompatible():
            attrs.append(("unicode", unistr is not None))

        writer.begintag("namerecord", attrs)
        writer.newline()
        if unistr is not None:
            writer.write(unistr)
        else:
            writer.write8bit(self.string)
        writer.newline()
        writer.endtag("namerecord")
        writer.newline()

    def fromXML(self, name, attrs, content, ttFont):
        """Populate this record from the attributes and text of a <namerecord>.

        'name' (the tag name) is not used. The resulting self.string is bytes.
        """
        self.nameID = safeEval(attrs["nameID"])
        self.platformID = safeEval(attrs["platformID"])
        self.platEncID = safeEval(attrs["platEncID"])
        self.langID = safeEval(attrs["langID"])
        s = strjoin(content).strip()
        encoding = self.getEncoding()
        if self.encodingIsUnicodeCompatible() or safeEval(attrs.get("unicode", "False")):
            self.string = s.encode(encoding)
        else:
            # This is the inverse of write8bit...
            self.string = s.encode("latin1")

    def __lt__(self, other):
        """Order records by (platformID, platEncID, langID, nameID, string).

        Implemented so that list.sort() sorts according to the spec. Missing
        ID attributes are treated as None. When two records have the same IDs
        but one string is bytes and the other Unicode, both are encoded with
        toBytes() before comparing, because the two types cannot be compared
        directly on Python 3. If that encoding fails, the error is logged and
        the records are treated as not ordered (False is returned).
        """
        if type(self) != type(other):
            return NotImplemented

        idAttrs = ("platformID", "platEncID", "langID", "nameID")
        selfIDs = tuple(getattr(self, attr, None) for attr in idAttrs)
        otherIDs = tuple(getattr(other, attr, None) for attr in idAttrs)
        if selfIDs != otherIDs:
            return selfIDs < otherIDs

        selfString = getattr(self, "string", None)
        otherString = getattr(other, "string", None)
        if (type(selfString) is type(otherString)
                or selfString is None or otherString is None):
            # Wrapping in tuples keeps tuple-comparison semantics: equal
            # values compare as "not less" without invoking '<' on them.
            return (selfString,) < (otherString,)

        # Mixed bytes/Unicode: compare the encoded forms.
        try:
            return self.toBytes() < other.toBytes()
        except UnicodeEncodeError as e:
            log.error("NameRecord sorting failed to encode: %s", e)
            return False

    def __repr__(self):
        return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % (
            self.nameID, self.platformID, self.langID)
|
||
|
||
|
||
# Windows language ID → IETF BCP-47 language tag
#
# While Microsoft indicates a region/country for all its language
# IDs, we follow Unicode practice by omitting “most likely subtags”
# as per Unicode CLDR. For example, English is simply “en” and not
# “en-Latn” because according to Unicode, the default script
# for English is Latin.
#
# Every tag except 'es' (see below) must appear only once, because this
# table is inverted to build _WINDOWS_LANGUAGE_CODES.
#
# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html
# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
_WINDOWS_LANGUAGES = {
    0x0436: 'af',
    0x041C: 'sq',
    0x0484: 'gsw',
    0x045E: 'am',
    0x1401: 'ar-DZ',
    0x3C01: 'ar-BH',
    0x0C01: 'ar',
    0x0801: 'ar-IQ',
    0x2C01: 'ar-JO',
    0x3401: 'ar-KW',
    0x3001: 'ar-LB',
    0x1001: 'ar-LY',
    0x1801: 'ary',
    0x2001: 'ar-OM',
    0x4001: 'ar-QA',
    0x0401: 'ar-SA',
    0x2801: 'ar-SY',
    0x1C01: 'aeb',
    0x3801: 'ar-AE',
    0x2401: 'ar-YE',
    0x042B: 'hy',
    0x044D: 'as',
    0x082C: 'az-Cyrl',
    0x042C: 'az',
    0x046D: 'ba',
    0x042D: 'eu',
    0x0423: 'be',
    0x0845: 'bn',
    0x0445: 'bn-IN',
    0x201A: 'bs-Cyrl',
    0x141A: 'bs',
    0x047E: 'br',
    0x0402: 'bg',
    0x0403: 'ca',
    0x0C04: 'zh-HK',
    0x1404: 'zh-MO',
    0x0804: 'zh',
    0x1004: 'zh-SG',
    0x0404: 'zh-TW',
    0x0483: 'co',
    0x041A: 'hr',
    0x101A: 'hr-BA',
    0x0405: 'cs',
    0x0406: 'da',
    0x048C: 'prs',
    0x0465: 'dv',
    0x0813: 'nl-BE',
    0x0413: 'nl',
    0x0C09: 'en-AU',
    0x2809: 'en-BZ',
    0x1009: 'en-CA',
    0x2409: 'en-029',
    0x4009: 'en-IN',
    0x1809: 'en-IE',
    0x2009: 'en-JM',
    0x4409: 'en-MY',
    0x1409: 'en-NZ',
    0x3409: 'en-PH',
    0x4809: 'en-SG',
    0x1C09: 'en-ZA',
    0x2C09: 'en-TT',
    0x0809: 'en-GB',
    0x0409: 'en',
    0x3009: 'en-ZW',
    0x0425: 'et',
    0x0438: 'fo',
    0x0464: 'fil',
    0x040B: 'fi',
    0x080C: 'fr-BE',
    0x0C0C: 'fr-CA',
    0x040C: 'fr',
    0x140C: 'fr-LU',
    0x180C: 'fr-MC',
    0x100C: 'fr-CH',
    0x0462: 'fy',
    0x0456: 'gl',
    0x0437: 'ka',
    0x0C07: 'de-AT',
    0x0407: 'de',
    0x1407: 'de-LI',
    0x1007: 'de-LU',
    0x0807: 'de-CH',
    0x0408: 'el',
    0x046F: 'kl',
    0x0447: 'gu',
    0x0468: 'ha',
    0x040D: 'he',
    0x0439: 'hi',
    0x040E: 'hu',
    0x040F: 'is',
    0x0470: 'ig',
    0x0421: 'id',
    0x045D: 'iu',
    0x085D: 'iu-Latn',
    0x083C: 'ga',
    0x0434: 'xh',
    0x0435: 'zu',
    0x0410: 'it',
    0x0810: 'it-CH',
    0x0411: 'ja',
    0x044B: 'kn',
    0x043F: 'kk',
    0x0453: 'km',
    0x0486: 'quc',
    0x0487: 'rw',
    0x0441: 'sw',
    0x0457: 'kok',
    0x0412: 'ko',
    0x0440: 'ky',
    0x0454: 'lo',
    0x0426: 'lv',
    0x0427: 'lt',
    0x082E: 'dsb',
    0x046E: 'lb',
    0x042F: 'mk',
    0x083E: 'ms-BN',
    0x043E: 'ms',
    0x044C: 'ml',
    0x043A: 'mt',
    0x0481: 'mi',
    0x047A: 'arn',
    0x044E: 'mr',
    0x047C: 'moh',
    0x0450: 'mn',
    0x0850: 'mn-CN',
    0x0461: 'ne',
    0x0414: 'nb',
    0x0814: 'nn',
    0x0482: 'oc',
    0x0448: 'or',
    0x0463: 'ps',
    0x0415: 'pl',
    0x0416: 'pt',
    0x0816: 'pt-PT',
    0x0446: 'pa',
    0x046B: 'qu-BO',
    0x086B: 'qu-EC',
    0x0C6B: 'qu',
    0x0418: 'ro',
    0x0417: 'rm',
    0x0419: 'ru',
    0x243B: 'smn',
    0x103B: 'smj-NO',
    0x143B: 'smj',
    0x0C3B: 'se-FI',
    0x043B: 'se',
    0x083B: 'se-SE',
    0x203B: 'sms',
    0x183B: 'sma-NO',
    # Southern Sami (Sweden). Skolt Sami ('sms') is 0x203B above.
    0x1C3B: 'sma',
    0x044F: 'sa',
    0x1C1A: 'sr-Cyrl-BA',
    0x0C1A: 'sr',
    0x181A: 'sr-Latn-BA',
    0x081A: 'sr-Latn',
    0x046C: 'nso',
    0x0432: 'tn',
    0x045B: 'si',
    0x041B: 'sk',
    0x0424: 'sl',
    0x2C0A: 'es-AR',
    0x400A: 'es-BO',
    0x340A: 'es-CL',
    0x240A: 'es-CO',
    0x140A: 'es-CR',
    0x1C0A: 'es-DO',
    0x300A: 'es-EC',
    0x440A: 'es-SV',
    0x100A: 'es-GT',
    0x480A: 'es-HN',
    0x080A: 'es-MX',
    0x4C0A: 'es-NI',
    0x180A: 'es-PA',
    0x3C0A: 'es-PY',
    0x280A: 'es-PE',
    0x500A: 'es-PR',

    # Microsoft has defined two different language codes for
    # “Spanish with modern sorting” and “Spanish with traditional
    # sorting”. This makes sense for collation APIs, and it would be
    # possible to express this in BCP 47 language tags via Unicode
    # extensions (eg., “es-u-co-trad” is “Spanish with traditional
    # sorting”). However, for storing names in fonts, this distinction
    # does not make sense, so we use “es” in both cases.
    0x0C0A: 'es',
    0x040A: 'es',

    0x540A: 'es-US',
    0x380A: 'es-UY',
    0x200A: 'es-VE',
    0x081D: 'sv-FI',
    0x041D: 'sv',
    0x045A: 'syr',
    0x0428: 'tg',
    0x085F: 'tzm',
    0x0449: 'ta',
    0x0444: 'tt',
    0x044A: 'te',
    0x041E: 'th',
    0x0451: 'bo',
    0x041F: 'tr',
    0x0442: 'tk',
    0x0480: 'ug',
    0x0422: 'uk',
    0x042E: 'hsb',
    0x0420: 'ur',
    0x0843: 'uz-Cyrl',
    0x0443: 'uz',
    0x042A: 'vi',
    0x0452: 'cy',
    0x0488: 'wo',
    0x0485: 'sah',
    0x0478: 'ii',
    0x046A: 'yo',
}
|
||
|
||
|
||
# Macintosh language ID → IETF BCP-47 language tag
#
# This table is inverted to build _MAC_LANGUAGE_CODES, so a tag that is
# listed twice resolves to whichever entry comes last.
_MAC_LANGUAGES = {
    0: 'en',
    1: 'fr',
    2: 'de',
    3: 'it',
    4: 'nl',
    5: 'sv',
    6: 'es',
    7: 'da',
    8: 'pt',
    9: 'no',
    10: 'he',
    11: 'ja',
    12: 'ar',
    13: 'fi',
    14: 'el',
    15: 'is',
    16: 'mt',
    17: 'tr',
    18: 'hr',
    19: 'zh-Hant',
    20: 'ur',
    21: 'hi',
    22: 'th',
    23: 'ko',
    24: 'lt',
    25: 'pl',
    26: 'hu',
    # Estonian (langEstonian in _MAC_LANGUAGE_TO_SCRIPT). This must not be
    # 'es', or Spanish would resolve to this ID in the inverted table.
    27: 'et',
    28: 'lv',
    29: 'se',
    30: 'fo',
    31: 'fa',
    32: 'ru',
    33: 'zh',
    34: 'nl-BE',
    35: 'ga',
    36: 'sq',
    37: 'ro',
    # Czech; 'cs' is the tag _WINDOWS_LANGUAGES uses for it as well.
    38: 'cs',
    39: 'sk',
    40: 'sl',
    41: 'yi',
    42: 'sr',
    43: 'mk',
    44: 'bg',
    45: 'uk',
    46: 'be',
    47: 'uz',
    48: 'kk',
    49: 'az-Cyrl',
    50: 'az-Arab',
    51: 'hy',
    52: 'ka',
    53: 'mo',
    54: 'ky',
    55: 'tg',
    56: 'tk',
    57: 'mn-CN',
    58: 'mn',
    59: 'ps',
    60: 'ks',
    61: 'ku',
    62: 'sd',
    63: 'bo',
    64: 'ne',
    65: 'sa',
    66: 'mr',
    67: 'bn',
    68: 'as',
    69: 'gu',
    70: 'pa',
    71: 'or',
    72: 'ml',
    73: 'kn',
    74: 'ta',
    75: 'te',
    76: 'si',
    77: 'my',
    78: 'km',
    79: 'lo',
    80: 'vi',
    81: 'id',
    82: 'tl',
    83: 'ms',
    84: 'ms-Arab',
    85: 'am',
    86: 'ti',
    87: 'om',
    88: 'so',
    89: 'sw',
    90: 'rw',
    91: 'rn',
    92: 'ny',
    93: 'mg',
    94: 'eo',
    128: 'cy',
    129: 'eu',
    130: 'ca',
    131: 'la',
    132: 'qu',
    133: 'gn',
    134: 'ay',
    135: 'tt',
    136: 'ug',
    137: 'dz',
    138: 'jv',
    139: 'su',
    140: 'gl',
    141: 'af',
    142: 'br',
    143: 'iu',
    144: 'gd',
    145: 'gv',
    # NOTE(review): 'ga' is also listed for 35, so the inverted table
    # resolves 'ga' to 146 — confirm this is intended.
    146: 'ga',
    147: 'to',
    148: 'el-polyton',
    149: 'kl',
    150: 'az',
    151: 'nn',
}
|
||
|
||
|
||
# Inverted lookups: lower-cased language tag → platform language ID.
# Where a tag is listed more than once, the entry iterated last wins.
_WINDOWS_LANGUAGE_CODES = {
    tag.lower(): langID
    for langID, tag in _WINDOWS_LANGUAGES.items()
}
_MAC_LANGUAGE_CODES = {
    tag.lower(): langID
    for langID, tag in _MAC_LANGUAGES.items()
}
|
||
|
||
|
||
# MacOS language ID → MacOS script ID
#
# Note that the script ID is not sufficient to determine what encoding
# to use in TrueType files. For some languages, MacOS used a modification
# of a mainstream script. For example, an Icelandic name would be stored
# with smRoman in the TrueType naming table, but the actual encoding
# is a special Icelandic version of the normal Macintosh Roman encoding.
# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal
# Syllables but MacOS had run out of available script codes, so this was
# done as a (pretty radical) “modification” of Ethiopic.
#
# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt
_MAC_LANGUAGE_TO_SCRIPT = {
    0: 0,  # langEnglish → smRoman
    1: 0,  # langFrench → smRoman
    2: 0,  # langGerman → smRoman
    3: 0,  # langItalian → smRoman
    4: 0,  # langDutch → smRoman
    5: 0,  # langSwedish → smRoman
    6: 0,  # langSpanish → smRoman
    7: 0,  # langDanish → smRoman
    8: 0,  # langPortuguese → smRoman
    9: 0,  # langNorwegian → smRoman
    10: 5,  # langHebrew → smHebrew
    11: 1,  # langJapanese → smJapanese
    12: 4,  # langArabic → smArabic
    13: 0,  # langFinnish → smRoman
    14: 6,  # langGreek → smGreek
    15: 0,  # langIcelandic → smRoman (modified)
    16: 0,  # langMaltese → smRoman
    17: 0,  # langTurkish → smRoman (modified)
    18: 0,  # langCroatian → smRoman (modified)
    19: 2,  # langTradChinese → smTradChinese
    20: 4,  # langUrdu → smArabic
    21: 9,  # langHindi → smDevanagari
    22: 21,  # langThai → smThai
    23: 3,  # langKorean → smKorean
    24: 29,  # langLithuanian → smCentralEuroRoman
    25: 29,  # langPolish → smCentralEuroRoman
    26: 29,  # langHungarian → smCentralEuroRoman
    27: 29,  # langEstonian → smCentralEuroRoman
    28: 29,  # langLatvian → smCentralEuroRoman
    29: 0,  # langSami → smRoman
    30: 0,  # langFaroese → smRoman (modified)
    31: 4,  # langFarsi → smArabic (modified)
    32: 7,  # langRussian → smCyrillic
    33: 25,  # langSimpChinese → smSimpChinese
    34: 0,  # langFlemish → smRoman
    35: 0,  # langIrishGaelic → smRoman (modified)
    36: 0,  # langAlbanian → smRoman
    37: 0,  # langRomanian → smRoman (modified)
    38: 29,  # langCzech → smCentralEuroRoman
    39: 29,  # langSlovak → smCentralEuroRoman
    40: 0,  # langSlovenian → smRoman (modified)
    41: 5,  # langYiddish → smHebrew
    42: 7,  # langSerbian → smCyrillic
    43: 7,  # langMacedonian → smCyrillic
    44: 7,  # langBulgarian → smCyrillic
    45: 7,  # langUkrainian → smCyrillic (modified)
    46: 7,  # langByelorussian → smCyrillic
    47: 7,  # langUzbek → smCyrillic
    48: 7,  # langKazakh → smCyrillic
    49: 7,  # langAzerbaijani → smCyrillic
    50: 4,  # langAzerbaijanAr → smArabic
    51: 24,  # langArmenian → smArmenian
    52: 23,  # langGeorgian → smGeorgian
    53: 7,  # langMoldavian → smCyrillic
    54: 7,  # langKirghiz → smCyrillic
    55: 7,  # langTajiki → smCyrillic
    56: 7,  # langTurkmen → smCyrillic
    57: 27,  # langMongolian → smMongolian
    58: 7,  # langMongolianCyr → smCyrillic
    59: 4,  # langPashto → smArabic
    60: 4,  # langKashmiri → smArabic ('ks' in _MAC_LANGUAGES)
    61: 4,  # langKurdish → smArabic ('ku' in _MAC_LANGUAGES)
    62: 4,  # langSindhi → smArabic
    63: 26,  # langTibetan → smTibetan
    64: 9,  # langNepali → smDevanagari
    65: 9,  # langSanskrit → smDevanagari
    66: 9,  # langMarathi → smDevanagari
    67: 13,  # langBengali → smBengali
    68: 13,  # langAssamese → smBengali
    69: 11,  # langGujarati → smGujarati
    70: 10,  # langPunjabi → smGurmukhi
    71: 12,  # langOriya → smOriya
    72: 17,  # langMalayalam → smMalayalam
    73: 16,  # langKannada → smKannada
    74: 14,  # langTamil → smTamil
    75: 15,  # langTelugu → smTelugu
    76: 18,  # langSinhalese → smSinhalese
    77: 19,  # langBurmese → smBurmese
    78: 20,  # langKhmer → smKhmer
    79: 22,  # langLao → smLao
    80: 30,  # langVietnamese → smVietnamese
    81: 0,  # langIndonesian → smRoman
    82: 0,  # langTagalog → smRoman
    83: 0,  # langMalayRoman → smRoman
    84: 4,  # langMalayArabic → smArabic
    85: 28,  # langAmharic → smEthiopic
    86: 28,  # langTigrinya → smEthiopic
    87: 28,  # langOromo → smEthiopic
    88: 0,  # langSomali → smRoman
    89: 0,  # langSwahili → smRoman
    90: 0,  # langKinyarwanda → smRoman
    91: 0,  # langRundi → smRoman
    92: 0,  # langNyanja → smRoman
    93: 0,  # langMalagasy → smRoman
    94: 0,  # langEsperanto → smRoman
    128: 0,  # langWelsh → smRoman (modified)
    129: 0,  # langBasque → smRoman
    130: 0,  # langCatalan → smRoman
    131: 0,  # langLatin → smRoman
    132: 0,  # langQuechua → smRoman
    133: 0,  # langGuarani → smRoman
    134: 0,  # langAymara → smRoman
    135: 7,  # langTatar → smCyrillic
    136: 4,  # langUighur → smArabic
    137: 26,  # langDzongkha → smTibetan
    138: 0,  # langJavaneseRom → smRoman
    139: 0,  # langSundaneseRom → smRoman
    140: 0,  # langGalician → smRoman
    141: 0,  # langAfrikaans → smRoman
    142: 0,  # langBreton → smRoman (modified)
    143: 28,  # langInuktitut → smEthiopic (modified)
    144: 0,  # langScottishGaelic → smRoman (modified)
    145: 0,  # langManxGaelic → smRoman (modified)
    146: 0,  # langIrishGaelicScript → smRoman (modified)
    147: 0,  # langTongan → smRoman
    148: 6,  # langGreekAncient → smGreek
    149: 0,  # langGreenlandic → smRoman
    150: 0,  # langAzerbaijanRoman → smRoman
    151: 0,  # langNynorsk → smRoman
}
|