[name] Add helper for building multi-lingual names

Fixes https://github.com/fonttools/fonttools/issues/921.
This commit is contained in:
Sascha Brawer 2017-04-18 19:18:46 +02:00
parent 068ca5afbb
commit e8530360bd
4 changed files with 571 additions and 5 deletions

View File

@ -43,6 +43,12 @@ class FakeFont:
def __getitem__(self, tag):
    """Return the table registered under 'tag' (KeyError if there is none)."""
    table = self.tables[tag]
    return table
def __setitem__(self, tag, table):
    """Register 'table' under 'tag', replacing any table already stored there."""
    self.tables.update({tag: table})
def get(self, tag, default=None):
    """Return the table registered under 'tag', or 'default' if there is none."""
    lookup = self.tables.get
    return lookup(tag, default)
def getGlyphID(self, name):
    """Return the position of glyph 'name' in the glyph order.

    Raises ValueError (from the underlying index() call) when the glyph
    name is not in the glyph order.
    """
    glyphOrder = self.glyphOrder_
    return glyphOrder.index(name)

View File

@ -1,8 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, division, absolute_import
from __future__ import unicode_literals
from fontTools.misc.py23 import *
from fontTools.misc import sstruct
from fontTools.misc.textTools import safeEval
from fontTools.misc.encodingTools import getEncoding
from fontTools.ttLib import newTable
from . import DefaultTable
import struct
import logging
@ -24,6 +27,7 @@ nameRecordSize = sstruct.calcsize(nameRecordFormat)
class table__n_a_m_e(DefaultTable.DefaultTable):
dependencies = ["ltag"]
def decompile(self, data, ttFont):
format, n, stringOffset = struct.unpack(">HHH", data[:6])
@ -145,13 +149,49 @@ class table__n_a_m_e(DefaultTable.DefaultTable):
else:
self.names.append(makeName(string, nameID, platformID, platEncID, langID))
def _findUnusedNameID(self, minNameID=256):
    """Return a name ID that no record in this table uses yet.

    The result is one more than the highest nameID currently present,
    but never lower than 'minNameID'. A table that has no 'names'
    attribute is treated as empty.

    Raises ValueError if the resulting ID would exceed 32767.
    """
    highest = minNameID - 1
    for record in getattr(self, 'names', []):
        if record.nameID > highest:
            highest = record.nameID
    candidate = highest + 1
    if candidate > 32767:
        raise ValueError("nameID must be less than 32768")
    return candidate
def addMultilingualName(self, names, ttFont, nameID=None):
    """Add one name in several languages under a single name ID.

    'names' maps BCP 47 language tags to the name string in that
    language. Languages are processed in sorted tag order, and tags are
    matched case-insensitively. For each language this appends:

    - a Macintosh record (platformID=1) when the tag has a Macintosh
      language code; otherwise a Unicode record (platformID=0,
      platEncID=4) whose language ID is the index returned by the
      'ltag' table's addTag(). The 'ltag' table is created on 'ttFont'
      if the font does not have one yet.
    - a Windows record (platformID=3, platEncID=1) when the tag has a
      Windows language code; otherwise a warning is logged and no
      Windows record is added for that language.

    If 'nameID' is None, an unused ID is obtained from
    _findUnusedNameID(). Returns the name ID shared by all added records.
    """
    if not hasattr(self, 'names'):
        self.names = []
    if nameID is None:
        nameID = self._findUnusedNameID()
    for lang, name in sorted(names.items()):
        langKey = lang.lower()
        # Add a Macintosh name. See section “The language identifier” in
        # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
        macLang = _MAC_LANGUAGE_CODES.get(langKey)
        if macLang is not None:
            macScript = _MAC_LANGUAGE_TO_SCRIPT[macLang]
            self.names.append(makeName(name, nameID, 1, macScript, macLang))
        else:
            # Go through the font's get() rather than its 'tables' dict so
            # that a table the font has not loaded yet is still found and
            # not replaced by a fresh, empty one.
            ltag = ttFont.get("ltag")
            if ltag is None:
                ltag = ttFont["ltag"] = newTable("ltag")
            # Indices into 'ltag' are only meaningful as language IDs on
            # the Unicode platform (0); encoding 4 is "Unicode 2.0 or
            # later semantics (non-BMP characters allowed)".
            self.names.append(makeName(name, nameID, 0, 4, ltag.addTag(lang)))
        # Add a Windows name.
        windowsLang = _WINDOWS_LANGUAGE_CODES.get(langKey)
        if windowsLang is not None:
            self.names.append(makeName(name, nameID, 3, 1, windowsLang))
        else:
            log.warning(
                "cannot add name in language %s because fonttools does not "
                "yet support name table format 1", lang)
    return nameID
def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255):
""" Add a new name record containing 'string' for each (platformID, platEncID,
langID) tuple specified in the 'platforms' list.
The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive),
following the last nameID in the name table.
If no 'platforms' are specified, two English name records are added, one for the
Macintosh (platformID=1), and one for the Windows platform (3).
@ -168,9 +208,7 @@ class table__n_a_m_e(DefaultTable.DefaultTable):
raise TypeError(
"expected %s, found %s: %r" % (
unicode.__name__, type(string).__name__,string ))
nameID = 1 + max([n.nameID for n in self.names] + [minNameID])
if nameID > 32767:
raise ValueError("nameID must be less than 32768")
nameID = self._findUnusedNameID(minNameID + 1)
for platformID, platEncID, langID in platforms:
self.names.append(makeName(string, nameID, platformID, platEncID, langID))
return nameID
@ -337,3 +375,493 @@ class NameRecord(object):
def __repr__(self):
    """Return a short debug string with the record's name, platform and language IDs."""
    template = "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>"
    ids = (self.nameID, self.platformID, self.langID)
    return template % ids
# Windows language ID → IETF BCP-47 language tag
#
# While Microsoft indicates a region/country for all its language
# IDs, we follow Unicode practice by omitting “most likely subtags”
# as per Unicode CLDR. For example, English is simply “en” and not
# “en-Latn” because according to Unicode, the default script
# for English is Latin.
#
# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html
# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
# Keys are Windows language IDs (LCIDs) as stored in platformID=3 name
# records; values are the corresponding BCP 47 language tags.
_WINDOWS_LANGUAGES = {
    0x0436: 'af',
    0x041C: 'sq',
    0x0484: 'gsw',
    0x045E: 'am',
    0x1401: 'ar-DZ',
    0x3C01: 'ar-BH',
    0x0C01: 'ar',
    0x0801: 'ar-IQ',
    0x2C01: 'ar-JO',
    0x3401: 'ar-KW',
    0x3001: 'ar-LB',
    0x1001: 'ar-LY',
    0x1801: 'ary',
    0x2001: 'ar-OM',
    0x4001: 'ar-QA',
    0x0401: 'ar-SA',
    0x2801: 'ar-SY',
    0x1C01: 'aeb',
    0x3801: 'ar-AE',
    0x2401: 'ar-YE',
    0x042B: 'hy',
    0x044D: 'as',
    0x082C: 'az-Cyrl',
    0x042C: 'az',
    0x046D: 'ba',
    0x042D: 'eu',
    0x0423: 'be',
    0x0845: 'bn',
    0x0445: 'bn-IN',
    0x201A: 'bs-Cyrl',
    0x141A: 'bs',
    0x047E: 'br',
    0x0402: 'bg',
    0x0403: 'ca',
    0x0C04: 'zh-HK',
    0x1404: 'zh-MO',
    0x0804: 'zh',
    0x1004: 'zh-SG',
    0x0404: 'zh-TW',
    0x0483: 'co',
    0x041A: 'hr',
    0x101A: 'hr-BA',
    0x0405: 'cs',
    0x0406: 'da',
    0x048C: 'prs',
    0x0465: 'dv',
    0x0813: 'nl-BE',
    0x0413: 'nl',
    0x0C09: 'en-AU',
    0x2809: 'en-BZ',
    0x1009: 'en-CA',
    0x2409: 'en-029',
    0x4009: 'en-IN',
    0x1809: 'en-IE',
    0x2009: 'en-JM',
    0x4409: 'en-MY',
    0x1409: 'en-NZ',
    0x3409: 'en-PH',
    0x4809: 'en-SG',
    0x1C09: 'en-ZA',
    0x2C09: 'en-TT',
    0x0809: 'en-GB',
    0x0409: 'en',
    0x3009: 'en-ZW',
    0x0425: 'et',
    0x0438: 'fo',
    0x0464: 'fil',
    0x040B: 'fi',
    0x080C: 'fr-BE',
    0x0C0C: 'fr-CA',
    0x040C: 'fr',
    0x140C: 'fr-LU',
    0x180C: 'fr-MC',
    0x100C: 'fr-CH',
    0x0462: 'fy',
    0x0456: 'gl',
    0x0437: 'ka',
    0x0C07: 'de-AT',
    0x0407: 'de',
    0x1407: 'de-LI',
    0x1007: 'de-LU',
    0x0807: 'de-CH',
    0x0408: 'el',
    0x046F: 'kl',
    0x0447: 'gu',
    0x0468: 'ha',
    0x040D: 'he',
    0x0439: 'hi',
    0x040E: 'hu',
    0x040F: 'is',
    0x0470: 'ig',
    0x0421: 'id',
    0x045D: 'iu',
    0x085D: 'iu-Latn',
    0x083C: 'ga',
    0x0434: 'xh',
    0x0435: 'zu',
    0x0410: 'it',
    0x0810: 'it-CH',
    0x0411: 'ja',
    0x044B: 'kn',
    0x043F: 'kk',
    0x0453: 'km',
    0x0486: 'quc',
    0x0487: 'rw',
    0x0441: 'sw',
    0x0457: 'kok',
    0x0412: 'ko',
    0x0440: 'ky',
    0x0454: 'lo',
    0x0426: 'lv',
    0x0427: 'lt',
    0x082E: 'dsb',
    0x046E: 'lb',
    0x042F: 'mk',
    0x083E: 'ms-BN',
    0x043E: 'ms',
    0x044C: 'ml',
    0x043A: 'mt',
    0x0481: 'mi',
    0x047A: 'arn',
    0x044E: 'mr',
    0x047C: 'moh',
    0x0450: 'mn',
    0x0850: 'mn-CN',
    0x0461: 'ne',
    0x0414: 'nb',
    0x0814: 'nn',
    0x0482: 'oc',
    0x0448: 'or',
    0x0463: 'ps',
    0x0415: 'pl',
    0x0416: 'pt',
    0x0816: 'pt-PT',
    0x0446: 'pa',
    0x046B: 'qu-BO',
    0x086B: 'qu-EC',
    0x0C6B: 'qu',
    0x0418: 'ro',
    0x0417: 'rm',
    0x0419: 'ru',
    0x243B: 'smn',
    0x103B: 'smj-NO',
    0x143B: 'smj',
    0x0C3B: 'se-FI',
    0x043B: 'se',
    0x083B: 'se-SE',
    0x203B: 'sms',
    0x183B: 'sma-NO',
    # Southern Sami as used in Sweden; Skolt Sami ('sms') is 0x203B.
    0x1C3B: 'sma',
    0x044F: 'sa',
    0x1C1A: 'sr-Cyrl-BA',
    0x0C1A: 'sr',
    0x181A: 'sr-Latn-BA',
    0x081A: 'sr-Latn',
    0x046C: 'nso',
    0x0432: 'tn',
    0x045B: 'si',
    0x041B: 'sk',
    0x0424: 'sl',
    0x2C0A: 'es-AR',
    0x400A: 'es-BO',
    0x340A: 'es-CL',
    0x240A: 'es-CO',
    0x140A: 'es-CR',
    0x1C0A: 'es-DO',
    0x300A: 'es-EC',
    0x440A: 'es-SV',
    0x100A: 'es-GT',
    0x480A: 'es-HN',
    0x080A: 'es-MX',
    0x4C0A: 'es-NI',
    0x180A: 'es-PA',
    0x3C0A: 'es-PY',
    0x280A: 'es-PE',
    0x500A: 'es-PR',

    # Microsoft has defined two different language codes for
    # “Spanish with modern sorting” and “Spanish with traditional
    # sorting”. This makes sense for collation APIs, and it would be
    # possible to express this in BCP 47 language tags via Unicode
    # extensions (e.g., “es-u-co-trad” is “Spanish with traditional
    # sorting”). However, for storing names in fonts, this distinction
    # does not make sense, so we use “es” in both cases.
    0x0C0A: 'es',
    0x040A: 'es',

    0x540A: 'es-US',
    0x380A: 'es-UY',
    0x200A: 'es-VE',
    0x081D: 'sv-FI',
    0x041D: 'sv',
    0x045A: 'syr',
    0x0428: 'tg',
    0x085F: 'tzm',
    0x0449: 'ta',
    0x0444: 'tt',
    0x044A: 'te',
    0x041E: 'th',
    0x0451: 'bo',
    0x041F: 'tr',
    0x0442: 'tk',
    0x0480: 'ug',
    0x0422: 'uk',
    0x042E: 'hsb',
    0x0420: 'ur',
    0x0843: 'uz-Cyrl',
    0x0443: 'uz',
    0x042A: 'vi',
    0x0452: 'cy',
    0x0488: 'wo',
    0x0485: 'sah',
    0x0478: 'ii',
    0x046A: 'yo',
}
# MacOS language ID → IETF BCP-47 language tag
#
# Keys are the language IDs stored in platformID=1 name records. Every
# tag appears exactly once, so the table can be inverted without
# ambiguity.
_MAC_LANGUAGES = {
    0: 'en',
    1: 'fr',
    2: 'de',
    3: 'it',
    4: 'nl',
    5: 'sv',
    6: 'es',
    7: 'da',
    8: 'pt',
    9: 'no',
    10: 'he',
    11: 'ja',
    12: 'ar',
    13: 'fi',
    14: 'el',
    15: 'is',
    16: 'mt',
    17: 'tr',
    18: 'hr',
    19: 'zh-Hant',
    20: 'ur',
    21: 'hi',
    22: 'th',
    23: 'ko',
    24: 'lt',
    25: 'pl',
    26: 'hu',
    27: 'et',  # langEstonian
    28: 'lv',
    29: 'se',
    30: 'fo',
    31: 'fa',
    32: 'ru',
    33: 'zh',
    34: 'nl-BE',
    35: 'ga',
    36: 'sq',
    37: 'ro',
    38: 'cs',  # langCzech
    39: 'sk',
    40: 'sl',  # langSlovenian; Sinhalese ('si') is 76
    41: 'yi',
    42: 'sr',
    43: 'mk',
    44: 'bg',
    45: 'uk',
    46: 'be',
    47: 'uz',
    48: 'kk',
    49: 'az-Cyrl',
    50: 'az-Arab',
    51: 'hy',
    52: 'ka',
    53: 'mo',
    54: 'ky',
    55: 'tg',
    56: 'tk',
    57: 'mn-CN',
    58: 'mn',
    59: 'ps',
    60: 'ku',  # langKurdish
    61: 'ks',  # langKashmiri
    62: 'sd',
    63: 'bo',
    64: 'ne',
    65: 'sa',
    66: 'mr',
    67: 'bn',
    68: 'as',
    69: 'gu',
    70: 'pa',
    71: 'or',
    72: 'ml',
    73: 'kn',
    74: 'ta',
    75: 'te',
    76: 'si',
    77: 'my',
    78: 'km',
    79: 'lo',
    80: 'vi',
    81: 'id',
    82: 'tl',
    83: 'ms',
    84: 'ms-Arab',
    85: 'am',
    86: 'ti',
    87: 'om',
    88: 'so',
    89: 'sw',
    90: 'rw',
    91: 'rn',
    92: 'ny',
    93: 'mg',
    94: 'eo',
    128: 'cy',
    129: 'eu',
    130: 'ca',
    131: 'la',
    132: 'qu',
    133: 'gn',
    134: 'ay',
    135: 'tt',
    136: 'ug',
    137: 'dz',
    138: 'jv',
    139: 'su',
    140: 'gl',
    141: 'af',
    142: 'br',
    143: 'iu',
    144: 'gd',
    145: 'gv',
    # langIrishGaelicScript: Irish written in Gaelic script (dot above).
    # Tagged with the 'Latg' script subtag so that plain 'ga' refers
    # only to langIrishGaelic (35).
    146: 'ga-Latg',
    147: 'to',
    148: 'el-polyton',
    149: 'kl',
    150: 'az',
    151: 'nn',
}
# Reverse lookup tables: lower-cased BCP 47 language tag → platform
# language ID. When several IDs share one tag, the entry that items()
# yields last is the one that is kept.
_WINDOWS_LANGUAGE_CODES = dict(
    (tag.lower(), langID) for langID, tag in _WINDOWS_LANGUAGES.items())
_MAC_LANGUAGE_CODES = dict(
    (tag.lower(), langID) for langID, tag in _MAC_LANGUAGES.items())
# MacOS language ID → MacOS script ID
#
# Note that the script ID is not sufficient to determine what encoding
# to use in TrueType files. For some languages, MacOS used a modification
# of a mainstream script. For example, an Icelandic name would be stored
# with smRoman in the TrueType naming table, but the actual encoding
# is a special Icelandic version of the normal Macintosh Roman encoding.
# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal
# Syllabics but MacOS had run out of available script codes, so this was
# done as a (pretty radical) “modification” of Ethiopic.
#
# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt
# Keys are MacOS language IDs, values are MacOS script IDs; the trailing
# comment on each entry names both constants. "(modified)" marks languages
# whose actual encoding is a variant of the named script's encoding.
_MAC_LANGUAGE_TO_SCRIPT = {
0: 0, # langEnglish → smRoman
1: 0, # langFrench → smRoman
2: 0, # langGerman → smRoman
3: 0, # langItalian → smRoman
4: 0, # langDutch → smRoman
5: 0, # langSwedish → smRoman
6: 0, # langSpanish → smRoman
7: 0, # langDanish → smRoman
8: 0, # langPortuguese → smRoman
9: 0, # langNorwegian → smRoman
10: 5, # langHebrew → smHebrew
11: 1, # langJapanese → smJapanese
12: 4, # langArabic → smArabic
13: 0, # langFinnish → smRoman
14: 6, # langGreek → smGreek
15: 0, # langIcelandic → smRoman (modified)
16: 0, # langMaltese → smRoman
17: 0, # langTurkish → smRoman (modified)
18: 0, # langCroatian → smRoman (modified)
19: 2, # langTradChinese → smTradChinese
20: 4, # langUrdu → smArabic
21: 9, # langHindi → smDevanagari
22: 21, # langThai → smThai
23: 3, # langKorean → smKorean
24: 29, # langLithuanian → smCentralEuroRoman
25: 29, # langPolish → smCentralEuroRoman
26: 29, # langHungarian → smCentralEuroRoman
27: 29, # langEstonian → smCentralEuroRoman
28: 29, # langLatvian → smCentralEuroRoman
29: 0, # langSami → smRoman
30: 0, # langFaroese → smRoman (modified)
31: 4, # langFarsi → smArabic (modified)
32: 7, # langRussian → smCyrillic
33: 25, # langSimpChinese → smSimpChinese
34: 0, # langFlemish → smRoman
35: 0, # langIrishGaelic → smRoman (modified)
36: 0, # langAlbanian → smRoman
37: 0, # langRomanian → smRoman (modified)
38: 29, # langCzech → smCentralEuroRoman
39: 29, # langSlovak → smCentralEuroRoman
40: 0, # langSlovenian → smRoman (modified)
41: 5, # langYiddish → smHebrew
42: 7, # langSerbian → smCyrillic
43: 7, # langMacedonian → smCyrillic
44: 7, # langBulgarian → smCyrillic
45: 7, # langUkrainian → smCyrillic (modified)
46: 7, # langByelorussian → smCyrillic
47: 7, # langUzbek → smCyrillic
48: 7, # langKazakh → smCyrillic
49: 7, # langAzerbaijani → smCyrillic
50: 4, # langAzerbaijanAr → smArabic
51: 24, # langArmenian → smArmenian
52: 23, # langGeorgian → smGeorgian
53: 7, # langMoldavian → smCyrillic
54: 7, # langKirghiz → smCyrillic
55: 7, # langTajiki → smCyrillic
56: 7, # langTurkmen → smCyrillic
57: 27, # langMongolian → smMongolian
58: 7, # langMongolianCyr → smCyrillic
59: 4, # langPashto → smArabic
60: 4, # langKurdish → smArabic
61: 4, # langKashmiri → smArabic
62: 4, # langSindhi → smArabic
63: 26, # langTibetan → smTibetan
64: 9, # langNepali → smDevanagari
65: 9, # langSanskrit → smDevanagari
66: 9, # langMarathi → smDevanagari
67: 13, # langBengali → smBengali
68: 13, # langAssamese → smBengali
69: 11, # langGujarati → smGujarati
70: 10, # langPunjabi → smGurmukhi
71: 12, # langOriya → smOriya
72: 17, # langMalayalam → smMalayalam
73: 16, # langKannada → smKannada
74: 14, # langTamil → smTamil
75: 15, # langTelugu → smTelugu
76: 18, # langSinhalese → smSinhalese
77: 19, # langBurmese → smBurmese
78: 20, # langKhmer → smKhmer
79: 22, # langLao → smLao
80: 30, # langVietnamese → smVietnamese
81: 0, # langIndonesian → smRoman
82: 0, # langTagalog → smRoman
83: 0, # langMalayRoman → smRoman
84: 4, # langMalayArabic → smArabic
85: 28, # langAmharic → smEthiopic
86: 28, # langTigrinya → smEthiopic
87: 28, # langOromo → smEthiopic
88: 0, # langSomali → smRoman
89: 0, # langSwahili → smRoman
90: 0, # langKinyarwanda → smRoman
91: 0, # langRundi → smRoman
92: 0, # langNyanja → smRoman
93: 0, # langMalagasy → smRoman
94: 0, # langEsperanto → smRoman
128: 0, # langWelsh → smRoman (modified)
129: 0, # langBasque → smRoman
130: 0, # langCatalan → smRoman
131: 0, # langLatin → smRoman
132: 0, # langQuechua → smRoman
133: 0, # langGuarani → smRoman
134: 0, # langAymara → smRoman
135: 7, # langTatar → smCyrillic
136: 4, # langUighur → smArabic
137: 26, # langDzongkha → smTibetan
138: 0, # langJavaneseRom → smRoman
139: 0, # langSundaneseRom → smRoman
140: 0, # langGalician → smRoman
141: 0, # langAfrikaans → smRoman
142: 0, # langBreton → smRoman (modified)
143: 28, # langInuktitut → smEthiopic (modified)
144: 0, # langScottishGaelic → smRoman (modified)
145: 0, # langManxGaelic → smRoman (modified)
146: 0, # langIrishGaelicScript → smRoman (modified)
147: 0, # langTongan → smRoman
148: 6, # langGreekAncient → smGreek
149: 0, # langGreenlandic → smRoman
150: 0, # langAzerbaijanRoman → smRoman
151: 0, # langNynorsk → smRoman
}

View File

@ -1,3 +1,5 @@
- [name] Add helper function for building multi-lingual names (#921)
3.10.0 (released 2017-04-14)
----------------------------

View File

@ -2,10 +2,12 @@
from __future__ import print_function, division, absolute_import, unicode_literals
from fontTools.misc.py23 import *
from fontTools.misc import sstruct
from fontTools.misc.xmlWriter import XMLWriter
from fontTools.misc.loggingTools import CapturingLogHandler
from fontTools.misc.testTools import FakeFont
from fontTools.misc.xmlWriter import XMLWriter
import struct
import unittest
from fontTools.ttLib import newTable
from fontTools.ttLib.tables._n_a_m_e import (
table__n_a_m_e, NameRecord, nameRecordFormat, nameRecordSize, makeName, log)
@ -67,6 +69,34 @@ class NameTableTest(unittest.TestCase):
with self.assertRaises(TypeError):
table.addName(b"abc") # must be unicode string
def test_addMultilingualName(self):
    """Check the records and the 'ltag' table built by addMultilingualName.

    'en' has both a Macintosh and a Windows language code. 'de-CH' and
    'gsw' have Windows codes but no Macintosh code, so their
    non-Windows records must be Unicode-platform records (platformID=0,
    platEncID=4) whose language ID is an index into 'ltag'.
    """
    font = FakeFont(glyphs=[".notdef", "A"])
    nameTable = font.tables['name'] = newTable("name")
    widthID = nameTable.addMultilingualName(
        {"en": "Width", "de-CH": "Breite", "gsw": "Bräiti"},
        ttFont=font)
    xHeightID = nameTable.addMultilingualName(
        {"en": "X-Height", "gsw": "X-Hööchi"}, ttFont=font)
    # Consecutive calls must allocate consecutive IDs starting at 256.
    self.assertEqual(widthID, 256)
    self.assertEqual(xHeightID, 257)
    names = sorted(
        (n.nameID, n.platformID, n.platEncID, n.langID, n.string)
        for n in nameTable.names)
    self.assertEqual(names, [
        (256, 0, 4, 0, "Breite"),
        (256, 0, 4, 1, "Bräiti"),
        (256, 1, 0, 0, "Width"),
        (256, 3, 1, 0x0409, "Width"),
        (256, 3, 1, 0x0484, "Bräiti"),
        (256, 3, 1, 0x0807, "Breite"),
        (257, 0, 4, 1, "X-Hööchi"),
        (257, 1, 0, 0, "X-Height"),
        (257, 3, 1, 0x0409, "X-Height"),
        (257, 3, 1, 0x0484, "X-Hööchi"),
    ])
    # The second call must reuse the existing 'gsw' entry (index 1)
    # instead of adding a duplicate tag.
    self.assertEqual(set(font.tables.keys()), {"ltag", "name"})
    self.assertEqual(font["ltag"].tags, ["de-CH", "gsw"])
def test_decompile_badOffset(self):
# https://github.com/behdad/fonttools/issues/525
table = table__n_a_m_e()