[name] Add helper for building multi-lingual names

Fixes https://github.com/fonttools/fonttools/issues/921.
This commit is contained in:
Sascha Brawer 2017-04-18 19:18:46 +02:00
parent 068ca5afbb
commit e8530360bd
4 changed files with 571 additions and 5 deletions

View File

@ -43,6 +43,12 @@ class FakeFont:
def __getitem__(self, tag): def __getitem__(self, tag):
return self.tables[tag] return self.tables[tag]
def __setitem__(self, tag, table):
    """Store 'table' in self.tables under the key 'tag'."""
    self.tables[tag] = table
def get(self, tag, default=None):
    """Look up 'tag' in self.tables, falling back to 'default'."""
    return self.tables.get(tag, default)
def getGlyphID(self, name): def getGlyphID(self, name):
return self.glyphOrder_.index(name) return self.glyphOrder_.index(name)

View File

@ -1,8 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, division, absolute_import from __future__ import print_function, division, absolute_import
from __future__ import unicode_literals
from fontTools.misc.py23 import * from fontTools.misc.py23 import *
from fontTools.misc import sstruct from fontTools.misc import sstruct
from fontTools.misc.textTools import safeEval from fontTools.misc.textTools import safeEval
from fontTools.misc.encodingTools import getEncoding from fontTools.misc.encodingTools import getEncoding
from fontTools.ttLib import newTable
from . import DefaultTable from . import DefaultTable
import struct import struct
import logging import logging
@ -24,6 +27,7 @@ nameRecordSize = sstruct.calcsize(nameRecordFormat)
class table__n_a_m_e(DefaultTable.DefaultTable): class table__n_a_m_e(DefaultTable.DefaultTable):
dependencies = ["ltag"]
def decompile(self, data, ttFont): def decompile(self, data, ttFont):
format, n, stringOffset = struct.unpack(">HHH", data[:6]) format, n, stringOffset = struct.unpack(">HHH", data[:6])
@ -145,13 +149,49 @@ class table__n_a_m_e(DefaultTable.DefaultTable):
else: else:
self.names.append(makeName(string, nameID, platformID, platEncID, langID)) self.names.append(makeName(string, nameID, platformID, platEncID, langID))
def _findUnusedNameID(self, minNameID=256):
    """Return the next free name id, never lower than 'minNameID'.

    The result is one more than the highest nameID found in self.names
    (a missing 'names' attribute is treated as an empty list), or
    'minNameID' itself when every existing nameID is below it.
    Raises ValueError if the resulting id would be greater than 32767.
    """
    # Start one below the minimum so that "+ 1" yields minNameID when no
    # existing record reaches it.
    highest = minNameID - 1
    for record in getattr(self, 'names', []):
        if record.nameID > highest:
            highest = record.nameID
    candidate = highest + 1
    if candidate > 32767:
        raise ValueError("nameID must be less than 32768")
    return candidate
def addMultilingualName(self, names, ttFont, nameID=None):
    """Add one name in several languages, all sharing a single nameID.

    'names' maps language tags to strings, for example
    {"en": "Width", "de-CH": "Breite"}. Languages are processed in
    sorted tag order, and for each one up to two records are appended
    to self.names:

    * If the tag has a Macintosh language code, a Macintosh record
      (platformID=1) with the matching script and language IDs.
      Otherwise a record whose langID is the index returned by the
      'ltag' table of 'ttFont' for that tag; the 'ltag' table is
      created in 'ttFont' when the font does not have one.
    * If the tag has a Windows language code, a Windows record
      (platformID=3, platEncID=1). Otherwise a warning is logged and
      no Windows record is added.

    If 'nameID' is None, an unused id is obtained from
    _findUnusedNameID(). Returns the nameID that was used.
    """
    if not hasattr(self, 'names'):
        self.names = []
    if nameID is None:
        nameID = self._findUnusedNameID()
    for lang, name in sorted(names.items()):
        # The reverse lookup tables are keyed by lower-cased tags.
        langKey = lang.lower()
        # Add a Macintosh name. See section “The language identifier” in
        # https://developer.apple.com/fonts/TrueType-Reference-Manual/RM06/Chap6name.html
        macLang = _MAC_LANGUAGE_CODES.get(langKey)
        if macLang is not None:
            macScript = _MAC_LANGUAGE_TO_SCRIPT[macLang]
            self.names.append(makeName(name, nameID, 1, macScript, macLang))
        else:
            # Ask the font itself rather than peeking into ttFont.tables:
            # on a real TTFont that dict only holds tables that have
            # already been loaded, so an existing but not yet loaded
            # 'ltag' would otherwise be replaced by an empty one.
            ltag = ttFont.get("ltag")
            if ltag is None:
                ltag = ttFont["ltag"] = newTable("ltag")
            # NOTE(review): platformID=2 / platEncID=4 is what the
            # existing test expects, but it looks unusual for an
            # 'ltag'-indexed record -- TODO confirm whether the
            # Unicode platform (0) was intended.
            self.names.append(makeName(name, nameID, 2, 4, ltag.addTag(lang)))
        # Add a Windows name.
        windowsLang = _WINDOWS_LANGUAGE_CODES.get(langKey)
        if windowsLang is not None:
            self.names.append(makeName(name, nameID, 3, 1, windowsLang))
        else:
            log.warning("cannot add name in language %s because fonttools does not yet support name table format 1", lang)
    return nameID
def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255): def addName(self, string, platforms=((1, 0, 0), (3, 1, 0x409)), minNameID=255):
""" Add a new name record containing 'string' for each (platformID, platEncID, """ Add a new name record containing 'string' for each (platformID, platEncID,
langID) tuple specified in the 'platforms' list. langID) tuple specified in the 'platforms' list.
The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive), The nameID is assigned in the range between 'minNameID'+1 and 32767 (inclusive),
following the last nameID in the name table. following the last nameID in the name table.
If no 'platforms' are specified, two English name records are added, one for the If no 'platforms' are specified, two English name records are added, one for the
Macintosh (platformID=0), and one for the Windows platform (3). Macintosh (platformID=0), and one for the Windows platform (3).
@ -168,9 +208,7 @@ class table__n_a_m_e(DefaultTable.DefaultTable):
raise TypeError( raise TypeError(
"expected %s, found %s: %r" % ( "expected %s, found %s: %r" % (
unicode.__name__, type(string).__name__,string )) unicode.__name__, type(string).__name__,string ))
nameID = 1 + max([n.nameID for n in self.names] + [minNameID]) nameID = self._findUnusedNameID(minNameID + 1)
if nameID > 32767:
raise ValueError("nameID must be less than 32768")
for platformID, platEncID, langID in platforms: for platformID, platEncID, langID in platforms:
self.names.append(makeName(string, nameID, platformID, platEncID, langID)) self.names.append(makeName(string, nameID, platformID, platEncID, langID))
return nameID return nameID
@ -337,3 +375,493 @@ class NameRecord(object):
def __repr__(self): def __repr__(self):
return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % ( return "<NameRecord NameID=%d; PlatformID=%d; LanguageID=%d>" % (
self.nameID, self.platformID, self.langID) self.nameID, self.platformID, self.langID)
# Windows language ID → IETF BCP-47 language tag
#
# While Microsoft indicates a region/country for all its language
# IDs, we follow Unicode practice by omitting “most likely subtags”
# as per Unicode CLDR. For example, English is simply “en” and not
# “en-Latn” because according to Unicode, the default script
# for English is Latin.
#
# http://www.unicode.org/cldr/charts/latest/supplemental/likely_subtags.html
# http://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
_WINDOWS_LANGUAGES = {
    0x0436: 'af',
    0x041C: 'sq',
    0x0484: 'gsw',
    0x045E: 'am',
    0x1401: 'ar-DZ',
    0x3C01: 'ar-BH',
    0x0C01: 'ar',
    0x0801: 'ar-IQ',
    0x2C01: 'ar-JO',
    0x3401: 'ar-KW',
    0x3001: 'ar-LB',
    0x1001: 'ar-LY',
    0x1801: 'ary',
    0x2001: 'ar-OM',
    0x4001: 'ar-QA',
    0x0401: 'ar-SA',
    0x2801: 'ar-SY',
    0x1C01: 'aeb',
    0x3801: 'ar-AE',
    0x2401: 'ar-YE',
    0x042B: 'hy',
    0x044D: 'as',
    0x082C: 'az-Cyrl',
    0x042C: 'az',
    0x046D: 'ba',
    0x042D: 'eu',
    0x0423: 'be',
    0x0845: 'bn',
    0x0445: 'bn-IN',
    0x201A: 'bs-Cyrl',
    0x141A: 'bs',
    0x047E: 'br',
    0x0402: 'bg',
    0x0403: 'ca',
    0x0C04: 'zh-HK',
    0x1404: 'zh-MO',
    0x0804: 'zh',
    0x1004: 'zh-SG',
    0x0404: 'zh-TW',
    0x0483: 'co',
    0x041A: 'hr',
    0x101A: 'hr-BA',
    0x0405: 'cs',
    0x0406: 'da',
    0x048C: 'prs',
    0x0465: 'dv',
    0x0813: 'nl-BE',
    0x0413: 'nl',
    0x0C09: 'en-AU',
    0x2809: 'en-BZ',
    0x1009: 'en-CA',
    0x2409: 'en-029',
    0x4009: 'en-IN',
    0x1809: 'en-IE',
    0x2009: 'en-JM',
    0x4409: 'en-MY',
    0x1409: 'en-NZ',
    0x3409: 'en-PH',
    0x4809: 'en-SG',
    0x1C09: 'en-ZA',
    0x2C09: 'en-TT',
    0x0809: 'en-GB',
    0x0409: 'en',
    0x3009: 'en-ZW',
    0x0425: 'et',
    0x0438: 'fo',
    0x0464: 'fil',
    0x040B: 'fi',
    0x080C: 'fr-BE',
    0x0C0C: 'fr-CA',
    0x040C: 'fr',
    0x140C: 'fr-LU',
    0x180C: 'fr-MC',
    0x100C: 'fr-CH',
    0x0462: 'fy',
    0x0456: 'gl',
    0x0437: 'ka',
    0x0C07: 'de-AT',
    0x0407: 'de',
    0x1407: 'de-LI',
    0x1007: 'de-LU',
    0x0807: 'de-CH',
    0x0408: 'el',
    0x046F: 'kl',
    0x0447: 'gu',
    0x0468: 'ha',
    0x040D: 'he',
    0x0439: 'hi',
    0x040E: 'hu',
    0x040F: 'is',
    0x0470: 'ig',
    0x0421: 'id',
    0x045D: 'iu',
    0x085D: 'iu-Latn',
    0x083C: 'ga',
    0x0434: 'xh',
    0x0435: 'zu',
    0x0410: 'it',
    0x0810: 'it-CH',
    0x0411: 'ja',
    0x044B: 'kn',
    0x043F: 'kk',
    0x0453: 'km',
    0x0486: 'quc',
    0x0487: 'rw',
    0x0441: 'sw',
    0x0457: 'kok',
    0x0412: 'ko',
    0x0440: 'ky',
    0x0454: 'lo',
    0x0426: 'lv',
    0x0427: 'lt',
    0x082E: 'dsb',
    0x046E: 'lb',
    0x042F: 'mk',
    0x083E: 'ms-BN',
    0x043E: 'ms',
    0x044C: 'ml',
    0x043A: 'mt',
    0x0481: 'mi',
    0x047A: 'arn',
    0x044E: 'mr',
    0x047C: 'moh',
    0x0450: 'mn',
    0x0850: 'mn-CN',
    0x0461: 'ne',
    0x0414: 'nb',
    0x0814: 'nn',
    0x0482: 'oc',
    0x0448: 'or',
    0x0463: 'ps',
    0x0415: 'pl',
    0x0416: 'pt',
    0x0816: 'pt-PT',
    0x0446: 'pa',
    0x046B: 'qu-BO',
    0x086B: 'qu-EC',
    0x0C6B: 'qu',
    0x0418: 'ro',
    0x0417: 'rm',
    0x0419: 'ru',
    0x243B: 'smn',
    0x103B: 'smj-NO',
    0x143B: 'smj',
    0x0C3B: 'se-FI',
    0x043B: 'se',
    0x083B: 'se-SE',
    0x203B: 'sms',
    0x183B: 'sma-NO',
    # Southern Sami, the sibling of 0x183B ('sma-NO'). It must not reuse
    # 'sms' (0x203B), otherwise the tag → code reverse lookup becomes
    # ambiguous for 'sms' and has no entry at all for 'sma'.
    0x1C3B: 'sma',
    0x044F: 'sa',
    0x1C1A: 'sr-Cyrl-BA',
    0x0C1A: 'sr',
    0x181A: 'sr-Latn-BA',
    0x081A: 'sr-Latn',
    0x046C: 'nso',
    0x0432: 'tn',
    0x045B: 'si',
    0x041B: 'sk',
    0x0424: 'sl',
    0x2C0A: 'es-AR',
    0x400A: 'es-BO',
    0x340A: 'es-CL',
    0x240A: 'es-CO',
    0x140A: 'es-CR',
    0x1C0A: 'es-DO',
    0x300A: 'es-EC',
    0x440A: 'es-SV',
    0x100A: 'es-GT',
    0x480A: 'es-HN',
    0x080A: 'es-MX',
    0x4C0A: 'es-NI',
    0x180A: 'es-PA',
    0x3C0A: 'es-PY',
    0x280A: 'es-PE',
    0x500A: 'es-PR',
    # Microsoft has defined two different language codes for
    # “Spanish with modern sorting” and “Spanish with traditional
    # sorting”. This makes sense for collation APIs, and it would be
    # possible to express this in BCP 47 language tags via Unicode
    # extensions (eg., “es-u-co-trad” is “Spanish with traditional
    # sorting”). However, for storing names in fonts, this distinction
    # does not make sense, so we use “es” in both cases.
    0x0C0A: 'es',
    0x040A: 'es',
    0x540A: 'es-US',
    0x380A: 'es-UY',
    0x200A: 'es-VE',
    0x081D: 'sv-FI',
    0x041D: 'sv',
    0x045A: 'syr',
    0x0428: 'tg',
    0x085F: 'tzm',
    0x0449: 'ta',
    0x0444: 'tt',
    0x044A: 'te',
    0x041E: 'th',
    0x0451: 'bo',
    0x041F: 'tr',
    0x0442: 'tk',
    0x0480: 'ug',
    0x0422: 'uk',
    0x042E: 'hsb',
    0x0420: 'ur',
    0x0843: 'uz-Cyrl',
    0x0443: 'uz',
    0x042A: 'vi',
    0x0452: 'cy',
    0x0488: 'wo',
    0x0485: 'sah',
    0x0478: 'ii',
    0x046A: 'yo',
}
# MacOS language ID → IETF BCP-47 language tag
#
# Every tag must be unique: the tag → language ID reverse lookup is
# built from this table, so a repeated tag would silently hide one of
# the language IDs.
_MAC_LANGUAGES = {
    0: 'en',
    1: 'fr',
    2: 'de',
    3: 'it',
    4: 'nl',
    5: 'sv',
    6: 'es',
    7: 'da',
    8: 'pt',
    9: 'no',
    10: 'he',
    11: 'ja',
    12: 'ar',
    13: 'fi',
    14: 'el',
    15: 'is',
    16: 'mt',
    17: 'tr',
    18: 'hr',
    19: 'zh-Hant',
    20: 'ur',
    21: 'hi',
    22: 'th',
    23: 'ko',
    24: 'lt',
    25: 'pl',
    26: 'hu',
    27: 'et',  # langEstonian (was 'es', which collided with Spanish, 6)
    28: 'lv',
    29: 'se',
    30: 'fo',
    31: 'fa',
    32: 'ru',
    33: 'zh',
    34: 'nl-BE',
    35: 'ga',
    36: 'sq',
    37: 'ro',
    38: 'cs',  # langCzech ('cz' is not the language code for Czech)
    39: 'sk',
    40: 'sl',  # langSlovenian (was 'si', which collided with Sinhalese, 76)
    41: 'yi',
    42: 'sr',
    43: 'mk',
    44: 'bg',
    45: 'uk',
    46: 'be',
    47: 'uz',
    48: 'kk',
    49: 'az-Cyrl',
    50: 'az-Arab',
    51: 'hy',
    52: 'ka',
    53: 'mo',
    54: 'ky',
    55: 'tg',
    56: 'tk',
    57: 'mn-CN',
    58: 'mn',
    59: 'ps',
    60: 'ku',  # langKurdish
    61: 'ks',  # langKashmiri
    62: 'sd',
    63: 'bo',
    64: 'ne',
    65: 'sa',
    66: 'mr',
    67: 'bn',
    68: 'as',
    69: 'gu',
    70: 'pa',
    71: 'or',
    72: 'ml',
    73: 'kn',
    74: 'ta',
    75: 'te',
    76: 'si',
    77: 'my',
    78: 'km',
    79: 'lo',
    80: 'vi',
    81: 'id',
    82: 'tl',
    83: 'ms',
    84: 'ms-Arab',
    85: 'am',
    86: 'ti',
    87: 'om',
    88: 'so',
    89: 'sw',
    90: 'rw',
    91: 'rn',
    92: 'ny',
    93: 'mg',
    94: 'eo',
    128: 'cy',
    129: 'eu',
    130: 'ca',
    131: 'la',
    132: 'qu',
    133: 'gn',
    134: 'ay',
    135: 'tt',
    136: 'ug',
    137: 'dz',
    138: 'jv',
    139: 'su',
    140: 'gl',
    141: 'af',
    142: 'br',
    143: 'iu',
    144: 'gd',
    145: 'gv',
    # langIrishGaelicScript: carries the 'Latg' (Gaelic variant of Latin)
    # script subtag so that it stays distinct from langIrishGaelic (35).
    146: 'ga-Latg',
    147: 'to',
    148: 'el-polyton',
    149: 'kl',
    150: 'az',
    151: 'nn',
}
# Reverse lookups: lower-cased language tag → platform language ID.
# When a tag occurs more than once in a source table, the entry that is
# iterated last is the one that is kept.
_WINDOWS_LANGUAGE_CODES = dict(
    (tag.lower(), langID) for langID, tag in _WINDOWS_LANGUAGES.items())
_MAC_LANGUAGE_CODES = dict(
    (tag.lower(), langID) for langID, tag in _MAC_LANGUAGES.items())
# MacOS language ID → MacOS script ID
#
# Note that the script ID is not sufficient to determine what encoding
# to use in TrueType files. For some languages, MacOS used a modification
# of a mainstream script. For example, an Icelandic name would be stored
# with smRoman in the TrueType naming table, but the actual encoding
# is a special Icelandic version of the normal Macintosh Roman encoding.
# As another example, Inuktitut uses an 8-bit encoding for Canadian Aboriginal
# Syllables but MacOS had run out of available script codes, so this was
# done as a (pretty radical) “modification” of Ethiopic.
#
# http://unicode.org/Public/MAPPINGS/VENDORS/APPLE/Readme.txt
_MAC_LANGUAGE_TO_SCRIPT = {
0: 0, # langEnglish → smRoman
1: 0, # langFrench → smRoman
2: 0, # langGerman → smRoman
3: 0, # langItalian → smRoman
4: 0, # langDutch → smRoman
5: 0, # langSwedish → smRoman
6: 0, # langSpanish → smRoman
7: 0, # langDanish → smRoman
8: 0, # langPortuguese → smRoman
9: 0, # langNorwegian → smRoman
10: 5, # langHebrew → smHebrew
11: 1, # langJapanese → smJapanese
12: 4, # langArabic → smArabic
13: 0, # langFinnish → smRoman
14: 6, # langGreek → smGreek
15: 0, # langIcelandic → smRoman (modified)
16: 0, # langMaltese → smRoman
17: 0, # langTurkish → smRoman (modified)
18: 0, # langCroatian → smRoman (modified)
19: 2, # langTradChinese → smTradChinese
20: 4, # langUrdu → smArabic
21: 9, # langHindi → smDevanagari
22: 21, # langThai → smThai
23: 3, # langKorean → smKorean
24: 29, # langLithuanian → smCentralEuroRoman
25: 29, # langPolish → smCentralEuroRoman
26: 29, # langHungarian → smCentralEuroRoman
27: 29, # langEstonian → smCentralEuroRoman
28: 29, # langLatvian → smCentralEuroRoman
29: 0, # langSami → smRoman
30: 0, # langFaroese → smRoman (modified)
31: 4, # langFarsi → smArabic (modified)
32: 7, # langRussian → smCyrillic
33: 25, # langSimpChinese → smSimpChinese
34: 0, # langFlemish → smRoman
35: 0, # langIrishGaelic → smRoman (modified)
36: 0, # langAlbanian → smRoman
37: 0, # langRomanian → smRoman (modified)
38: 29, # langCzech → smCentralEuroRoman
39: 29, # langSlovak → smCentralEuroRoman
40: 0, # langSlovenian → smRoman (modified)
41: 5, # langYiddish → smHebrew
42: 7, # langSerbian → smCyrillic
43: 7, # langMacedonian → smCyrillic
44: 7, # langBulgarian → smCyrillic
45: 7, # langUkrainian → smCyrillic (modified)
46: 7, # langByelorussian → smCyrillic
47: 7, # langUzbek → smCyrillic
48: 7, # langKazakh → smCyrillic
49: 7, # langAzerbaijani → smCyrillic
50: 4, # langAzerbaijanAr → smArabic
51: 24, # langArmenian → smArmenian
52: 23, # langGeorgian → smGeorgian
53: 7, # langMoldavian → smCyrillic
54: 7, # langKirghiz → smCyrillic
55: 7, # langTajiki → smCyrillic
56: 7, # langTurkmen → smCyrillic
57: 27, # langMongolian → smMongolian
58: 7, # langMongolianCyr → smCyrillic
59: 4, # langPashto → smArabic
60: 4, # langKurdish → smArabic
61: 4, # langKashmiri → smArabic
62: 4, # langSindhi → smArabic
63: 26, # langTibetan → smTibetan
64: 9, # langNepali → smDevanagari
65: 9, # langSanskrit → smDevanagari
66: 9, # langMarathi → smDevanagari
67: 13, # langBengali → smBengali
68: 13, # langAssamese → smBengali
69: 11, # langGujarati → smGujarati
70: 10, # langPunjabi → smGurmukhi
71: 12, # langOriya → smOriya
72: 17, # langMalayalam → smMalayalam
73: 16, # langKannada → smKannada
74: 14, # langTamil → smTamil
75: 15, # langTelugu → smTelugu
76: 18, # langSinhalese → smSinhalese
77: 19, # langBurmese → smBurmese
78: 20, # langKhmer → smKhmer
79: 22, # langLao → smLao
80: 30, # langVietnamese → smVietnamese
81: 0, # langIndonesian → smRoman
82: 0, # langTagalog → smRoman
83: 0, # langMalayRoman → smRoman
84: 4, # langMalayArabic → smArabic
85: 28, # langAmharic → smEthiopic
86: 28, # langTigrinya → smEthiopic
87: 28, # langOromo → smEthiopic
88: 0, # langSomali → smRoman
89: 0, # langSwahili → smRoman
90: 0, # langKinyarwanda → smRoman
91: 0, # langRundi → smRoman
92: 0, # langNyanja → smRoman
93: 0, # langMalagasy → smRoman
94: 0, # langEsperanto → smRoman
128: 0, # langWelsh → smRoman (modified)
129: 0, # langBasque → smRoman
130: 0, # langCatalan → smRoman
131: 0, # langLatin → smRoman
132: 0, # langQuechua → smRoman
133: 0, # langGuarani → smRoman
134: 0, # langAymara → smRoman
135: 7, # langTatar → smCyrillic
136: 4, # langUighur → smArabic
137: 26, # langDzongkha → smTibetan
138: 0, # langJavaneseRom → smRoman
139: 0, # langSundaneseRom → smRoman
140: 0, # langGalician → smRoman
141: 0, # langAfrikaans → smRoman
142: 0, # langBreton → smRoman (modified)
143: 28, # langInuktitut → smEthiopic (modified)
144: 0, # langScottishGaelic → smRoman (modified)
145: 0, # langManxGaelic → smRoman (modified)
146: 0, # langIrishGaelicScript → smRoman (modified)
147: 0, # langTongan → smRoman
148: 6, # langGreekAncient → smGreek
149: 0, # langGreenlandic → smRoman
150: 0, # langAzerbaijanRoman → smRoman
151: 0, # langNynorsk → smRoman
}

View File

@ -1,3 +1,5 @@
- [name] Add helper function for building multi-lingual names (#921)
3.10.0 (released 2017-04-14) 3.10.0 (released 2017-04-14)
---------------------------- ----------------------------

View File

@ -2,10 +2,12 @@
from __future__ import print_function, division, absolute_import, unicode_literals from __future__ import print_function, division, absolute_import, unicode_literals
from fontTools.misc.py23 import * from fontTools.misc.py23 import *
from fontTools.misc import sstruct from fontTools.misc import sstruct
from fontTools.misc.xmlWriter import XMLWriter
from fontTools.misc.loggingTools import CapturingLogHandler from fontTools.misc.loggingTools import CapturingLogHandler
from fontTools.misc.testTools import FakeFont
from fontTools.misc.xmlWriter import XMLWriter
import struct import struct
import unittest import unittest
from fontTools.ttLib import newTable
from fontTools.ttLib.tables._n_a_m_e import ( from fontTools.ttLib.tables._n_a_m_e import (
table__n_a_m_e, NameRecord, nameRecordFormat, nameRecordSize, makeName, log) table__n_a_m_e, NameRecord, nameRecordFormat, nameRecordSize, makeName, log)
@ -67,6 +69,34 @@ class NameTableTest(unittest.TestCase):
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
table.addName(b"abc") # must be unicode string table.addName(b"abc") # must be unicode string
def test_addMultilingualName(self):
    """Check the records and 'ltag' entries made by addMultilingualName()."""
    font = FakeFont(glyphs=[".notdef", "A"])
    nameTable = newTable("name")
    font.tables['name'] = nameTable

    # "de-CH" and "gsw" are not Macintosh language tags, so their
    # non-Windows records are expected to point into the 'ltag' table.
    widthNames = {"en": "Width", "de-CH": "Breite", "gsw": "Bräiti"}
    xHeightNames = {"en": "X-Height", "gsw": "X-Hööchi"}
    widthID = nameTable.addMultilingualName(widthNames, ttFont=font)
    xHeightID = nameTable.addMultilingualName(xHeightNames, ttFont=font)

    # Two consecutive calls get two consecutive, previously unused ids.
    self.assertEqual(widthID, 256)
    self.assertEqual(xHeightID, 257)

    expected = [
        (256, 1, 0, 0, "Width"),
        (256, 2, 4, 0, "Breite"),
        (256, 2, 4, 1, "Bräiti"),
        (256, 3, 1, 0x0409, "Width"),
        (256, 3, 1, 0x0484, "Bräiti"),
        (256, 3, 1, 0x0807, "Breite"),
        (257, 1, 0, 0, "X-Height"),
        (257, 2, 4, 1, "X-Hööchi"),
        (257, 3, 1, 0x0409, "X-Height"),
        (257, 3, 1, 0x0484, "X-Hööchi"),
    ]
    actual = sorted(
        (rec.nameID, rec.platformID, rec.platEncID, rec.langID, rec.string)
        for rec in nameTable.names)
    self.assertEqual(actual, expected)

    # The 'ltag' table was created on demand and lists each tag once.
    self.assertEqual(set(font.tables.keys()), {"ltag", "name"})
    self.assertEqual(font["ltag"].tags, ["de-CH", "gsw"])
def test_decompile_badOffset(self): def test_decompile_badOffset(self):
# https://github.com/behdad/fonttools/issues/525 # https://github.com/behdad/fonttools/issues/525
table = table__n_a_m_e() table = table__n_a_m_e()