Support non-BMP characters for synthetic glyph names

When a font supplies no glyph names in its 'post' table, fontTools builds synthetic glyph names by reversing the 'cmap' table. After this change, the library looks at all Unicode 'cmap' subtables, irrespective of format or platform. For example, glyph #4 in NotoSansOldItalic-Regular.ttf is now named "u10300" instead of "glyph00004". The code for building a reversed 'cmap' table has been moved into the cmap class, for easier testing.
2015-09-04 10:21:55 +02:00 · 2015-09-04 10:21:55 +02:00 · 07458f62dd
commit 07458f62dd
parent 58f86f318a
3 changed files with 65 additions and 43 deletions
--- a/Lib/fontTools/ttLib/__init__.py
+++ b/Lib/fontTools/ttLib/__init__.py
@@ -511,47 +511,44 @@ class TTFont(object):
 		# Set the glyph order, so the cmap parser has something
 		# to work with (so we don't get called recursively).
 		self.glyphOrder = glyphOrder
-		# Get a (new) temporary cmap (based on the just invented names)
-		try:
-			tempcmap = self['cmap'].getcmap(3, 1)
-		except KeyError:
-			tempcmap = None
-		if tempcmap is not None:
-			# we have a unicode cmap
-			from fontTools import agl
-			cmap = tempcmap.cmap
-			# create a reverse cmap dict
-			reversecmap = {}
-			for unicode, name in list(cmap.items()):
-				reversecmap[name] = unicode
-			allNames = {}
-			for i in range(numGlyphs):
-				tempName = glyphOrder[i]
-				if tempName in reversecmap:
-					unicode = reversecmap[tempName]
-					if unicode in agl.UV2AGL:
-						# get name from the Adobe Glyph List
-						glyphName = agl.UV2AGL[unicode]
-					else:
-						# create uni<CODE> name
-						glyphName = "uni%04X" % unicode
-					tempName = glyphName
-					n = allNames.get(tempName, 0)
-					if n:
-						tempName = glyphName + "#" + str(n)
-					glyphOrder[i] = tempName
-					allNames[tempName] = n + 1
-			# Delete the temporary cmap table from the cache, so it can
-			# be parsed again with the right names.
-			del self.tables['cmap']
-		else:
-			pass # no unicode cmap available, stick with the invented names
+
+		# Make up glyph names based on the reversed cmap table. Because some
+		# glyphs (eg. ligatures or alternates) may not be reachable via cmap,
+		# this naming table will usually not cover all glyphs in the font.
+		# If the font has no Unicode cmap table, reversecmap will be empty.
+		reversecmap = self['cmap'].buildReversed()
+		useCount = {}
+		for i in range(numGlyphs):
+			tempName = glyphOrder[i]
+			if tempName in reversecmap:
+				# If a font maps both U+0041 LATIN CAPITAL LETTER A and
+				# U+0391 GREEK CAPITAL LETTER ALPHA to the same glyph,
+				# we prefer naming the glyph as "A".
+				glyphName = self._makeGlyphName(min(reversecmap[tempName]))
+				numUses = useCount[glyphName] = useCount.get(glyphName, 0) + 1
+				if numUses > 1:
+					glyphName = "%s.alt%d" % (glyphName, numUses - 1)
+				glyphOrder[i] = glyphName
+
+		# Delete the temporary cmap table from the cache, so it can
+		# be parsed again with the right names.
+		del self.tables['cmap']
 		self.glyphOrder = glyphOrder
 		if cmapLoading:
 			# restore partially loaded cmap, so it can continue loading
 			# using the proper names.
 			self.tables['cmap'] = cmapLoading

+	@staticmethod
+	def _makeGlyphName(codepoint):
+		from fontTools import agl  # Adobe Glyph List
+		if codepoint in agl.UV2AGL:
+			return agl.UV2AGL[codepoint]
+		elif codepoint <= 0xFFFF:
+			return "uni%04X" % codepoint
+		else:
+			return "u%X" % codepoint
+
 	def getGlyphNames(self):
 		"""Get a list of glyph names, sorted alphabetically."""
 		glyphNames = sorted(self.getGlyphOrder()[:])
--- a/Lib/fontTools/ttLib/tables/_c_m_a_p.py
+++ b/Lib/fontTools/ttLib/tables/_c_m_a_p.py
@@ -20,6 +20,21 @@ class table__c_m_a_p(DefaultTable.DefaultTable):
 				return subtable
 		return None # not found

+	def buildReversed(self):
+		"""Returns a reverse cmap such as {'one':{0x31}, 'A':{0x41,0x391}}.
+
+		The values are sets of Unicode codepoints because
+		some fonts map different codepoints to the same glyph.
+		For example, U+0041 LATIN CAPITAL LETTER A and U+0391
+		GREEK CAPITAL LETTER ALPHA are sometimes the same glyph.
+		"""
+		result = {}
+		for subtable in self.tables:
+			if subtable.isUnicode():
+				for codepoint, name in subtable.cmap.items():
+					result.setdefault(name, set()).add(codepoint)
+		return result
+
 	def decompile(self, data, ttFont):
 		tableVersion, numSubTables = struct.unpack(">HH", data[:4])
 		self.tableVersion = int(tableVersion)
--- a/Lib/fontTools/ttLib/tables/_c_m_a_p_test.py
+++ b/Lib/fontTools/ttLib/tables/_c_m_a_p_test.py
@@ -2,37 +2,37 @@ from __future__ import print_function, division, absolute_import, unicode_litera
 from fontTools.misc.py23 import *
 from fontTools import ttLib
 import unittest
-from ._c_m_a_p import CmapSubtable
+from ._c_m_a_p import CmapSubtable, table__c_m_a_p

 class CmapSubtableTest(unittest.TestCase):

-	def makeSubtable(self, platformID, platEncID, langID):
-		subtable = CmapSubtable(None)
+	def makeSubtable(self, cmapFormat, platformID, platEncID, langID):
+		subtable = CmapSubtable.newSubtable(cmapFormat)
 		subtable.platformID, subtable.platEncID, subtable.language = (platformID, platEncID, langID)
 		return subtable

 	def test_toUnicode_utf16be(self):
-		subtable = self.makeSubtable(0, 2, 7)
+		subtable = self.makeSubtable(4, 0, 2, 7)
 		self.assertEqual("utf_16_be", subtable.getEncoding())
 		self.assertEqual(True, subtable.isUnicode())

 	def test_toUnicode_macroman(self):
-		subtable = self.makeSubtable(1, 0, 7)  # MacRoman
+		subtable = self.makeSubtable(4, 1, 0, 7)  # MacRoman
 		self.assertEqual("mac_roman", subtable.getEncoding())
 		self.assertEqual(False, subtable.isUnicode())

 	def test_toUnicode_macromanian(self):
-		subtable = self.makeSubtable(1, 0, 37)  # Mac Romanian
+		subtable = self.makeSubtable(4, 1, 0, 37)  # Mac Romanian
 		self.assertNotEqual(None, subtable.getEncoding())
 		self.assertEqual(False, subtable.isUnicode())

 	def test_extended_mac_encodings(self):
-		subtable = self.makeSubtable(1, 1, 0) # Mac Japanese
+		subtable = self.makeSubtable(4, 1, 1, 0) # Mac Japanese
 		self.assertNotEqual(None, subtable.getEncoding())
 		self.assertEqual(False, subtable.isUnicode())

 	def test_extended_unknown(self):
-		subtable = self.makeSubtable(10, 11, 12)
+		subtable = self.makeSubtable(4, 10, 11, 12)
 		self.assertEqual(subtable.getEncoding(), None)
 		self.assertEqual(subtable.getEncoding("ascii"), "ascii")
 		self.assertEqual(subtable.getEncoding(default="xyz"), "xyz")
@@ -49,5 +49,15 @@ class CmapSubtableTest(unittest.TestCase):
 		font.setGlyphOrder([])
 		subtable.decompile(b'\0' * 7 + b'\x10' + b'\0' * 8, font)

+	def test_buildReversed(self):
+		c4 = self.makeSubtable(4, 3, 1, 0)
+		c4.cmap = {0x0041:'A', 0x0391:'A'}
+		c12 = self.makeSubtable(12, 3, 10, 0)
+		c12.cmap = {0x10314: 'u10314'}
+		cmap = table__c_m_a_p()
+		cmap.tables = [c4, c12]
+		self.assertEqual(cmap.buildReversed(), {'A':{0x0041, 0x0391}, 'u10314':{0x10314}})
+
+
 if __name__ == "__main__":
 	unittest.main()