Map AGL glyph names to Unicode (#774)

Implements the mapping algorithm from the [Adobe Glyph List specification](https://github.com/adobe-type-tools/agl-specification#2-the-mapping). Currently, the implementation only recognizes glyph names from the Adobe Glyph List for New Fonts (AGLFN), not the legacy AGL, which is a superset. If there is interest, it would be easy to support the legacy AGL as well, preferably in a follow-up change. https://github.com/googlei18n/glyphsLib/issues/88#issuecomment-267408215
2016-12-19 15:34:44 +01:00 · 2016-12-19 15:34:44 +01:00 · 428636cfaf
commit 428636cfaf
parent b4d53811e9
2 changed files with 202 additions and 1 deletions
--- a/Lib/fontTools/agl.py
+++ b/Lib/fontTools/agl.py
@ -1,8 +1,12 @@
 # -*- coding: utf-8 -*-
 # The table below is taken from
 # http://www.adobe.com/devnet/opentype/archives/aglfn.txt
-from __future__ import print_function, division, absolute_import
+from __future__ import (print_function, division, absolute_import,
                        unicode_literals)
 from fontTools.misc.py23 import *
 import re
 _aglText = """\
 # -----------------------------------------------------------
@ -736,3 +740,135 @@ def _builddicts():
 		UV2AGL[unicode] = glyphName
 _builddicts()
 def toUnicode(glyph, isZapfDingbats=False):
 	"""Map a glyph name to a Unicode string: 'longs_t.oldstyle' --> u'ſt'
 	The name is cut at its first period, the rest is split on underscores,
 	and the strings obtained for the individual components are concatenated.
 	Components that cannot be mapped contribute an empty string.
 	Pass isZapfDingbats=True to additionally recognize the glyph names
 	of the Zapf Dingbats font (as required by the AGL specification).
 	"""
 	# https://github.com/adobe-type-tools/agl-specification#2-the-mapping
 	#
 	# Step 1: everything from the first period (U+002E; FULL STOP)
 	# onwards is a suffix that does not take part in the mapping.
 	baseName = glyph.partition(".")[0]
 	# Step 2: underscore (U+005F; LOW LINE) delimits the components.
 	# Step 3: each component is mapped on its own, and the glyph name
 	# maps to the concatenation of the per-component strings.
 	return "".join(
 		_glyphComponentToUnicode(part, isZapfDingbats)
 		for part in baseName.split("_"))
 def _glyphComponentToUnicode(component, isZapfDingbats):
 	"""Helper for toUnicode(): map one glyph name component to a string.
 	The lookups below are tried in the order given by the AGL
 	specification; the first one that yields a result wins. Returns
 	an empty string if none of them recognizes the component.
 	"""
 	# Zapf Dingbats names are only consulted when the caller says the
 	# font is Zapf Dingbats (PostScript FontName: ZapfDingbats); in that
 	# case they take precedence over every other rule.
 	if isZapfDingbats:
 		dingbat = _zapfDingbatsToUnicode(component)
 		if dingbat:
 			return dingbat
 	# Next, names listed in the AGL map to their character in that list.
 	#
 	# TODO: We currently use the AGLFN (Adobe glyph list for new fonts),
 	# although the spec actually mandates the legacy AGL which is
 	# a superset of the AGLFN.
 	codepoint = AGL2UV.get(component)
 	if codepoint:
 		return unichr(codepoint)
 	# Finally the two numeric forms, in this order:
 	#
 	# * "uni" followed by uppercase hexadecimal digits (0-9, A-F) whose
 	#   count is a multiple of four. Each group of four digits must be
 	#   in 0000..D7FF or E000..FFFF and is taken as one Unicode scalar
 	#   value, so this form only reaches the Basic Multilingual Plane.
 	#
 	# * "u" followed by four to six uppercase hexadecimal digits whose
 	#   value is in 0000..D7FF or E000..10FFFF, taken as a single
 	#   Unicode scalar value.
 	for parse in (_uniToUnicode, _uToUnicode):
 		text = parse(component)
 		if text:
 			return text
 	# Nothing matched: the component maps to an empty string.
 	return ''
 # https://github.com/adobe-type-tools/agl-aglfn/blob/master/zapfdingbats.txt
 _AGL_ZAPF_DINGBATS = (
 	" ✁✂✄☎✆✝✞✟✠✡☛☞✌✍✎✏✑✒✓✔✕✖✗✘✙✚✛✜✢✣✤✥✦✧★✩✪✫✬✭✮✯✰✱✲✳✴✵✶✷✸✹✺✻✼✽✾✿❀"
 	"❁❂❃❄❅❆❇❈❉❊❋●❍■❏❑▲▼◆❖ ◗❘❙❚❯❱❲❳❨❩❬❭❪❫❴❵❛❜❝❞❡❢❣❤✐❥❦❧♠♥♦♣    ✉✈✇"
 	"①②③④⑤⑥⑦⑧⑨⑩❶❷❸❹❺❻❼❽❾❿➀➁➂➃➄➅➆➇➈➉➊➋➌➍➎➏➐➑➒➓➔→➣↔"
 	"↕➙➛➜➝➞➟➠➡➢➤➥➦➧➨➩➫➭➯➲➳➵➸➺➻➼➽➾➚➪➶➹➘➴➷➬➮➱✃❐❒❮❰")
 def _zapfDingbatsToUnicode(glyph):
 	"""Helper for toUnicode()."""
 	if len(glyph) < 2 or glyph[0] != 'a':
 		return None
 	try:
 		gid = int(glyph[1:])
 	except ValueError:
 		return None
 	if gid < 0 or gid >= len(_AGL_ZAPF_DINGBATS):
 		return None
 	uchar = _AGL_ZAPF_DINGBATS[gid]
 	return uchar if uchar != ' ' else None
 _re_uni = re.compile("^uni([0-9A-F]+)$")
 def _uniToUnicode(component):
 	"""Helper for toUnicode() to handle "uniABCD" components."""
 	match = _re_uni.match(component)
 	if match is None:
 		return None
 	digits = match.group(1)
 	if len(digits) % 4 != 0:
 		return None
 	chars = [int(digits[i : i + 4], 16)
                 for i in range(0, len(digits), 4)]
 	if any(c >= 0xD800 and c <= 0xDFFF for c in chars):
 		# The AGL specification explicitly excluded surrogate pairs.
 		return None
 	return ''.join([unichr(c) for c in chars])
 _re_u = re.compile("^u([0-9A-F]{4,6})$")
 def _uToUnicode(component):
 	"""Helper for toUnicode() to handle "u1ABCD" components."""
 	match = _re_u.match(component)
 	if match is None:
 		return None
 	digits = match.group(1)
 	try:
 		value = int(digits, 16)
 	except ValueError:
 		return None
 	if ((value >= 0x0000 and value <= 0xD7FF) or
 	    (value >= 0xE000 and value <= 0x10FFFF)):
 		return unichr(value)
 	return None
--- a/Lib/fontTools/agl_test.py
+++ b/Lib/fontTools/agl_test.py
@ -0,0 +1,65 @@
 # -*- coding: utf-8 -*-
 from __future__ import (print_function, division, absolute_import,
                        unicode_literals)
 from fontTools.misc.py23 import *
 from fontTools import agl
 import unittest
 class AglToUnicodeTest(unittest.TestCase):
    # Each test lists (glyph name, expected string) pairs and checks
    # them one after the other against agl.toUnicode().
    def test_spec_examples(self):
        # https://github.com/adobe-type-tools/agl-specification#3-examples
        # TODO: Currently, we only handle AGLFN instead of legacy AGL names.
        # Therefore, the test cases below use Iogonek instead of Lcommaaccent.
        # Change Iogonek to Lcommaaccent as soon as the implementation has
        # been fixed to also support legacy AGL names.
        cases = [
            ("Iogonek", "Į"),
            ("uni20AC0308", "\u20AC\u0308"),
            ("u1040C", "\U0001040C"),
            ("uniD801DC0C", ""),
            ("uni20ac", ""),
            ("Iogonek_uni20AC0308_u1040C.alternate",
             "\u012E\u20AC\u0308\U0001040C"),
            ("Iogonek_uni012E_u012E", "ĮĮĮ"),
            ("foo", ""),
            (".notdef", ""),
        ]
        for name, expected in cases:
            self.assertEqual(agl.toUnicode(name), expected)
    def test_aglfn(self):
        cases = [
            ("longs_t", "ſt"),
            ("f_f_i.alt123", "ffi"),
        ]
        for name, expected in cases:
            self.assertEqual(agl.toUnicode(name), expected)
    def test_uniABCD(self):
        cases = [
            ("uni0041", "A"),
            ("uni0041_uni0042_uni0043", "ABC"),
            ("uni004100420043", "ABC"),
            ("uni", ""),
            ("uni41", ""),
            ("uni004101", ""),
            ("uniDC00", ""),
        ]
        for name, expected in cases:
            self.assertEqual(agl.toUnicode(name), expected)
    def test_uABCD(self):
        cases = [
            ("u0041", "A"),
            ("u00041", "A"),
            ("u000041", "A"),
            ("u0000041", ""),
            ("u0041_uni0041_A.alt", "AAA"),
        ]
        for name, expected in cases:
            self.assertEqual(agl.toUnicode(name), expected)
    def test_union(self):
        cases = [
            # Interesting test case because "uni" is a prefix of "union".
            ("union", "∪"),
            # U+222A U+FE00 is a Standardized Variant for UNION WITH SERIFS.
            ("union_uniFE00", "\u222A\uFE00"),
        ]
        for name, expected in cases:
            self.assertEqual(agl.toUnicode(name), expected)
    def test_dingbats(self):
        # (glyph name, isZapfDingbats flag, expected string)
        cases = [
            ("a20", True, "✔"),
            ("a20.alt", True, "✔"),
            ("a206", True, "❰"),
            ("a20", False, ""),
            ("a0", True, ""),
            ("a207", True, ""),
            ("abcdef", True, ""),
        ]
        for name, dingbats, expected in cases:
            self.assertEqual(
                agl.toUnicode(name, isZapfDingbats=dingbats), expected)
 # Run the test suite when this file is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()