From ba238344b1ca23962bf18b2e068fd4c1930dbb15 Mon Sep 17 00:00:00 2001
From: Sascha Brawer <sascha@brawer.ch>
Date: Sat, 1 Aug 2015 17:34:02 +0200
Subject: [PATCH] [feaLib] Implement top-level glyph class definitions

---
 Lib/fontTools/feaLib/ast.py         | 15 ++++-
 Lib/fontTools/feaLib/lexer.py       | 10 ++-
 Lib/fontTools/feaLib/lexer_test.py  | 14 +++-
 Lib/fontTools/feaLib/parser.py      | 99 +++++++++++++++++++++++++----
 Lib/fontTools/feaLib/parser_test.py | 56 ++++++++++++++++
 5 files changed, 179 insertions(+), 15 deletions(-)

diff --git a/Lib/fontTools/feaLib/ast.py b/Lib/fontTools/feaLib/ast.py
index 59a9072d9..7c268119d 100644
--- a/Lib/fontTools/feaLib/ast.py
+++ b/Lib/fontTools/feaLib/ast.py
@@ -6,7 +6,7 @@ def write(buffer, text):
     buffer.write(text.encode("utf-8"))
 
 
-class FeatureFile:
+class FeatureFile(object):
     def __init__(self):
         self.statements = []
 
@@ -15,7 +15,18 @@ class FeatureFile:
             s.write(out, linesep)
 
 
-class LanguageSystemStatement:
+class GlyphClassDefinition(object):
+    """Top-level glyph class definition: ``@name = [glyph ...];``."""
+    def __init__(self, location, name, glyphs):
+        self.location, self.name, self.glyphs = location, name, glyphs
+
+    def write(self, out, linesep):
+        # Sort the names so the emitted text is the same on every run.
+        members = " ".join(sorted(self.glyphs))
+        write(out, "@%s = [%s];%s" % (self.name, members, linesep))
+
+
+class LanguageSystemStatement(object):
     def __init__(self, location, script, language):
         self.location = location
         self.script, self.language = (script, language)
diff --git a/Lib/fontTools/feaLib/lexer.py b/Lib/fontTools/feaLib/lexer.py
index b544378eb..7d4c29fa6 100644
--- a/Lib/fontTools/feaLib/lexer.py
+++ b/Lib/fontTools/feaLib/lexer.py
@@ -23,6 +23,7 @@ class Lexer(object):
     STRING = "STRING"
     NAME = "NAME"
     FILENAME = "FILENAME"
+    GLYPHCLASS = "GLYPHCLASS"
     CID = "CID"
     SYMBOL = "SYMBOL"
     COMMENT = "COMMENT"
@@ -30,7 +31,7 @@ class Lexer(object):
 
     CHAR_WHITESPACE_ = " \t"
     CHAR_NEWLINE_ = "\r\n"
-    CHAR_SYMBOL_ = ";:@-+'{}[]<>()"
+    CHAR_SYMBOL_ = ";:-+'{}[]<>()="
     CHAR_DIGIT_ = "0123456789"
     CHAR_LETTER_ = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
     CHAR_NAME_START_ = CHAR_LETTER_ + "_.\\"
@@ -101,6 +102,13 @@ class Lexer(object):
             self.pos_ += 1
             self.scan_over_(Lexer.CHAR_DIGIT_)
             return (Lexer.CID, int(text[start + 1:self.pos_], 10), location)
+        if cur_char == "@":
+            self.pos_ += 1
+            self.scan_over_(Lexer.CHAR_NAME_CONTINUATION_)
+            glyphclass = text[start + 1:self.pos_]
+            if len(glyphclass) < 1:
+                raise LexerError("Expected glyph class name", location)
+            return (Lexer.GLYPHCLASS, glyphclass, location)
         if cur_char in Lexer.CHAR_NAME_START_:
             self.pos_ += 1
             self.scan_over_(Lexer.CHAR_NAME_CONTINUATION_)
diff --git a/Lib/fontTools/feaLib/lexer_test.py b/Lib/fontTools/feaLib/lexer_test.py
index 6fa3d42f0..92da8c588 100644
--- a/Lib/fontTools/feaLib/lexer_test.py
+++ b/Lib/fontTools/feaLib/lexer_test.py
@@ -20,6 +20,13 @@ class LexerErrorTest(unittest.TestCase):
 
 
 class LexerTest(unittest.TestCase):
+    def __init__(self, methodName):
+        unittest.TestCase.__init__(self, methodName)
+        # Python 3 renamed assertRaisesRegexp to assertRaisesRegex and warns
+        # about the old name, so alias the new name where it is missing.
+        if not hasattr(self, "assertRaisesRegex"):
+            self.assertRaisesRegex = self.assertRaisesRegexp
+
     def test_empty(self):
         self.assertEqual(lex(""), [])
         self.assertEqual(lex(" \t "), [])
@@ -34,6 +41,11 @@ class LexerTest(unittest.TestCase):
     def test_cid(self):
         self.assertEqual(lex("\\0 \\987"), [(Lexer.CID, 0), (Lexer.CID, 987)])
 
+    def test_glyphclass(self):
+        self.assertEqual(lex("@Vowel.sc"), [(Lexer.GLYPHCLASS, "Vowel.sc")])
+        self.assertRaisesRegex(LexerError, "Expected glyph class", lex, "@(a)")
+        self.assertRaisesRegex(LexerError, "Expected glyph class", lex, "@ A")
+
     def test_include(self):
         self.assertEqual(lex("include (~/foo/bar baz.fea);"), [
             (Lexer.NAME, "include"),
@@ -81,7 +93,7 @@ class LexerTest(unittest.TestCase):
                           for (_, _, loc) in Lexer(s, "test.fea")]
         self.assertEqual(locs("a b # Comment\n12 @x"), [
             "test.fea:1:1", "test.fea:1:3", "test.fea:2:1",
-            "test.fea:2:4", "test.fea:2:5"
+            "test.fea:2:4"
         ])
 
     def test_scan_over_(self):
diff --git a/Lib/fontTools/feaLib/parser.py b/Lib/fontTools/feaLib/parser.py
index 9a930e4df..d5e7256e3 100644
--- a/Lib/fontTools/feaLib/parser.py
+++ b/Lib/fontTools/feaLib/parser.py
@@ -1,8 +1,9 @@
 from __future__ import print_function, division, absolute_import
 from __future__ import unicode_literals
 from fontTools.feaLib.lexer import Lexer, IncludingLexer
-
 import fontTools.feaLib.ast as ast
+import os
+import re
 
 
 class ParserError(Exception):
@@ -30,13 +31,46 @@ class Parser(object):
 
     def parse(self):
         while self.next_token_type_ is not None:
-            keyword = self.expect_keyword_({"feature", "languagesystem"})
-            if keyword == "languagesystem":
+            self.advance_lexer_()
+            if self.cur_token_type_ is Lexer.GLYPHCLASS:
+                self.parse_glyphclass_definition_()
+            elif self.is_cur_keyword_("languagesystem"):
                 self.parse_languagesystem_()
-            elif keyword == "feature":
+            elif self.is_cur_keyword_("feature"):
                 break  # TODO: Implement
+            else:
+                raise ParserError("Expected languagesystem, feature, or "
+                                  "glyph class definition",
+                                  self.cur_token_location_)
         return self.doc_
 
+    def parse_glyphclass_definition_(self):  # cur token: the class name
+        location, name = self.cur_token_location_, self.cur_token_
+        self.expect_symbol_("=")
+        glyphs = self.parse_glyphclass_reference_()  # a set of glyph names
+        self.expect_symbol_(";")
+        glyphclass = ast.GlyphClassDefinition(location, name, glyphs)
+        self.doc_.statements.append(glyphclass)
+
+    def parse_glyphclass_reference_(self):
+        """Parse "[a b c.sc-f.sc]" and return the glyph names as a set."""
+        result = set()
+        self.expect_symbol_("[")
+        while self.next_token_ != "]":
+            # expect_name_ rejects every other token, end of input included;
+            # skipping such a token without consuming it would never end.
+            start = self.expect_name_()
+            if self.next_token_ == "-":
+                location = self.cur_token_location_
+                self.expect_symbol_("-")
+                limit = self.expect_name_()
+                result.update(
+                    self.make_glyph_range_(location, start, limit))
+            else:
+                result.add(start)
+        self.expect_symbol_("]")
+        return result
+
     def parse_languagesystem_(self):
         location = self.cur_token_location_
         script, language = self.expect_tag_(), self.expect_tag_()
@@ -44,13 +78,8 @@ class Parser(object):
         langsys = ast.LanguageSystemStatement(location, script, language)
         self.doc_.statements.append(langsys)
 
-    def expect_keyword_(self, keywords):
-        self.advance_lexer_()
-        if self.cur_token_type_ is Lexer.NAME and self.cur_token_ in keywords:
-            return self.cur_token_
-        s = ", ".join(sorted(list(keywords)))
-        raise ParserError("Expected one of %s" % s,
-                          self.cur_token_location_)
+    def is_cur_keyword_(self, k):  # is the current token the NAME k?
+        return (self.cur_token_type_ is Lexer.NAME) and (self.cur_token_ == k)
 
     def expect_tag_(self):
         self.advance_lexer_()
@@ -67,6 +96,12 @@ class Parser(object):
             return symbol
         raise ParserError("Expected '%s'" % symbol, self.cur_token_location_)
 
+    def expect_name_(self):
+        self.advance_lexer_()  # step onto the token that must be a NAME
+        if self.cur_token_type_ is not Lexer.NAME:
+            raise ParserError("Expected a name", self.cur_token_location_)
+        return self.cur_token_
+
     def advance_lexer_(self):
         self.cur_token_type_, self.cur_token_, self.cur_token_location_ = (
             self.next_token_type_, self.next_token_, self.next_token_location_)
@@ -75,3 +110,45 @@ class Parser(object):
              self.next_token_location_) = self.lexer_.next()
         except StopIteration:
             self.next_token_type_, self.next_token_ = (None, None)
+
+    def make_glyph_range_(self, location, start, limit):
+        """("a.sc", "d.sc") --> {"a.sc", "b.sc", "c.sc", "d.sc"}
+
+        The two names must be equally long and may differ in one letter
+        (both uppercase or both lowercase) or in one to three digits;
+        anything else raises ParserError at the given location.
+        """
+        if len(start) != len(limit):
+            raise ParserError(
+                "Bad range: \"%s\" and \"%s\" should have the same length" %
+                (start, limit), location)
+
+        # Peel off what both names share at the front and at the back;
+        # only the part in between varies across the range.
+        prefix = os.path.commonprefix([start, limit])
+        suffix = os.path.commonprefix([start[::-1], limit[::-1]])[::-1]
+        stop = -len(suffix) if suffix else None
+        lo, hi = start[len(prefix):stop], limit[len(prefix):stop]
+
+        # lo and hi are equally long, so digit strings order numerically here.
+        if lo >= hi:
+            raise ParserError("Start of range must be smaller than its end",
+                              location)
+
+        # Single letters: walk the code points, never mixing the two cases.
+        for letter in (r'^[A-Z]$', r'^[a-z]$'):
+            if re.match(letter, lo) and re.match(letter, hi):
+                return set("%s%c%s" % (prefix, c, suffix)
+                           for c in range(ord(lo), ord(hi) + 1))
+
+        # Digits: count upwards, zero-padding each number to the original
+        # width, so that 9 in a two-digit range is written "09".
+        number = r'^[0-9]{1,3}$'
+        if re.match(number, lo) and re.match(number, hi):
+            result = set()
+            for i in range(int(lo, 10), int(hi, 10) + 1):
+                padded = ("000" + str(i))[-len(lo):]
+                result.add("%s%s%s" % (prefix, padded, suffix))
+            return result
+
+        raise ParserError("Bad range: \"%s-%s\"" % (start, limit), location)
diff --git a/Lib/fontTools/feaLib/parser_test.py b/Lib/fontTools/feaLib/parser_test.py
index ea6f95801..572b98f71 100644
--- a/Lib/fontTools/feaLib/parser_test.py
+++ b/Lib/fontTools/feaLib/parser_test.py
@@ -19,6 +19,62 @@ class ParserTest(unittest.TestCase):
         if not hasattr(self, "assertRaisesRegex"):
             self.assertRaisesRegex = self.assertRaisesRegexp
 
+    def test_glyphclass(self):
+        [gc] = self.parse("@dash = [endash emdash figuredash];").statements
+        self.assertEqual(gc.name, "dash")
+        self.assertEqual(gc.glyphs, {"endash", "emdash", "figuredash"})
+
+    def test_glyphclass_range_uppercase(self):
+        [gc] = self.parse("@swashes = [X.swash-Z.swash];").statements
+        self.assertEqual(gc.name, "swashes")
+        self.assertEqual(gc.glyphs, {"X.swash", "Y.swash", "Z.swash"})
+
+    def test_glyphclass_range_lowercase(self):
+        [gc] = self.parse("@defg.sc = [d.sc-g.sc];").statements
+        self.assertEqual(gc.name, "defg.sc")
+        self.assertEqual(gc.glyphs, {"d.sc", "e.sc", "f.sc", "g.sc"})
+
+    def test_glyphclass_range_digit1(self):
+        [gc] = self.parse("@range = [foo.2-foo.5];").statements
+        self.assertEqual(gc.glyphs, {"foo.2", "foo.3", "foo.4", "foo.5"})
+
+    def test_glyphclass_range_digit2(self):
+        [gc] = self.parse("@range = [foo.09-foo.11];").statements
+        self.assertEqual(gc.glyphs, {"foo.09", "foo.10", "foo.11"})
+
+    def test_glyphclass_range_digit3(self):
+        [gc] = self.parse("@range = [foo.123-foo.125];").statements
+        self.assertEqual(gc.glyphs, {"foo.123", "foo.124", "foo.125"})
+
+    def test_glyphclass_range_bad(self):
+        # Each malformed range must be rejected with its own message.
+        cases = [
+            ("@bad = [a-foobar];",
+             "Bad range: \"a\" and \"foobar\" should have the same length"),
+            ("@bad = [A.swash-z.swash];",
+             "Bad range: \"A.swash-z.swash\""),
+            ("@bad = [B.swash-A.swash];",
+             "Start of range must be smaller than its end"),
+            ("@bad = [foo.1234-foo.9876];",
+             "Bad range: \"foo.1234-foo.9876\""),
+        ]
+        for text, message in cases:
+            self.assertRaisesRegex(ParserError, message, self.parse, text)
+
+    def test_glyphclass_range_mixed(self):
+        [gc] = self.parse("@range = [a foo.09-foo.11 X.sc-Z.sc];").statements
+        self.assertEqual(gc.glyphs, {
+            "a", "foo.09", "foo.10", "foo.11", "X.sc", "Y.sc", "Z.sc"
+        })
+
+    # TODO: self.parse("@foo = [a b]; @bar = [@foo];")
+    # TODO: self.parse("@foo = [a b]; @bar = @foo;")
+
+    def test_glyphclass_empty(self):
+        [gc] = self.parse("@empty_set = [];").statements
+        self.assertEqual(gc.name, "empty_set")
+        self.assertEqual(gc.glyphs, set())
+
     def test_languagesystem(self):
         [langsys] = self.parse("languagesystem latn DEU;").statements
         self.assertEqual(langsys.script, "latn")