# -*- coding: utf-8 -*-
"""FontLab Tokenize
|
|
|
|
|
|
|
|
|
|
Tokenize FontLab’s preview/metrics text into single characters
|
|
|
|
|
respecting escaped glyph names (eg. “/A.smcp”) and providing a
|
|
|
|
|
lossless reverse function. Sample usage (and actual test suite):
|
|
|
|
|
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('/A/B/C')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['/A', '/B', '/C']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('abcde/B/C')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['a', 'b', 'c', 'd', 'e', '/B', '/C']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('foo/A.smcp/B.smcp abc')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> p = ['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c']
|
|
|
|
|
>>> serialize(p)
|
2008-01-17 06:20:03 +00:00
|
|
|
|
'foo/A.smcp/B.smcp abc'
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('/a /b /c')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['/a', '/b', '/c']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('/a/b c')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['/a', '/b', 'c']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('@a@b@')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['@', 'a', '@', 'b', '@']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> tokenize('abc def ghi ')
|
2008-01-17 06:20:03 +00:00
|
|
|
|
['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' ']
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> p = ['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' ']
|
|
|
|
|
>>> serialize(p)
|
2008-01-17 06:20:03 +00:00
|
|
|
|
'abc def ghi '
|
2008-01-19 12:09:29 +00:00
|
|
|
|
>>> serialize(['/a', 'b', '/c', 'd'])
|
2008-01-17 06:20:03 +00:00
|
|
|
|
'/a b/c d'
|
|
|
|
|
"""
__author__ = 'Antonio Cavedoni <http://cavedoni.com/>'
__version__ = '0.1'
__svnid__ = '$Id$'
__license__ = 'Python'
def tokenize(text):
    tokens = []
    escaped = []  # buffer for the glyph name currently being captured
    for x in text:
        if x != '/' and not escaped:
            # plain character outside an escape: one token per character
            tokens.append(x)
        elif x == '/' and not escaped:
            # a slash starts an escaped glyph name; keeping it in the
            # buffer also makes `escaped` truthy, which switches modes
            escaped.append(x)
        elif x != '/' and escaped:
            if x == ' ':
                # a space terminates the glyph name and is consumed,
                # not emitted as a token of its own
                tokens.append("".join(escaped))
                escaped = []
            else:
                escaped.append(x)
        else:
            # x == '/' while escaped: a new glyph name starts, so flush
            # the buffer and start anew
            tokens.append("".join(escaped))
            escaped = [x]
    if escaped:
        # flush a glyph name that runs to the end of the input; this
        # also keeps a terminal space out of the emitted token
        tokens.append("".join(escaped))
    return tokens
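# A hedged sketch, not part of the original module: edge cases that the
# doctests above do not cover, under the flush-after-loop behaviour. The
# helper name `_tokenize_edge_cases` is an illustration only.
def _tokenize_edge_cases():
    # a glyph name that runs to the end of the input is still emitted
    assert tokenize('/a') == ['/a']
    # a stray trailing slash is kept rather than silently dropped
    assert tokenize('a/') == ['a', '/']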
def serialize(tokens):
    series = []
    for i, t in enumerate(tokens):
        # an escaped glyph name needs a terminating space before a
        # following plain character, so tokenize() can find the boundary;
        # before another glyph name the next slash is terminator enough
        if t.startswith('/') and i != len(tokens) - 1 \
                and not tokens[i + 1].startswith('/'):
            series.append(t + ' ')
        else:
            series.append(t)
    return "".join(series)
if __name__ == "__main__":
    # run the doctests in the module docstring; pass -v on the command
    # line for a verbose report of each example
    import doctest
    doctest.testmod()