fonttools/Scripts/Contributed/FontLabTokenize.py

# -*- coding: utf-8 -*-
"""FontLab Tokenize
Tokenize FontLabs preview/metrics text into single characters
respecting escaped glyph names (eg. “/A.smcp”) and providing a
lossless reverse function. Sample usage (and actual test suite):
>>> tokenize('/A/B/C')
['/A', '/B', '/C']
>>> tokenize('abcde/B/C')
['a', 'b', 'c', 'd', 'e', '/B', '/C']
>>> tokenize('foo/A.smcp/B.smcp abc')
['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c']
>>> p = ['f', 'o', 'o', '/A.smcp', '/B.smcp', 'a', 'b', 'c']
>>> serialize(p)
'foo/A.smcp/B.smcp abc'
>>> tokenize('/a /b /c')
['/a', '/b', '/c']
>>> tokenize('/a/b c')
['/a', '/b', 'c']
>>> tokenize('@a@b@')
['@', 'a', '@', 'b', '@']
>>> tokenize('abc def ghi ')
['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' ']
>>> p = ['a', 'b', 'c', ' ', 'd', 'e', 'f', ' ', 'g', 'h', 'i', ' ']
>>> serialize(p)
'abc def ghi '
>>> serialize(['/a', 'b', '/c', 'd'])
'/a b/c d'
"""
__author__ = 'Antonio Cavedoni <http://cavedoni.com/>'
__version__ = '0.1'
__svnid__ = '$Id$'
__license__ = 'Python'

def tokenize(input):
    tokens = []
    escaped = []
    for i in range(len(input)):
        x = input[i]
        if x != '/' and not escaped:
            tokens.append(x)
        else:
            if x == '/' and not escaped:
                # append the slash so the escaped list is no longer
                # false: starts capturing elements
                escaped.append(x)
            elif x != '/' and escaped:
                if i == (len(input) - 1):
                    escaped.append(x)
                    tokens.append("".join(escaped))
                else:
                    if x == ' ':
                        tokens.append("".join(escaped))
                        escaped = []
                    else:
                        escaped.append(x)
            elif x == '/' and escaped:
                # starts a new sequence, so flush the escaped buffer
                # and start anew
                tokens.append("".join(escaped))
                escaped = [x]
    return tokens

def serialize(tokens):
    series = []
    for i in range(len(tokens)):
        t = tokens[i]
        if t.startswith('/') and i != (len(tokens) - 1):
            # an escaped glyph name needs a trailing space as a delimiter,
            # unless the next token is itself escaped
            if not tokens[i+1].startswith('/'):
                series.append(t + ' ')
            else:
                series.append(t)
        else:
            series.append(t)
    return "".join(series)

if __name__ == "__main__":
    import doctest
    doctest.testmod()
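
    # A minimal round-trip sketch using one of the docstring samples:
    # tokenize() splits the string and serialize() reassembles it,
    # keeping escaped glyph names intact.
    sample = 'foo/A.smcp/B.smcp abc'
    parts = tokenize(sample)
    assert serialize(parts) == sample
    print(parts)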