diff --git a/Lib/fontTools/ttLib/tables/O_S_2f_2.py b/Lib/fontTools/ttLib/tables/O_S_2f_2.py index 7e29a991c..04200cadd 100644 --- a/Lib/fontTools/ttLib/tables/O_S_2f_2.py +++ b/Lib/fontTools/ttLib/tables/O_S_2f_2.py @@ -2,7 +2,7 @@ from __future__ import print_function, division, absolute_import from fontTools.misc.py23 import * from fontTools.misc import sstruct from fontTools.misc.textTools import safeEval, num2binary, binary2num -from . import DefaultTable +from fontTools.ttLib.tables import DefaultTable import warnings @@ -228,3 +228,276 @@ class table_O_S_2f_2(DefaultTable.DefaultTable): @fsLastCharIndex.setter def fsLastCharIndex(self, value): self.usLastCharIndex = value + + def getUnicodeRanges(self): + """ Return the set of 'ulUnicodeRange*' bits currently enabled. """ + bits = set() + ul1, ul2 = self.ulUnicodeRange1, self.ulUnicodeRange2 + ul3, ul4 = self.ulUnicodeRange3, self.ulUnicodeRange4 + for i in range(32): + if ul1 & (1 << i): + bits.add(i) + if ul2 & (1 << i): + bits.add(i + 32) + if ul3 & (1 << i): + bits.add(i + 64) + if ul4 & (1 << i): + bits.add(i + 96) + return bits + + def setUnicodeRanges(self, bits): + """ Set the 'ulUnicodeRange*' fields to the specified 'bits'. """ + ul1, ul2, ul3, ul4 = 0, 0, 0, 0 + for bit in bits: + if 0 <= bit < 32: + ul1 |= (1 << bit) + elif 32 <= bit < 64: + ul2 |= (1 << (bit - 32)) + elif 64 <= bit < 96: + ul3 |= (1 << (bit - 64)) + elif 96 <= bit < 123: + ul4 |= (1 << (bit - 96)) + else: + raise ValueError('expected 0 <= int <= 122, found: %r' % bit) + self.ulUnicodeRange1, self.ulUnicodeRange2 = ul1, ul2 + self.ulUnicodeRange3, self.ulUnicodeRange4 = ul3, ul4 + + def recalcUnicodeRanges(self, ttFont, pruneOnly=False): + """ Intersect the codepoints in the font's Unicode cmap subtables with + the Unicode block ranges defined in the OpenType specification (v1.7), + and set the respective 'ulUnicodeRange*' bits if there is at least ONE + intersection. + If 'pruneOnly' is True, only clear unused bits with NO intersection. + """ + unicodes = set() + for table in ttFont['cmap'].tables: + if table.isUnicode(): + unicodes.update(table.cmap.keys()) + if pruneOnly: + empty = intersectUnicodeRanges(unicodes, inverse=True) + bits = self.getUnicodeRanges() - empty + else: + bits = intersectUnicodeRanges(unicodes) + self.setUnicodeRanges(bits) + return bits + + +# Unicode ranges data from the OpenType OS/2 table specification v1.7 + +OS2_UNICODE_RANGES = ( + (('Basic Latin', (0x0000, 0x007F)),), + (('Latin-1 Supplement', (0x0080, 0x00FF)),), + (('Latin Extended-A', (0x0100, 0x017F)),), + (('Latin Extended-B', (0x0180, 0x024F)),), + (('IPA Extensions', (0x0250, 0x02AF)), + ('Phonetic Extensions', (0x1D00, 0x1D7F)), + ('Phonetic Extensions Supplement', (0x1D80, 0x1DBF))), + (('Spacing Modifier Letters', (0x02B0, 0x02FF)), + ('Modifier Tone Letters', (0xA700, 0xA71F))), + (('Combining Diacritical Marks', (0x0300, 0x036F)), + ('Combining Diacritical Marks Supplement', (0x1DC0, 0x1DFF))), + (('Greek and Coptic', (0x0370, 0x03FF)),), + (('Coptic', (0x2C80, 0x2CFF)),), + (('Cyrillic', (0x0400, 0x04FF)), + ('Cyrillic Supplement', (0x0500, 0x052F)), + ('Cyrillic Extended-A', (0x2DE0, 0x2DFF)), + ('Cyrillic Extended-B', (0xA640, 0xA69F))), + (('Armenian', (0x0530, 0x058F)),), + (('Hebrew', (0x0590, 0x05FF)),), + (('Vai', (0xA500, 0xA63F)),), + (('Arabic', (0x0600, 0x06FF)), + ('Arabic Supplement', (0x0750, 0x077F))), + (('NKo', (0x07C0, 0x07FF)),), + (('Devanagari', (0x0900, 0x097F)),), + (('Bengali', (0x0980, 0x09FF)),), + (('Gurmukhi', (0x0A00, 0x0A7F)),), + (('Gujarati', (0x0A80, 0x0AFF)),), + (('Oriya', (0x0B00, 0x0B7F)),), + (('Tamil', (0x0B80, 0x0BFF)),), + (('Telugu', (0x0C00, 0x0C7F)),), + (('Kannada', (0x0C80, 0x0CFF)),), + (('Malayalam', (0x0D00, 0x0D7F)),), + (('Thai', (0x0E00, 0x0E7F)),), + (('Lao', (0x0E80, 0x0EFF)),), + (('Georgian', (0x10A0, 0x10FF)), + ('Georgian Supplement', (0x2D00, 0x2D2F))), + (('Balinese', (0x1B00, 0x1B7F)),), + (('Hangul Jamo', (0x1100, 0x11FF)),), + (('Latin Extended Additional', (0x1E00, 0x1EFF)), + ('Latin Extended-C', (0x2C60, 0x2C7F)), + ('Latin Extended-D', (0xA720, 0xA7FF))), + (('Greek Extended', (0x1F00, 0x1FFF)),), + (('General Punctuation', (0x2000, 0x206F)), + ('Supplemental Punctuation', (0x2E00, 0x2E7F))), + (('Superscripts And Subscripts', (0x2070, 0x209F)),), + (('Currency Symbols', (0x20A0, 0x20CF)),), + (('Combining Diacritical Marks For Symbols', (0x20D0, 0x20FF)),), + (('Letterlike Symbols', (0x2100, 0x214F)),), + (('Number Forms', (0x2150, 0x218F)),), + (('Arrows', (0x2190, 0x21FF)), + ('Supplemental Arrows-A', (0x27F0, 0x27FF)), + ('Supplemental Arrows-B', (0x2900, 0x297F)), + ('Miscellaneous Symbols and Arrows', (0x2B00, 0x2BFF))), + (('Mathematical Operators', (0x2200, 0x22FF)), + ('Supplemental Mathematical Operators', (0x2A00, 0x2AFF)), + ('Miscellaneous Mathematical Symbols-A', (0x27C0, 0x27EF)), + ('Miscellaneous Mathematical Symbols-B', (0x2980, 0x29FF))), + (('Miscellaneous Technical', (0x2300, 0x23FF)),), + (('Control Pictures', (0x2400, 0x243F)),), + (('Optical Character Recognition', (0x2440, 0x245F)),), + (('Enclosed Alphanumerics', (0x2460, 0x24FF)),), + (('Box Drawing', (0x2500, 0x257F)),), + (('Block Elements', (0x2580, 0x259F)),), + (('Geometric Shapes', (0x25A0, 0x25FF)),), + (('Miscellaneous Symbols', (0x2600, 0x26FF)),), + (('Dingbats', (0x2700, 0x27BF)),), + (('CJK Symbols And Punctuation', (0x3000, 0x303F)),), + (('Hiragana', (0x3040, 0x309F)),), + (('Katakana', (0x30A0, 0x30FF)), + ('Katakana Phonetic Extensions', (0x31F0, 0x31FF))), + (('Bopomofo', (0x3100, 0x312F)), + ('Bopomofo Extended', (0x31A0, 0x31BF))), + (('Hangul Compatibility Jamo', (0x3130, 0x318F)),), + (('Phags-pa', (0xA840, 0xA87F)),), + (('Enclosed CJK Letters And Months', (0x3200, 0x32FF)),), + (('CJK Compatibility', (0x3300, 0x33FF)),), + (('Hangul Syllables', (0xAC00, 0xD7AF)),), + (('Non-Plane 0 *', (0xD800, 0xDFFF)),), + (('Phoenician', (0x10900, 0x1091F)),), + (('CJK Unified Ideographs', (0x4E00, 0x9FFF)), + ('CJK Radicals Supplement', (0x2E80, 0x2EFF)), + ('Kangxi Radicals', (0x2F00, 0x2FDF)), + ('Ideographic Description Characters', (0x2FF0, 0x2FFF)), + ('CJK Unified Ideographs Extension A', (0x3400, 0x4DBF)), + ('CJK Unified Ideographs Extension B', (0x20000, 0x2A6DF)), + ('Kanbun', (0x3190, 0x319F))), + (('Private Use Area (plane 0)', (0xE000, 0xF8FF)),), + (('CJK Strokes', (0x31C0, 0x31EF)), + ('CJK Compatibility Ideographs', (0xF900, 0xFAFF)), + ('CJK Compatibility Ideographs Supplement', (0x2F800, 0x2FA1F))), + (('Alphabetic Presentation Forms', (0xFB00, 0xFB4F)),), + (('Arabic Presentation Forms-A', (0xFB50, 0xFDFF)),), + (('Combining Half Marks', (0xFE20, 0xFE2F)),), + (('Vertical Forms', (0xFE10, 0xFE1F)), + ('CJK Compatibility Forms', (0xFE30, 0xFE4F))), + (('Small Form Variants', (0xFE50, 0xFE6F)),), + (('Arabic Presentation Forms-B', (0xFE70, 0xFEFF)),), + (('Halfwidth And Fullwidth Forms', (0xFF00, 0xFFEF)),), + (('Specials', (0xFFF0, 0xFFFF)),), + (('Tibetan', (0x0F00, 0x0FFF)),), + (('Syriac', (0x0700, 0x074F)),), + (('Thaana', (0x0780, 0x07BF)),), + (('Sinhala', (0x0D80, 0x0DFF)),), + (('Myanmar', (0x1000, 0x109F)),), + (('Ethiopic', (0x1200, 0x137F)), + ('Ethiopic Supplement', (0x1380, 0x139F)), + ('Ethiopic Extended', (0x2D80, 0x2DDF))), + (('Cherokee', (0x13A0, 0x13FF)),), + (('Unified Canadian Aboriginal Syllabics', (0x1400, 0x167F)),), + (('Ogham', (0x1680, 0x169F)),), + (('Runic', (0x16A0, 0x16FF)),), + (('Khmer', (0x1780, 0x17FF)), + ('Khmer Symbols', (0x19E0, 0x19FF))), + (('Mongolian', (0x1800, 0x18AF)),), + (('Braille Patterns', (0x2800, 0x28FF)),), + (('Yi Syllables', (0xA000, 0xA48F)), + ('Yi Radicals', (0xA490, 0xA4CF))), + (('Tagalog', (0x1700, 0x171F)), + ('Hanunoo', (0x1720, 0x173F)), + ('Buhid', (0x1740, 0x175F)), + ('Tagbanwa', (0x1760, 0x177F))), + (('Old Italic', (0x10300, 0x1032F)),), + (('Gothic', (0x10330, 0x1034F)),), + (('Deseret', (0x10400, 0x1044F)),), + (('Byzantine Musical Symbols', (0x1D000, 0x1D0FF)), + ('Musical Symbols', (0x1D100, 0x1D1FF)), + ('Ancient Greek Musical Notation', (0x1D200, 0x1D24F))), + (('Mathematical Alphanumeric Symbols', (0x1D400, 0x1D7FF)),), + (('Private Use (plane 15)', (0xFF000, 0xFFFFD)), + ('Private Use (plane 16)', (0x100000, 0x10FFFD))), + (('Variation Selectors', (0xFE00, 0xFE0F)), + ('Variation Selectors Supplement', (0xE0100, 0xE01EF))), + (('Tags', (0xE0000, 0xE007F)),), + (('Limbu', (0x1900, 0x194F)),), + (('Tai Le', (0x1950, 0x197F)),), + (('New Tai Lue', (0x1980, 0x19DF)),), + (('Buginese', (0x1A00, 0x1A1F)),), + (('Glagolitic', (0x2C00, 0x2C5F)),), + (('Tifinagh', (0x2D30, 0x2D7F)),), + (('Yijing Hexagram Symbols', (0x4DC0, 0x4DFF)),), + (('Syloti Nagri', (0xA800, 0xA82F)),), + (('Linear B Syllabary', (0x10000, 0x1007F)), + ('Linear B Ideograms', (0x10080, 0x100FF)), + ('Aegean Numbers', (0x10100, 0x1013F))), + (('Ancient Greek Numbers', (0x10140, 0x1018F)),), + (('Ugaritic', (0x10380, 0x1039F)),), + (('Old Persian', (0x103A0, 0x103DF)),), + (('Shavian', (0x10450, 0x1047F)),), + (('Osmanya', (0x10480, 0x104AF)),), + (('Cypriot Syllabary', (0x10800, 0x1083F)),), + (('Kharoshthi', (0x10A00, 0x10A5F)),), + (('Tai Xuan Jing Symbols', (0x1D300, 0x1D35F)),), + (('Cuneiform', (0x12000, 0x123FF)), + ('Cuneiform Numbers and Punctuation', (0x12400, 0x1247F))), + (('Counting Rod Numerals', (0x1D360, 0x1D37F)),), + (('Sundanese', (0x1B80, 0x1BBF)),), + (('Lepcha', (0x1C00, 0x1C4F)),), + (('Ol Chiki', (0x1C50, 0x1C7F)),), + (('Saurashtra', (0xA880, 0xA8DF)),), + (('Kayah Li', (0xA900, 0xA92F)),), + (('Rejang', (0xA930, 0xA95F)),), + (('Cham', (0xAA00, 0xAA5F)),), + (('Ancient Symbols', (0x10190, 0x101CF)),), + (('Phaistos Disc', (0x101D0, 0x101FF)),), + (('Carian', (0x102A0, 0x102DF)), + ('Lycian', (0x10280, 0x1029F)), + ('Lydian', (0x10920, 0x1093F))), + (('Domino Tiles', (0x1F030, 0x1F09F)), + ('Mahjong Tiles', (0x1F000, 0x1F02F))), +) + + +_unicodeRangeSets = [] + +def _getUnicodeRangeSets(): + # build the sets of codepoints for each unicode range bit, and cache result + global _unicodeRangeSets + if not _unicodeRangeSets: + for bit, blocks in enumerate(OS2_UNICODE_RANGES): + rangeset = set() + for _, (start, stop) in blocks: + rangeset.update(set(range(start, stop+1))) + if bit == 57: + # The spec says that bit 57 ("Non Plane 0") implies that there's + # at least one codepoint beyond the BMP; so I also include all + # the non-BMP codepoints here + rangeset.update(set(range(0x10000, 0x110000))) + _unicodeRangeSets.append(rangeset) + return _unicodeRangeSets + + +def intersectUnicodeRanges(unicodes, inverse=False): + """ Intersect a sequence of (int) Unicode codepoints with the Unicode block + ranges defined in the OpenType specification v1.7, and return the set of + 'ulUnicodeRanges' bits for which there is at least ONE intersection. + If 'inverse' is True, return the the bits for which there is NO intersection. + + >>> intersectUnicodeRanges([0x0410]) == {9} + True + >>> intersectUnicodeRanges([0x0410, 0x1F000]) == {9, 57, 122} + True + >>> intersectUnicodeRanges([0x0410, 0x1F000], inverse=True) == ( + ... set(range(123)) - {9, 57, 122}) + True + """ + unicodes = set(unicodes) + uniranges = _getUnicodeRangeSets() + bits = set([ + bit for bit, unirange in enumerate(uniranges) + if not unirange.isdisjoint(unicodes) ^ inverse]) + return bits + + +if __name__ == "__main__": + import doctest, sys + sys.exit(doctest.testmod().failed)