Don’t calculate whole sets of unicode codepoints

_getUnicodeRangeSets used to calculate sets containing lots of numbers, only to
get intersections between a set and ranges. Creating and manipulating a lot of
big sets requires a lot of memory.

The function has been replaced by _getUnicodeRanges, returning a list of range
starts boundaries and a list of range stops + corresponding bits.

Tests on intersectUnicodeRanges save about 130 MB (!) of RAM, with no
significant speed penalty.
This commit is contained in:
Guillaume Ayoub 2020-06-04 00:33:26 +02:00
parent 26ac716a8d
commit 2a1f09dbb2

View File

@ -2,6 +2,7 @@ from fontTools.misc.py23 import *
from fontTools.misc import sstruct
from fontTools.misc.textTools import safeEval, num2binary, binary2num
from fontTools.ttLib.tables import DefaultTable
import bisect
import logging
@ -475,22 +476,19 @@ OS2_UNICODE_RANGES = (
)
_unicodeRangeSets = []
_unicodeStarts = []
_unicodeValues = [None]
def _getUnicodeRangeSets():
# build the sets of codepoints for each unicode range bit, and cache result
if not _unicodeRangeSets:
for bit, blocks in enumerate(OS2_UNICODE_RANGES):
rangeset = set()
for _, (start, stop) in blocks:
rangeset.update(set(range(start, stop+1)))
if bit == 57:
# The spec says that bit 57 ("Non Plane 0") implies that there's
# at least one codepoint beyond the BMP; so I also include all
# the non-BMP codepoints here
rangeset.update(set(range(0x10000, 0x110000)))
_unicodeRangeSets.append(rangeset)
return _unicodeRangeSets
def _getUnicodeRanges():
# build the ranges of codepoints for each unicode range bit, and cache result
if not _unicodeStarts:
unicodeRanges = [
(start, (stop, bit)) for bit, blocks in enumerate(OS2_UNICODE_RANGES)
for _, (start, stop) in blocks]
for start, (stop, bit) in sorted(unicodeRanges):
_unicodeStarts.append(start)
_unicodeValues.append((stop, bit))
return _unicodeStarts, _unicodeValues
def intersectUnicodeRanges(unicodes, inverse=False):
@ -504,15 +502,22 @@ def intersectUnicodeRanges(unicodes, inverse=False):
>>> intersectUnicodeRanges([0x0410, 0x1F000]) == {9, 57, 122}
True
>>> intersectUnicodeRanges([0x0410, 0x1F000], inverse=True) == (
... set(range(123)) - {9, 57, 122})
... set(range(len(OS2_UNICODE_RANGES))) - {9, 57, 122})
True
"""
unicodes = set(unicodes)
uniranges = _getUnicodeRangeSets()
bits = set([
bit for bit, unirange in enumerate(uniranges)
if not unirange.isdisjoint(unicodes) ^ inverse])
return bits
unicodestarts, unicodevalues = _getUnicodeRanges()
bits = set()
for code in unicodes:
stop, bit = unicodevalues[bisect.bisect(unicodestarts, code)]
if code <= stop:
bits.add(bit)
# The spec says that bit 57 ("Non Plane 0") implies that there's
# at least one codepoint beyond the BMP; so I also include all
# the non-BMP codepoints here
if any(0x10000 <= code < 0x110000 for code in unicodes):
bits.add(57)
return set(range(len(OS2_UNICODE_RANGES))) - bits if inverse else bits
if __name__ == "__main__":