Don’t calculate whole sets of unicode codepoints

_getUnicodeRangeSets used to calculate sets containing lots of numbers, only to get intersections between a set and ranges. Creating and manipulating a lot of big sets requires a lot of memory. The function has been replaced by _getUnicodeRanges, returning a list of range starts boundaries and a list of range stops + corresponding bits. Tests on intersectUnicodeRanges save about 130 MB (!) of RAM, with no significant speed penalty.
2020-06-04 00:33:26 +02:00 · 2020-06-04 00:33:26 +02:00 · 2a1f09dbb2
commit 2a1f09dbb2
parent 26ac716a8d
1 changed files with 26 additions and 21 deletions
--- a/Lib/fontTools/ttLib/tables/O_S_2f_2.py
+++ b/Lib/fontTools/ttLib/tables/O_S_2f_2.py
@ -2,6 +2,7 @@ from fontTools.misc.py23 import *
 from fontTools.misc import sstruct
 from fontTools.misc.textTools import safeEval, num2binary, binary2num
 from fontTools.ttLib.tables import DefaultTable
+import bisect
 import logging


@ -475,22 +476,19 @@ OS2_UNICODE_RANGES = (
 )


-_unicodeRangeSets = []
+_unicodeStarts = []
+_unicodeValues = [None]

-def _getUnicodeRangeSets():
-	# build the sets of codepoints for each unicode range bit, and cache result
-	if not _unicodeRangeSets:
-		for bit, blocks in enumerate(OS2_UNICODE_RANGES):
-			rangeset = set()
-			for _, (start, stop) in blocks:
-				rangeset.update(set(range(start, stop+1)))
-			if bit == 57:
-				# The spec says that bit 57 ("Non Plane 0") implies that there's
-				# at least one codepoint beyond the BMP; so I also include all
-				# the non-BMP codepoints here
-				rangeset.update(set(range(0x10000, 0x110000)))
-			_unicodeRangeSets.append(rangeset)
-	return _unicodeRangeSets
+def _getUnicodeRanges():
+	# build the ranges of codepoints for each unicode range bit, and cache result
+	if not _unicodeStarts:
+		unicodeRanges = [
+			(start, (stop, bit)) for bit, blocks in enumerate(OS2_UNICODE_RANGES)
+			for _, (start, stop) in blocks]
+		for start, (stop, bit) in sorted(unicodeRanges):
+			_unicodeStarts.append(start)
+			_unicodeValues.append((stop, bit))
+	return _unicodeStarts, _unicodeValues


 def intersectUnicodeRanges(unicodes, inverse=False):
@ -504,15 +502,22 @@ def intersectUnicodeRanges(unicodes, inverse=False):
 	>>> intersectUnicodeRanges([0x0410, 0x1F000]) == {9, 57, 122}
 	True
 	>>> intersectUnicodeRanges([0x0410, 0x1F000], inverse=True) == (
-	...     set(range(123)) - {9, 57, 122})
+	...     set(range(len(OS2_UNICODE_RANGES))) - {9, 57, 122})
 	True
 	"""
 	unicodes = set(unicodes)
-	uniranges = _getUnicodeRangeSets()
-	bits = set([
-		bit for bit, unirange in enumerate(uniranges)
-		if not unirange.isdisjoint(unicodes) ^ inverse])
-	return bits
+	unicodestarts, unicodevalues = _getUnicodeRanges()
+	bits = set()
+	for code in unicodes:
+		stop, bit = unicodevalues[bisect.bisect(unicodestarts, code)]
+		if code <= stop:
+			bits.add(bit)
+	# The spec says that bit 57 ("Non Plane 0") implies that there's
+	# at least one codepoint beyond the BMP; so I also include all
+	# the non-BMP codepoints here
+	if any(0x10000 <= code < 0x110000 for code in unicodes):
+		bits.add(57)
+	return set(range(len(OS2_UNICODE_RANGES))) - bits if inverse else bits


 if __name__ == "__main__":