[varLib.varStore] Implement VarStore optimizer

On NotoSans-VF.ttf (a 5MB font), saves 15% total size by optimizing GDEF VarStore and resulting GPOS shrinkage.
2018-02-21 00:18:01 -08:00 · 2018-02-21 00:18:01 -08:00 · 3c4203d985
commit 3c4203d985
parent 073227bbb0
1 changed files with 205 additions and 6 deletions
--- a/Lib/fontTools/varLib/varStore.py
+++ b/Lib/fontTools/varLib/varStore.py
@ -7,6 +7,8 @@ from fontTools.varLib.builder import (buildVarRegionList, buildVarStore,
 				      buildVarRegion, buildVarData,
 				      VarData_CalculateNumShorts)
 from functools import partial
 from collections import defaultdict
 from array import array
 def _getLocationKey(loc):
@ -230,19 +232,210 @@ ot.GSUB.remap_device_varidxes = Object_remap_device_varidxes
 ot.GPOS.remap_device_varidxes = Object_remap_device_varidxes
 class _Encoding(object):
 	def __init__(self, chars):
 		self.chars = chars
 		self.width = self._popcount(chars)
 		self.overhead = self._characteristic_overhead(chars)
 		self.items = []
 	def append(self, row):
 		self.items.append(row)
 	def extend(self, lst):
 		self.items.extend(lst)
 	def get_room(self):
 		"""Maximum number of bytes that can be added to characteristic
 		while still being beneficial to merge it into another one."""
 		count = len(self.items)
 		return max(0, (self.overhead - 1) // count - self.width)
 	room = property(get_room)
 	@property
 	def gain(self):
 		"""Maximum possible byte gain from merging this into another
 		characteristic."""
 		count = len(self.items)
 		return max(0, self.overhead - count * (self.width + 1))
 	def sort_key(self):
 		return self.width, self.chars
 	def __len__(self):
 		return len(self.items)
 	def can_encode(self, chars):
 		return not (chars & ~self.chars)
 	def __sub__(self, other):
 		return self._popcount(self.chars & ~other.chars)
 	@staticmethod
 	def _popcount(n):
 		# Apparently this is the fastest native way to do it...
 		# https://stackoverflow.com/a/9831671
 		return bin(n).count('1')
 	@staticmethod
 	def _characteristic_overhead(chars):
 		"""Returns overhead in bytes of encoding this characteristic
 		as a VarData."""
 		c = 6
 		while chars:
 			if chars & 3:
 				c += 2
 			chars >>= 2
 		return c
 	def _find_yourself_best_new_encoding(self, done_by_width):
 		self.best_new_encoding = None
 		for new_width in range(self.width+1, self.width+self.room+1):
 			for new_encoding in done_by_width[new_width]:
 				if new_encoding.can_encode(self.chars):
 					break
 			else:
 				new_encoding = None
 			self.best_new_encoding = new_encoding
 class _EncodingDict(dict):
 	def __missing__(self, chars):
 		r = self[chars] = _Encoding(chars)
 		return r
 	def add_row(self, row):
 		chars = self._row_characteristics(row)
 		self[chars].append(row)
 	@staticmethod
 	def _row_characteristics(row):
 		"""Returns encoding characteristics for a row."""
 		chars = 0
 		i = 1
 		for v in row:
 			if v:
 				chars += i
 			if not (-128 <= v <= 127):
 				chars += i * 2
 			i <<= 2
 		return chars
 def VarStore_optimize(self):
 	"""Optimize storage. Returns mapping from old VarIdxes to new ones."""
 	# TODO
 	# Check that no two VarRegions are the same; if they are, fold them.
 	# Also that same VarData does not reference same VarRegion more than once...
-	mapping = {}
+	n = len(self.VarRegionList.Region) # Number of columns
 	zeroes = array('h', [0]*n)
 	front_mapping = {} # Map from old VarIdxes to full row tuples
 	encodings = _EncodingDict()
 	# Collect all items into a set of full rows (with lots of zeroes.)
 	for major,data in enumerate(self.VarData):
-		for minor,row in enumerate(data.Item):
+		regionIndices = data.VarRegionIndex
 			mapping[(major<<16)+minor] = (major<<16)+minor
 		for minor,item in enumerate(data.Item):
 			row = array('h', zeroes)
 			for regionIdx,v in zip(regionIndices, item):
 				row[regionIdx] += v
 			row = tuple(row)
 			encodings.add_row(row)
 			front_mapping[(major<<16)+minor] = row
 	# Separate encodings that have no gain (are decided) and those having
 	# possible gain (possibly to be merged into others.)
 	encodings = sorted(encodings.values(), key=_Encoding.__len__, reverse=True)
 	done_by_width = defaultdict(list)
 	todo = []
 	for encoding in encodings:
 		if not encoding.gain:
 			done_by_width[encoding.width].append(encoding)
 		else:
 			todo.append(encoding)
 	# For each encoding that is possibly to be merged, find the best match
 	# in the decided encodings, and record that.
 	todo.sort(key=_Encoding.get_room)
 	for encoding in todo:
 		encoding._find_yourself_best_new_encoding(done_by_width)
 	# Walk through todo encodings, for each, see if merging it with
 	# another todo encoding gains more than each of them merging with
 	# their best decided encoding. If yes, merge them and add resulting
 	# encoding back to todo queue.  If not, move the enconding to decided
 	# list.  Repeat till done.
 	while todo:
 		encoding = todo.pop()
 		best_idx = None
 		best_gain = 0
 		for i,other_encoding in enumerate(todo):
 			combined_chars = other_encoding.chars | encoding.chars
 			combined_width = _Encoding._popcount(combined_chars)
 			combined_overhead = _Encoding._characteristic_overhead(combined_chars)
 			combined_gain = (
 					+ encoding.overhead
 					+ other_encoding.overhead
 					- combined_overhead
 					- (combined_width - encoding.width) * len(encoding)
 					- (combined_width - other_encoding.width) * len(other_encoding)
 					)
 			this_gain = 0 if encoding.best_new_encoding is None else (
 						+ encoding.overhead
 						- (encoding.best_new_encoding.width - encoding.width) * len(encoding)
 					)
 			other_gain = 0 if other_encoding.best_new_encoding is None else (
 						+ other_encoding.overhead
 						- (other_encoding.best_new_encoding.width - other_encoding.width) * len(other_encoding)
 					)
 			separate_gain = this_gain + other_gain
 			if combined_gain > separate_gain:
 				best_idx = i
 				best_gain = combined_gain - separate_gain
 		if best_idx is None:
 			# Encoding is decided as is
 			done_by_width[encoding.width].append(encoding)
 		else:
 			other_encoding = todo[best_idx]
 			combined_chars = other_encoding.chars | encoding.chars
 			combined_encoding = _Encoding(combined_chars)
 			combined_encoding.extend(encoding.items)
 			combined_encoding.extend(other_encoding.items)
 			combined_encoding._find_yourself_best_new_encoding(done_by_width)
 			del todo[best_idx]
 			todo.append(combined_encoding)
 	# Assemble final store.
 	back_mapping = {} # Mapping from full rows to new VarIdxes
 	encodings = sum(done_by_width.values(), [])
 	encodings.sort(key=_Encoding.sort_key)
 	self.VarData = []
 	for major,encoding in enumerate(encodings):
 		data = ot.VarData()
 		self.VarData.append(data)
 		data.VarRegionIndex = range(n)
 		data.VarRegionCount = len(data.VarRegionIndex)
 		data.Item = sorted(encoding.items)
 		for minor,item in enumerate(data.Item):
 			back_mapping[item] = (major<<16)+minor
 	# Compile final mapping.
 	varidx_map = {}
 	for k,v in front_mapping.items():
 		varidx_map[k] = back_mapping[v]
 	# Remove unused regions.
 	self.prune_regions()
 	# Recalculate things and go home.
@ -252,7 +445,7 @@ def VarStore_optimize(self):
 		data.ItemCount = len(data.Item)
 		VarData_CalculateNumShorts(data)
-	return mapping
+	return varidx_map
 ot.VarStore.optimize = VarStore_optimize
@ -283,7 +476,13 @@ def main(args=None):
 	#size = len(writer.getAllData())
 	#print("Before: %7d bytes" % size)
-	store.optimize()
+	varidx_map = store.optimize()
 	gdef.table.remap_device_varidxes(varidx_map)
 	if 'GSUB' in font:
 		font['GSUB'].table.remap_device_varidxes(varidx_map)
 	if 'GPOS' in font:
 		font['GPOS'].table.remap_device_varidxes(varidx_map)
 	writer = OTTableWriter()
 	store.compile(writer, font)