[varLib.varStore] Implement VarStore optimizer

On NotoSans-VF.ttf (a 5MB font), saves 15% total size by optimizing
GDEF VarStore and resulting GPOS shrinkage.
This commit is contained in:
Behdad Esfahbod 2018-02-21 00:18:01 -08:00
parent 073227bbb0
commit 3c4203d985

View File

@ -7,6 +7,8 @@ from fontTools.varLib.builder import (buildVarRegionList, buildVarStore,
buildVarRegion, buildVarData, buildVarRegion, buildVarData,
VarData_CalculateNumShorts) VarData_CalculateNumShorts)
from functools import partial from functools import partial
from collections import defaultdict
from array import array
def _getLocationKey(loc): def _getLocationKey(loc):
@ -230,19 +232,210 @@ ot.GSUB.remap_device_varidxes = Object_remap_device_varidxes
ot.GPOS.remap_device_varidxes = Object_remap_device_varidxes ot.GPOS.remap_device_varidxes = Object_remap_device_varidxes
class _Encoding(object):
def __init__(self, chars):
self.chars = chars
self.width = self._popcount(chars)
self.overhead = self._characteristic_overhead(chars)
self.items = []
def append(self, row):
self.items.append(row)
def extend(self, lst):
self.items.extend(lst)
def get_room(self):
"""Maximum number of bytes that can be added to characteristic
while still being beneficial to merge it into another one."""
count = len(self.items)
return max(0, (self.overhead - 1) // count - self.width)
room = property(get_room)
@property
def gain(self):
"""Maximum possible byte gain from merging this into another
characteristic."""
count = len(self.items)
return max(0, self.overhead - count * (self.width + 1))
def sort_key(self):
return self.width, self.chars
def __len__(self):
return len(self.items)
def can_encode(self, chars):
return not (chars & ~self.chars)
def __sub__(self, other):
return self._popcount(self.chars & ~other.chars)
@staticmethod
def _popcount(n):
# Apparently this is the fastest native way to do it...
# https://stackoverflow.com/a/9831671
return bin(n).count('1')
@staticmethod
def _characteristic_overhead(chars):
"""Returns overhead in bytes of encoding this characteristic
as a VarData."""
c = 6
while chars:
if chars & 3:
c += 2
chars >>= 2
return c
def _find_yourself_best_new_encoding(self, done_by_width):
self.best_new_encoding = None
for new_width in range(self.width+1, self.width+self.room+1):
for new_encoding in done_by_width[new_width]:
if new_encoding.can_encode(self.chars):
break
else:
new_encoding = None
self.best_new_encoding = new_encoding
class _EncodingDict(dict):
def __missing__(self, chars):
r = self[chars] = _Encoding(chars)
return r
def add_row(self, row):
chars = self._row_characteristics(row)
self[chars].append(row)
@staticmethod
def _row_characteristics(row):
"""Returns encoding characteristics for a row."""
chars = 0
i = 1
for v in row:
if v:
chars += i
if not (-128 <= v <= 127):
chars += i * 2
i <<= 2
return chars
def VarStore_optimize(self): def VarStore_optimize(self):
"""Optimize storage. Returns mapping from old VarIdxes to new ones.""" """Optimize storage. Returns mapping from old VarIdxes to new ones."""
# TODO # TODO
# Check that no two VarRegions are the same; if they are, fold them. # Check that no two VarRegions are the same; if they are, fold them.
# Also that same VarData does not reference same VarRegion more than once...
mapping = {} n = len(self.VarRegionList.Region) # Number of columns
zeroes = array('h', [0]*n)
front_mapping = {} # Map from old VarIdxes to full row tuples
encodings = _EncodingDict()
# Collect all items into a set of full rows (with lots of zeroes.)
for major,data in enumerate(self.VarData): for major,data in enumerate(self.VarData):
for minor,row in enumerate(data.Item): regionIndices = data.VarRegionIndex
mapping[(major<<16)+minor] = (major<<16)+minor
for minor,item in enumerate(data.Item):
row = array('h', zeroes)
for regionIdx,v in zip(regionIndices, item):
row[regionIdx] += v
row = tuple(row)
encodings.add_row(row)
front_mapping[(major<<16)+minor] = row
# Separate encodings that have no gain (are decided) and those having
# possible gain (possibly to be merged into others.)
encodings = sorted(encodings.values(), key=_Encoding.__len__, reverse=True)
done_by_width = defaultdict(list)
todo = []
for encoding in encodings:
if not encoding.gain:
done_by_width[encoding.width].append(encoding)
else:
todo.append(encoding)
# For each encoding that is possibly to be merged, find the best match
# in the decided encodings, and record that.
todo.sort(key=_Encoding.get_room)
for encoding in todo:
encoding._find_yourself_best_new_encoding(done_by_width)
# Walk through todo encodings, for each, see if merging it with
# another todo encoding gains more than each of them merging with
# their best decided encoding. If yes, merge them and add resulting
# encoding back to todo queue. If not, move the enconding to decided
# list. Repeat till done.
while todo:
encoding = todo.pop()
best_idx = None
best_gain = 0
for i,other_encoding in enumerate(todo):
combined_chars = other_encoding.chars | encoding.chars
combined_width = _Encoding._popcount(combined_chars)
combined_overhead = _Encoding._characteristic_overhead(combined_chars)
combined_gain = (
+ encoding.overhead
+ other_encoding.overhead
- combined_overhead
- (combined_width - encoding.width) * len(encoding)
- (combined_width - other_encoding.width) * len(other_encoding)
)
this_gain = 0 if encoding.best_new_encoding is None else (
+ encoding.overhead
- (encoding.best_new_encoding.width - encoding.width) * len(encoding)
)
other_gain = 0 if other_encoding.best_new_encoding is None else (
+ other_encoding.overhead
- (other_encoding.best_new_encoding.width - other_encoding.width) * len(other_encoding)
)
separate_gain = this_gain + other_gain
if combined_gain > separate_gain:
best_idx = i
best_gain = combined_gain - separate_gain
if best_idx is None:
# Encoding is decided as is
done_by_width[encoding.width].append(encoding)
else:
other_encoding = todo[best_idx]
combined_chars = other_encoding.chars | encoding.chars
combined_encoding = _Encoding(combined_chars)
combined_encoding.extend(encoding.items)
combined_encoding.extend(other_encoding.items)
combined_encoding._find_yourself_best_new_encoding(done_by_width)
del todo[best_idx]
todo.append(combined_encoding)
# Assemble final store.
back_mapping = {} # Mapping from full rows to new VarIdxes
encodings = sum(done_by_width.values(), [])
encodings.sort(key=_Encoding.sort_key)
self.VarData = []
for major,encoding in enumerate(encodings):
data = ot.VarData()
self.VarData.append(data)
data.VarRegionIndex = range(n)
data.VarRegionCount = len(data.VarRegionIndex)
data.Item = sorted(encoding.items)
for minor,item in enumerate(data.Item):
back_mapping[item] = (major<<16)+minor
# Compile final mapping.
varidx_map = {}
for k,v in front_mapping.items():
varidx_map[k] = back_mapping[v]
# Remove unused regions.
self.prune_regions() self.prune_regions()
# Recalculate things and go home. # Recalculate things and go home.
@ -252,7 +445,7 @@ def VarStore_optimize(self):
data.ItemCount = len(data.Item) data.ItemCount = len(data.Item)
VarData_CalculateNumShorts(data) VarData_CalculateNumShorts(data)
return mapping return varidx_map
ot.VarStore.optimize = VarStore_optimize ot.VarStore.optimize = VarStore_optimize
@ -283,7 +476,13 @@ def main(args=None):
#size = len(writer.getAllData()) #size = len(writer.getAllData())
#print("Before: %7d bytes" % size) #print("Before: %7d bytes" % size)
store.optimize() varidx_map = store.optimize()
gdef.table.remap_device_varidxes(varidx_map)
if 'GSUB' in font:
font['GSUB'].table.remap_device_varidxes(varidx_map)
if 'GPOS' in font:
font['GPOS'].table.remap_device_varidxes(varidx_map)
writer = OTTableWriter() writer = OTTableWriter()
store.compile(writer, font) store.compile(writer, font)