From 527179619bd3ba4792afb829c614b1f47d7a5d9b Mon Sep 17 00:00:00 2001 From: Jany Belluz Date: Mon, 5 Jul 2021 15:46:59 +0100 Subject: [PATCH] Refactor the Cluster class to be top-level instead of nested in the function --- Lib/fontTools/otlLib/optimize/gpos.py | 286 ++++++++++++++------------ 1 file changed, 155 insertions(+), 131 deletions(-) diff --git a/Lib/fontTools/otlLib/optimize/gpos.py b/Lib/fontTools/otlLib/optimize/gpos.py index b2097a96f..3d3f83908 100644 --- a/Lib/fontTools/otlLib/optimize/gpos.py +++ b/Lib/fontTools/otlLib/optimize/gpos.py @@ -1,5 +1,5 @@ import logging -from collections import defaultdict +from collections import defaultdict, namedtuple from functools import reduce from itertools import chain from math import log2 @@ -156,6 +156,143 @@ def _classDef_bytes( return min(format1_bytes, format2_bytes) +ClusteringContext = namedtuple( + "ClusteringContext", + [ + "lines", + "all_class1", + "all_class1_data", + "all_class2_data", + "valueFormat1_bytes", + "valueFormat2_bytes", + ], +) + + +class Cluster: + # TODO(Python 3.7): Turn this into a dataclass + # ctx: ClusteringContext + # indices_bitmask: int + # Caches + # TODO(Python 3.8): use functools.cached_property instead of the + # manually cached properties, and remove the cache fields listed below. 
+ # _indices: Optional[List[int]] = None + # _column_indices: Optional[List[int]] = None + # _cost: Optional[int] = None + + __slots__ = "ctx", "indices_bitmask", "_indices", "_column_indices", "_cost" + + def __init__(self, ctx: ClusteringContext, indices_bitmask: int): + self.ctx = ctx + self.indices_bitmask = indices_bitmask + self._indices = None + self._column_indices = None + self._cost = None + + @property + def indices(self): + if self._indices is None: + self._indices = bit_indices(self.indices_bitmask) + return self._indices + + @property + def column_indices(self): + if self._column_indices is None: + # Indices of columns that have a 1 in at least 1 line + # => binary OR all the lines + bitmask = reduce(int.__or__, (self.ctx.lines[i] for i in self.indices)) + self._column_indices = bit_indices(bitmask) + return self._column_indices + + @property + def width(self): + # Add 1 because Class2=0 cannot be used but needs to be encoded. + return len(self.column_indices) + 1 + + @property + def cost(self): + if self._cost is None: + self._cost = ( + # 2 bytes to store the offset to this subtable in the Lookup table above + 2 + # Contents of the subtable + # From: https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#pair-adjustment-positioning-format-2-class-pair-adjustment + # uint16 posFormat Format identifier: format = 2 + + 2 + # Offset16 coverageOffset Offset to Coverage table, from beginning of PairPos subtable. + + 2 + + self.coverage_bytes + # uint16 valueFormat1 ValueRecord definition — for the first glyph of the pair (may be zero). + + 2 + # uint16 valueFormat2 ValueRecord definition — for the second glyph of the pair (may be zero). + + 2 + # Offset16 classDef1Offset Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair. + + 2 + + self.classDef1_bytes + # Offset16 classDef2Offset Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair. 
+ + 2 + + self.classDef2_bytes + # uint16 class1Count Number of classes in classDef1 table — includes Class 0. + + 2 + # uint16 class2Count Number of classes in classDef2 table — includes Class 0. + + 2 + # Class1Record class1Records[class1Count] Array of Class1 records, ordered by classes in classDef1. + + (self.ctx.valueFormat1_bytes + self.ctx.valueFormat2_bytes) + * len(self.indices) + * self.width + ) + return self._cost + + @property + def coverage_bytes(self): + format1_bytes = ( + # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-1 + # uint16 coverageFormat Format identifier — format = 1 + # uint16 glyphCount Number of glyphs in the glyph array + 4 + # uint16 glyphArray[glyphCount] Array of glyph IDs — in numerical order + + sum(len(self.ctx.all_class1[i]) for i in self.indices) * 2 + ) + ranges = sorted( + chain.from_iterable(self.ctx.all_class1_data[i][0] for i in self.indices) + ) + merged_range_count = 0 + last = None + for (start, end) in ranges: + if last is not None and start != last + 1: + merged_range_count += 1 + last = end + format2_bytes = ( + # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-2 + # uint16 coverageFormat Format identifier — format = 2 + # uint16 rangeCount Number of RangeRecords + 4 + # RangeRecord rangeRecords[rangeCount] Array of glyph ranges — ordered by startGlyphID. + # uint16 startGlyphID First glyph ID in the range + # uint16 endGlyphID Last glyph ID in the range + # uint16 startCoverageIndex Coverage Index of first glyph ID in range + + merged_range_count * 6 + ) + return min(format1_bytes, format2_bytes) + + @property + def classDef1_bytes(self): + # We can skip encoding one of the Class1 definitions, and use + # Class1=0 to represent it instead, because Class1 is gated by the + # Coverage definition. Use Class1=0 for the highest byte savings. 
+ # Going through all options takes too long, pick the biggest class + # = what happens in otlLib.builder.ClassDefBuilder.classes() + biggest_index = max(self.indices, key=lambda i: len(self.ctx.all_class1[i])) + return _classDef_bytes( + self.ctx.all_class1_data, [i for i in self.indices if i != biggest_index] + ) + + @property + def classDef2_bytes(self): + # All Class2 need to be encoded because we can't use Class2=0 + return _classDef_bytes(self.ctx.all_class2_data, self.column_indices) + + def cluster_pairs_by_class2_coverage_custom_cost( font: TTFont, pairs: Pairs, @@ -196,134 +333,14 @@ def cluster_pairs_by_class2_coverage_custom_cost( valueFormat1_bytes = bit_count(format1) * 2 valueFormat2_bytes = bit_count(format2) * 2 - # Agglomerative clustering by hand, checking the cost gain of the new - # cluster against the previously separate clusters - # Start with 1 cluster per line - # cluster = set of lines = new subtable - # The class is here so it has a closure over the data above (lines, etc.) - class Cluster: - # TODO(Python 3.7): Turn this into a dataclass - # indices: int - # Caches - # TODO(Python 3.8): use functools.cached_property instead of the - # manually cached properties, and remove the cache fields listed below. 
- # _indices: Optional[List[int]] = None - # _column_indices: Optional[List[int]] = None - # _cost: Optional[int] = None - - __slots__ = "indices_bitmask", "_indices", "_column_indices", "_cost" - - def __init__(self, indices_bitmask: int): - self.indices_bitmask = indices_bitmask - self._indices = None - self._column_indices = None - self._cost = None - - @property - def indices(self): - if self._indices is None: - self._indices = bit_indices(self.indices_bitmask) - return self._indices - - @property - def column_indices(self): - if self._column_indices is None: - # Indices of columns that have a 1 in at least 1 line - # => binary OR all the lines - bitmask = reduce(int.__or__, (lines[i] for i in self.indices)) - self._column_indices = bit_indices(bitmask) - return self._column_indices - - @property - def width(self): - # Add 1 because Class2=0 cannot be used but needs to be encoded. - return len(self.column_indices) + 1 - - @property - def cost(self): - if self._cost is None: - self._cost = ( - # 2 bytes to store the offset to this subtable in the Lookup table above - 2 - # Contents of the subtable - # From: https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#pair-adjustment-positioning-format-2-class-pair-adjustment - # uint16 posFormat Format identifier: format = 2 - + 2 - # Offset16 coverageOffset Offset to Coverage table, from beginning of PairPos subtable. - + 2 - + self.coverage_bytes - # uint16 valueFormat1 ValueRecord definition — for the first glyph of the pair (may be zero). - + 2 - # uint16 valueFormat2 ValueRecord definition — for the second glyph of the pair (may be zero). - + 2 - # Offset16 classDef1Offset Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair. - + 2 - + self.classDef1_bytes - # Offset16 classDef2Offset Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair. 
- + 2 - + self.classDef2_bytes - # uint16 class1Count Number of classes in classDef1 table — includes Class 0. - + 2 - # uint16 class2Count Number of classes in classDef2 table — includes Class 0. - + 2 - # Class1Record class1Records[class1Count] Array of Class1 records, ordered by classes in classDef1. - + (valueFormat1_bytes + valueFormat2_bytes) - * len(self.indices) - * self.width - ) - return self._cost - - @property - def coverage_bytes(self): - format1_bytes = ( - # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-1 - # uint16 coverageFormat Format identifier — format = 1 - # uint16 glyphCount Number of glyphs in the glyph array - 4 - # uint16 glyphArray[glyphCount] Array of glyph IDs — in numerical order - + sum(len(all_class1[i]) for i in self.indices) * 2 - ) - ranges = sorted( - chain.from_iterable(all_class1_data[i][0] for i in self.indices) - ) - merged_range_count = 0 - last = None - for (start, end) in ranges: - if last is not None and start != last + 1: - merged_range_count += 1 - last = end - format2_bytes = ( - # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-2 - # uint16 coverageFormat Format identifier — format = 2 - # uint16 rangeCount Number of RangeRecords - 4 - # RangeRecord rangeRecords[rangeCount] Array of glyph ranges — ordered by startGlyphID. - # uint16 startGlyphID First glyph ID in the range - # uint16 endGlyphID Last glyph ID in the range - # uint16 startCoverageIndex Coverage Index of first glyph ID in range - + merged_range_count * 6 - ) - return min(format1_bytes, format2_bytes) - - @property - def classDef1_bytes(self): - # We can skip encoding one of the Class1 definitions, and use - # Class1=0 to represent it instead, because Class1 is gated by the - # Coverage definition. Use Class1=0 for the highest byte savings. 
- # Going through all options takes too long, pick the biggest class - # = what happens in otlLib.builder.ClassDefBuilder.classes() - biggest_index = max(self.indices, key=lambda i: len(all_class1[i])) - return _classDef_bytes( - all_class1_data, [i for i in self.indices if i != biggest_index] - ) - - @property - def classDef2_bytes(self): - # All Class2 need to be encoded because we can't use Class2=0 - return _classDef_bytes(all_class2_data, self.column_indices) - - def merge(self, other: "Cluster") -> "Cluster": - return make_cluster(self.indices_bitmask | other.indices_bitmask) + ctx = ClusteringContext( + lines, + all_class1, + all_class1_data, + all_class2_data, + valueFormat1_bytes, + valueFormat2_bytes, + ) cluster_cache: Dict[int, Cluster] = {} @@ -331,10 +348,17 @@ def cluster_pairs_by_class2_coverage_custom_cost( cluster = cluster_cache.get(indices, None) if cluster is not None: return cluster - cluster = Cluster(indices) + cluster = Cluster(ctx, indices) cluster_cache[indices] = cluster return cluster + def merge(cluster: Cluster, other: Cluster) -> Cluster: + return make_cluster(cluster.indices_bitmask | other.indices_bitmask) + + # Agglomerative clustering by hand, checking the cost gain of the new + # cluster against the previously separate clusters + # Start with 1 cluster per line + # cluster = set of lines = new subtable clusters = [make_cluster(1 << i) for i in range(len(lines))] # Cost of 1 cluster with everything @@ -349,7 +373,7 @@ def cluster_pairs_by_class2_coverage_custom_cost( best_merged = None for i, cluster in enumerate(clusters): for j, other in enumerate(clusters[i + 1 :]): - merged = cluster.merge(other) + merged = merge(cluster, other) cost_change = merged.cost - cluster.cost - other.cost if lowest_cost_change is None or cost_change < lowest_cost_change: lowest_cost_change = cost_change