Refactor the Cluster class to be top-level instead of nested in the function

This commit is contained in:
Jany Belluz 2021-07-05 15:46:59 +01:00
parent 016aa4cccc
commit 527179619b

View File

@ -1,5 +1,5 @@
import logging
from collections import defaultdict
from collections import defaultdict, namedtuple
from functools import reduce
from itertools import chain
from math import log2
@ -156,6 +156,143 @@ def _classDef_bytes(
return min(format1_bytes, format2_bytes)
ClusteringContext = namedtuple(
class Cluster:
# TODO(Python 3.7): Turn this into a dataclass
# ctx: ClusteringContext
# indices_bitmask: int
# Caches
# TODO(Python 3.8): use functools.cached_property instead of the
# manually cached properties, and remove the cache fields listed below.
# _indices: Optional[List[int]] = None
# _column_indices: Optional[List[int]] = None
# _cost: Optional[int] = None
__slots__ = "ctx", "indices_bitmask", "_indices", "_column_indices", "_cost"
def __init__(self, ctx: ClusteringContext, indices_bitmask: int):
self.ctx = ctx
self.indices_bitmask = indices_bitmask
self._indices = None
self._column_indices = None
self._cost = None
def indices(self):
if self._indices is None:
self._indices = bit_indices(self.indices_bitmask)
return self._indices
def column_indices(self):
if self._column_indices is None:
# Indices of columns that have a 1 in at least 1 line
# => binary OR all the lines
bitmask = reduce(int.__or__, (self.ctx.lines[i] for i in self.indices))
self._column_indices = bit_indices(bitmask)
return self._column_indices
def width(self):
# Add 1 because Class2=0 cannot be used but needs to be encoded.
return len(self.column_indices) + 1
def cost(self):
if self._cost is None:
self._cost = (
# 2 bytes to store the offset to this subtable in the Lookup table above
# Contents of the subtable
# From:
# uint16 posFormat Format identifier: format = 2
+ 2
# Offset16 coverageOffset Offset to Coverage table, from beginning of PairPos subtable.
+ 2
+ self.coverage_bytes
# uint16 valueFormat1 ValueRecord definition — for the first glyph of the pair (may be zero).
+ 2
# uint16 valueFormat2 ValueRecord definition — for the second glyph of the pair (may be zero).
+ 2
# Offset16 classDef1Offset Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair.
+ 2
+ self.classDef1_bytes
# Offset16 classDef2Offset Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair.
+ 2
+ self.classDef2_bytes
# uint16 class1Count Number of classes in classDef1 table — includes Class 0.
+ 2
# uint16 class2Count Number of classes in classDef2 table — includes Class 0.
+ 2
# Class1Record class1Records[class1Count] Array of Class1 records, ordered by classes in classDef1.
+ (self.ctx.valueFormat1_bytes + self.ctx.valueFormat2_bytes)
* len(self.indices)
* self.width
return self._cost
def coverage_bytes(self):
format1_bytes = (
# From
# uint16 coverageFormat Format identifier — format = 1
# uint16 glyphCount Number of glyphs in the glyph array
# uint16 glyphArray[glyphCount] Array of glyph IDs — in numerical order
+ sum(len(self.ctx.all_class1[i]) for i in self.indices) * 2
ranges = sorted(
chain.from_iterable(self.ctx.all_class1_data[i][0] for i in self.indices)
merged_range_count = 0
last = None
for (start, end) in ranges:
if last is not None and start != last + 1:
merged_range_count += 1
last = end
format2_bytes = (
# From
# uint16 coverageFormat Format identifier — format = 2
# uint16 rangeCount Number of RangeRecords
# RangeRecord rangeRecords[rangeCount] Array of glyph ranges — ordered by startGlyphID.
# uint16 startGlyphID First glyph ID in the range
# uint16 endGlyphID Last glyph ID in the range
# uint16 startCoverageIndex Coverage Index of first glyph ID in range
+ merged_range_count * 6
return min(format1_bytes, format2_bytes)
def classDef1_bytes(self):
# We can skip encoding one of the Class1 definitions, and use
# Class1=0 to represent it instead, because Class1 is gated by the
# Coverage definition. Use Class1=0 for the highest byte savings.
# Going through all options takes too long, pick the biggest class
# = what happens in otlLib.builder.ClassDefBuilder.classes()
biggest_index = max(self.indices, key=lambda i: len(self.ctx.all_class1[i]))
return _classDef_bytes(
self.ctx.all_class1_data, [i for i in self.indices if i != biggest_index]
def classDef2_bytes(self):
# All Class2 need to be encoded because we can't use Class2=0
return _classDef_bytes(self.ctx.all_class2_data, self.column_indices)
def cluster_pairs_by_class2_coverage_custom_cost(
font: TTFont,
pairs: Pairs,
@ -196,134 +333,14 @@ def cluster_pairs_by_class2_coverage_custom_cost(
valueFormat1_bytes = bit_count(format1) * 2
valueFormat2_bytes = bit_count(format2) * 2
# Agglomerative clustering by hand, checking the cost gain of the new
# cluster against the previously separate clusters
# Start with 1 cluster per line
# cluster = set of lines = new subtable
# The class is here so it has a closure over the data above (lines, etc.)
class Cluster:
# TODO(Python 3.7): Turn this into a dataclass
# indices: int
# Caches
# TODO(Python 3.8): use functools.cached_property instead of the
# manually cached properties, and remove the cache fields listed below.
# _indices: Optional[List[int]] = None
# _column_indices: Optional[List[int]] = None
# _cost: Optional[int] = None
__slots__ = "indices_bitmask", "_indices", "_column_indices", "_cost"
def __init__(self, indices_bitmask: int):
self.indices_bitmask = indices_bitmask
self._indices = None
self._column_indices = None
self._cost = None
def indices(self):
if self._indices is None:
self._indices = bit_indices(self.indices_bitmask)
return self._indices
def column_indices(self):
if self._column_indices is None:
# Indices of columns that have a 1 in at least 1 line
# => binary OR all the lines
bitmask = reduce(int.__or__, (lines[i] for i in self.indices))
self._column_indices = bit_indices(bitmask)
return self._column_indices
def width(self):
# Add 1 because Class2=0 cannot be used but needs to be encoded.
return len(self.column_indices) + 1
def cost(self):
if self._cost is None:
self._cost = (
# 2 bytes to store the offset to this subtable in the Lookup table above
# Contents of the subtable
# From:
# uint16 posFormat Format identifier: format = 2
+ 2
# Offset16 coverageOffset Offset to Coverage table, from beginning of PairPos subtable.
+ 2
+ self.coverage_bytes
# uint16 valueFormat1 ValueRecord definition — for the first glyph of the pair (may be zero).
+ 2
# uint16 valueFormat2 ValueRecord definition — for the second glyph of the pair (may be zero).
+ 2
# Offset16 classDef1Offset Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair.
+ 2
+ self.classDef1_bytes
# Offset16 classDef2Offset Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair.
+ 2
+ self.classDef2_bytes
# uint16 class1Count Number of classes in classDef1 table — includes Class 0.
+ 2
# uint16 class2Count Number of classes in classDef2 table — includes Class 0.
+ 2
# Class1Record class1Records[class1Count] Array of Class1 records, ordered by classes in classDef1.
+ (valueFormat1_bytes + valueFormat2_bytes)
* len(self.indices)
* self.width
return self._cost
def coverage_bytes(self):
format1_bytes = (
# From
# uint16 coverageFormat Format identifier — format = 1
# uint16 glyphCount Number of glyphs in the glyph array
# uint16 glyphArray[glyphCount] Array of glyph IDs — in numerical order
+ sum(len(all_class1[i]) for i in self.indices) * 2
ranges = sorted(
chain.from_iterable(all_class1_data[i][0] for i in self.indices)
merged_range_count = 0
last = None
for (start, end) in ranges:
if last is not None and start != last + 1:
merged_range_count += 1
last = end
format2_bytes = (
# From
# uint16 coverageFormat Format identifier — format = 2
# uint16 rangeCount Number of RangeRecords
# RangeRecord rangeRecords[rangeCount] Array of glyph ranges — ordered by startGlyphID.
# uint16 startGlyphID First glyph ID in the range
# uint16 endGlyphID Last glyph ID in the range
# uint16 startCoverageIndex Coverage Index of first glyph ID in range
+ merged_range_count * 6
return min(format1_bytes, format2_bytes)
def classDef1_bytes(self):
# We can skip encoding one of the Class1 definitions, and use
# Class1=0 to represent it instead, because Class1 is gated by the
# Coverage definition. Use Class1=0 for the highest byte savings.
# Going through all options takes too long, pick the biggest class
# = what happens in otlLib.builder.ClassDefBuilder.classes()
biggest_index = max(self.indices, key=lambda i: len(all_class1[i]))
return _classDef_bytes(
all_class1_data, [i for i in self.indices if i != biggest_index]
def classDef2_bytes(self):
# All Class2 need to be encoded because we can't use Class2=0
return _classDef_bytes(all_class2_data, self.column_indices)
def merge(self, other: "Cluster") -> "Cluster":
return make_cluster(self.indices_bitmask | other.indices_bitmask)
ctx = ClusteringContext(
cluster_cache: Dict[int, Cluster] = {}
@ -331,10 +348,17 @@ def cluster_pairs_by_class2_coverage_custom_cost(
cluster = cluster_cache.get(indices, None)
if cluster is not None:
return cluster
cluster = Cluster(indices)
cluster = Cluster(ctx, indices)
cluster_cache[indices] = cluster
return cluster
def merge(cluster: Cluster, other: Cluster) -> Cluster:
return make_cluster(cluster.indices_bitmask | other.indices_bitmask)
# Agglomerative clustering by hand, checking the cost gain of the new
# cluster against the previously separate clusters
# Start with 1 cluster per line
# cluster = set of lines = new subtable
clusters = [make_cluster(1 << i) for i in range(len(lines))]
# Cost of 1 cluster with everything
@ -349,7 +373,7 @@ def cluster_pairs_by_class2_coverage_custom_cost(
best_merged = None
for i, cluster in enumerate(clusters):
for j, other in enumerate(clusters[i + 1 :]):
merged = cluster.merge(other)
merged = merge(cluster, other)
cost_change = merged.cost - cluster.cost - other.cost
if lowest_cost_change is None or cost_change < lowest_cost_change:
lowest_cost_change = cost_change