Refactor the Cluster class to be top-level instead of nested in the function

2021-07-05 15:46:59 +01:00 · 2021-07-05 15:46:59 +01:00 · 527179619b
commit 527179619b
parent 016aa4cccc
1 changed files with 155 additions and 131 deletions
--- a/Lib/fontTools/otlLib/optimize/gpos.py
+++ b/Lib/fontTools/otlLib/optimize/gpos.py
@ -1,5 +1,5 @@
 import logging
-from collections import defaultdict
+from collections import defaultdict, namedtuple
 from functools import reduce
 from itertools import chain
 from math import log2
@ -156,6 +156,143 @@ def _classDef_bytes(
    return min(format1_bytes, format2_bytes)
 # Read-only bundle of the data every Cluster needs, so that Cluster can live
 # at module level and receive this data explicitly.
 ClusteringContext = namedtuple(
    "ClusteringContext",
    (
        # Per-line integers that Cluster ORs together as column bitmasks.
        "lines",
        # Per-line Class1 collections; only their len() is read by Cluster.
        "all_class1",
        # Per-line data whose item [0] is an iterable of (start, end) ranges.
        "all_class1_data",
        # Per-column data handed to _classDef_bytes for the Class2 estimate.
        "all_class2_data",
        # Byte sizes of one ValueRecord for the first / second glyph.
        "valueFormat1_bytes",
        "valueFormat2_bytes",
    ),
 )
 class Cluster:
    """A set of lines, stored as a bitmask, costed as one PairPos format 2 subtable.

    ``ctx`` supplies the shared per-line / per-column data; ``indices_bitmask``
    has bit ``i`` set when line ``i`` belongs to this cluster. The derived
    values ``indices``, ``column_indices`` and ``cost`` are computed lazily
    and cached on the instance.
    """
    # TODO(Python 3.7): Turn this into a dataclass
    # ctx: ClusteringContext
    # indices_bitmask: int
    # Caches
    # TODO(Python 3.8): use functools.cached_property instead of the
    # manually cached properties, and remove the cache fields listed below.
    # _indices: Optional[List[int]] = None
    # _column_indices: Optional[List[int]] = None
    # _cost: Optional[int] = None
    __slots__ = "ctx", "indices_bitmask", "_indices", "_column_indices", "_cost"
    def __init__(self, ctx: "ClusteringContext", indices_bitmask: int):
        """Store the shared context and the line bitmask; caches start empty."""
        self.ctx = ctx
        self.indices_bitmask = indices_bitmask
        self._indices = None
        self._column_indices = None
        self._cost = None
    @property
    def indices(self):
        """Line indices in this cluster, decoded once from ``indices_bitmask``."""
        if self._indices is None:
            self._indices = bit_indices(self.indices_bitmask)
        return self._indices
    @property
    def column_indices(self):
        """Indices of the columns used by at least one line of this cluster."""
        if self._column_indices is None:
            # Indices of columns that have a 1 in at least 1 line
            #   => binary OR all the lines
            bitmask = reduce(int.__or__, (self.ctx.lines[i] for i in self.indices))
            self._column_indices = bit_indices(bitmask)
        return self._column_indices
    @property
    def width(self):
        """Number of Class2 columns to encode, including the unused Class2=0."""
        # Add 1 because Class2=0 cannot be used but needs to be encoded.
        return len(self.column_indices) + 1
    @property
    def cost(self):
        """Estimated byte size of the subtable for this cluster (cached)."""
        if self._cost is None:
            self._cost = (
                # 2 bytes to store the offset to this subtable in the Lookup table above
                2
                # Contents of the subtable
                # From: https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#pair-adjustment-positioning-format-2-class-pair-adjustment
                # uint16	posFormat	Format identifier: format = 2
                + 2
                # Offset16	coverageOffset	Offset to Coverage table, from beginning of PairPos subtable.
                + 2
                + self.coverage_bytes
                # uint16	valueFormat1	ValueRecord definition — for the first glyph of the pair (may be zero).
                + 2
                # uint16	valueFormat2	ValueRecord definition — for the second glyph of the pair (may be zero).
                + 2
                # Offset16	classDef1Offset	Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair.
                + 2
                + self.classDef1_bytes
                # Offset16	classDef2Offset	Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair.
                + 2
                + self.classDef2_bytes
                # uint16	class1Count	Number of classes in classDef1 table — includes Class 0.
                + 2
                # uint16	class2Count	Number of classes in classDef2 table — includes Class 0.
                + 2
                # Class1Record	class1Records[class1Count]	Array of Class1 records, ordered by classes in classDef1.
                + (self.ctx.valueFormat1_bytes + self.ctx.valueFormat2_bytes)
                * len(self.indices)
                * self.width
            )
        return self._cost
    @property
    def coverage_bytes(self):
        """Estimated size of the Coverage table: the smaller of format 1 and 2."""
        format1_bytes = (
            # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-1
            # uint16	coverageFormat	Format identifier — format = 1
            # uint16	glyphCount	Number of glyphs in the glyph array
            4
            # uint16	glyphArray[glyphCount]	Array of glyph IDs — in numerical order
            + sum(len(self.ctx.all_class1[i]) for i in self.indices) * 2
        )
        ranges = sorted(
            chain.from_iterable(self.ctx.all_class1_data[i][0] for i in self.indices)
        )
        merged_range_count = 0
        last = None
        for start, end in ranges:
            # A RangeRecord is needed for the very first range and for every
            # range that does not directly continue the previous one.
            # (Previously the first range was never counted, so format 2 was
            # underestimated by one 6-byte record.)
            if last is None or start != last + 1:
                merged_range_count += 1
            last = end
        format2_bytes = (
            # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-2
            # uint16	coverageFormat	Format identifier — format = 2
            # uint16	rangeCount	Number of RangeRecords
            4
            # RangeRecord	rangeRecords[rangeCount]	Array of glyph ranges — ordered by startGlyphID.
            # uint16	startGlyphID	First glyph ID in the range
            # uint16	endGlyphID	Last glyph ID in the range
            # uint16	startCoverageIndex	Coverage Index of first glyph ID in range
            + merged_range_count * 6
        )
        return min(format1_bytes, format2_bytes)
    @property
    def classDef1_bytes(self):
        """Estimated size of the ClassDef1 table, leaving out the biggest class."""
        # We can skip encoding one of the Class1 definitions, and use
        # Class1=0 to represent it instead, because Class1 is gated by the
        # Coverage definition. Use Class1=0 for the highest byte savings.
        # Going through all options takes too long, pick the biggest class
        # = what happens in otlLib.builder.ClassDefBuilder.classes()
        biggest_index = max(self.indices, key=lambda i: len(self.ctx.all_class1[i]))
        return _classDef_bytes(
            self.ctx.all_class1_data, [i for i in self.indices if i != biggest_index]
        )
    @property
    def classDef2_bytes(self):
        """Estimated size of the ClassDef2 table covering every used column."""
        # All Class2 need to be encoded because we can't use Class2=0
        return _classDef_bytes(self.ctx.all_class2_data, self.column_indices)
 def cluster_pairs_by_class2_coverage_custom_cost(
    font: TTFont,
    pairs: Pairs,
@ -196,134 +333,14 @@ def cluster_pairs_by_class2_coverage_custom_cost(
    valueFormat1_bytes = bit_count(format1) * 2
    valueFormat2_bytes = bit_count(format2) * 2
-    # Agglomerative clustering by hand, checking the cost gain of the new
+    ctx = ClusteringContext(
-    # cluster against the previously separate clusters
+        lines,
-    # Start with 1 cluster per line
+        all_class1,
-    # cluster = set of lines = new subtable
+        all_class1_data,
-    # The class is here so it has a closure over the data above (lines, etc.)
+        all_class2_data,
-    class Cluster:
+        valueFormat1_bytes,
-        # TODO(Python 3.7): Turn this into a dataclass
+        valueFormat2_bytes,
-        # indices: int
+    )
        # Caches
        # TODO(Python 3.8): use functools.cached_property instead of the
        # manually cached properties, and remove the cache fields listed below.
        # _indices: Optional[List[int]] = None
        # _column_indices: Optional[List[int]] = None
        # _cost: Optional[int] = None
        __slots__ = "indices_bitmask", "_indices", "_column_indices", "_cost"
        def __init__(self, indices_bitmask: int):
            self.indices_bitmask = indices_bitmask
            self._indices = None
            self._column_indices = None
            self._cost = None
        @property
        def indices(self):
            if self._indices is None:
                self._indices = bit_indices(self.indices_bitmask)
            return self._indices
        @property
        def column_indices(self):
            if self._column_indices is None:
                # Indices of columns that have a 1 in at least 1 line
                #   => binary OR all the lines
                bitmask = reduce(int.__or__, (lines[i] for i in self.indices))
                self._column_indices = bit_indices(bitmask)
            return self._column_indices
        @property
        def width(self):
            # Add 1 because Class2=0 cannot be used but needs to be encoded.
            return len(self.column_indices) + 1
        @property
        def cost(self):
            if self._cost is None:
                self._cost = (
                    # 2 bytes to store the offset to this subtable in the Lookup table above
                    2
                    # Contents of the subtable
                    # From: https://docs.microsoft.com/en-us/typography/opentype/spec/gpos#pair-adjustment-positioning-format-2-class-pair-adjustment
                    # uint16	posFormat	Format identifier: format = 2
                    + 2
                    # Offset16	coverageOffset	Offset to Coverage table, from beginning of PairPos subtable.
                    + 2
                    + self.coverage_bytes
                    # uint16	valueFormat1	ValueRecord definition — for the first glyph of the pair (may be zero).
                    + 2
                    # uint16	valueFormat2	ValueRecord definition — for the second glyph of the pair (may be zero).
                    + 2
                    # Offset16	classDef1Offset	Offset to ClassDef table, from beginning of PairPos subtable — for the first glyph of the pair.
                    + 2
                    + self.classDef1_bytes
                    # Offset16	classDef2Offset	Offset to ClassDef table, from beginning of PairPos subtable — for the second glyph of the pair.
                    + 2
                    + self.classDef2_bytes
                    # uint16	class1Count	Number of classes in classDef1 table — includes Class 0.
                    + 2
                    # uint16	class2Count	Number of classes in classDef2 table — includes Class 0.
                    + 2
                    # Class1Record	class1Records[class1Count]	Array of Class1 records, ordered by classes in classDef1.
                    + (valueFormat1_bytes + valueFormat2_bytes)
                    * len(self.indices)
                    * self.width
                )
            return self._cost
        @property
        def coverage_bytes(self):
            format1_bytes = (
                # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-1
                # uint16	coverageFormat	Format identifier — format = 1
                # uint16	glyphCount	Number of glyphs in the glyph array
                4
                # uint16	glyphArray[glyphCount]	Array of glyph IDs — in numerical order
                + sum(len(all_class1[i]) for i in self.indices) * 2
            )
            ranges = sorted(
                chain.from_iterable(all_class1_data[i][0] for i in self.indices)
            )
            merged_range_count = 0
            last = None
            for (start, end) in ranges:
                if last is not None and start != last + 1:
                    merged_range_count += 1
                last = end
            format2_bytes = (
                # From https://docs.microsoft.com/en-us/typography/opentype/spec/chapter2#coverage-format-2
                # uint16	coverageFormat	Format identifier — format = 2
                # uint16	rangeCount	Number of RangeRecords
                4
                # RangeRecord	rangeRecords[rangeCount]	Array of glyph ranges — ordered by startGlyphID.
                # uint16	startGlyphID	First glyph ID in the range
                # uint16	endGlyphID	Last glyph ID in the range
                # uint16	startCoverageIndex	Coverage Index of first glyph ID in range
                + merged_range_count * 6
            )
            return min(format1_bytes, format2_bytes)
        @property
        def classDef1_bytes(self):
            # We can skip encoding one of the Class1 definitions, and use
            # Class1=0 to represent it instead, because Class1 is gated by the
            # Coverage definition. Use Class1=0 for the highest byte savings.
            # Going through all options takes too long, pick the biggest class
            # = what happens in otlLib.builder.ClassDefBuilder.classes()
            biggest_index = max(self.indices, key=lambda i: len(all_class1[i]))
            return _classDef_bytes(
                all_class1_data, [i for i in self.indices if i != biggest_index]
            )
        @property
        def classDef2_bytes(self):
            # All Class2 need to be encoded because we can't use Class2=0
            return _classDef_bytes(all_class2_data, self.column_indices)
        def merge(self, other: "Cluster") -> "Cluster":
            return make_cluster(self.indices_bitmask | other.indices_bitmask)
    cluster_cache: Dict[int, Cluster] = {}
@ -331,10 +348,17 @@ def cluster_pairs_by_class2_coverage_custom_cost(
        cluster = cluster_cache.get(indices, None)
        if cluster is not None:
            return cluster
-        cluster = Cluster(indices)
+        cluster = Cluster(ctx, indices)
        cluster_cache[indices] = cluster
        return cluster
    def merge(cluster: Cluster, other: Cluster) -> Cluster:
        return make_cluster(cluster.indices_bitmask | other.indices_bitmask)
    # Agglomerative clustering by hand, checking the cost gain of the new
    # cluster against the previously separate clusters
    # Start with 1 cluster per line
    # cluster = set of lines = new subtable
    clusters = [make_cluster(1 << i) for i in range(len(lines))]
    # Cost of 1 cluster with everything
@ -349,7 +373,7 @@ def cluster_pairs_by_class2_coverage_custom_cost(
        best_merged = None
        for i, cluster in enumerate(clusters):
            for j, other in enumerate(clusters[i + 1 :]):
-                merged = cluster.merge(other)
+                merged = merge(cluster, other)
                cost_change = merged.cost - cluster.cost - other.cost
                if lowest_cost_change is None or cost_change < lowest_cost_change:
                    lowest_cost_change = cost_change