[OS/2] Add recalcCodePageRanges

Ported from the ufo2ft code (which is itself a port of the FontForge code), with some additional functions for parity with recalcUnicodeRanges.
2023-11-11 01:22:18 +02:00 · 2023-11-11 01:22:18 +02:00 · be8ec35934
commit be8ec35934
parent 6fa1a76e06
2 changed files with 184 additions and 9 deletions
--- a/Lib/fontTools/ttLib/tables/O_S_2f_2.py
+++ b/Lib/fontTools/ttLib/tables/O_S_2f_2.py
@ -340,6 +340,45 @@ class table_O_S_2f_2(DefaultTable.DefaultTable):
        self.setUnicodeRanges(bits)
        return bits

+    def getCodePageRanges(self):
+        """Collect the enabled 'ulCodePageRange*' bits into a set.
+
+        Bits 0-31 are read from 'ulCodePageRange1'; bits 32-63 are read from
+        'ulCodePageRange2' (its bit N is reported as N + 32).
+        """
+        low, high = self.ulCodePageRange1, self.ulCodePageRange2
+        enabled = {pos for pos in range(32) if low & (1 << pos)}
+        enabled.update(pos + 32 for pos in range(32) if high & (1 << pos))
+        return enabled
+
+    def setCodePageRanges(self, bits):
+        """Set the 'ulCodePageRange*' fields to the specified 'bits'.
+
+        'bits' is an iterable of integers in the range 0-63: values 0-31 are
+        stored in 'ulCodePageRange1' and values 32-63 in 'ulCodePageRange2'.
+        Both fields are overwritten, so any bit not listed ends up cleared.
+
+        Raises ValueError if a bit falls outside 0-63; in that case neither
+        field is modified.
+        """
+        ul1, ul2 = 0, 0
+        for bit in bits:
+            if 0 <= bit < 32:
+                ul1 |= 1 << bit
+            elif 32 <= bit < 64:
+                ul2 |= 1 << (bit - 32)
+            else:
+                # '!r' applies repr(); ':r' is not a valid format spec for an
+                # int and would raise its own error instead of this message.
+                raise ValueError(f"expected 0 <= int <= 63, found: {bit!r}")
+        self.ulCodePageRange1, self.ulCodePageRange2 = ul1, ul2
+
+    def recalcCodePageRanges(self, ttFont, pruneOnly=False):
+        """Recompute the 'ulCodePageRange*' bits from ttFont's 'cmap' table.
+
+        The codepoints of every Unicode cmap subtable are gathered and passed
+        to calcCodePageRanges(). When 'pruneOnly' is true, the computed bits
+        are intersected with the bits already enabled, so no bit is added
+        that was not enabled before. If the result is empty, bit 0 is enabled
+        as a fallback. The resulting bits are stored on this table and also
+        returned.
+        """
+        codepoints = {
+            codepoint
+            for subtable in ttFont["cmap"].tables
+            if subtable.isUnicode()
+            for codepoint in subtable.cmap.keys()
+        }
+        computed = calcCodePageRanges(codepoints)
+        enabled = computed & self.getCodePageRanges() if pruneOnly else computed
+        # when no codepage ranges can be enabled, fall back to enabling bit 0
+        # (Latin 1) so that the font works in MS Word:
+        # https://github.com/googlei18n/fontmake/issues/468
+        enabled = enabled or {0}
+        self.setCodePageRanges(enabled)
+        return enabled
+
    def recalcAvgCharWidth(self, ttFont):
        """Recalculate xAvgCharWidth using metrics from ttFont's 'hmtx' table.

@ -611,6 +650,92 @@ def intersectUnicodeRanges(unicodes, inverse=False):
    return set(range(len(OS2_UNICODE_RANGES))) - bits if inverse else bits


+def calcCodePageRanges(unicodes):
+    """Given a set of Unicode codepoints (integers), calculate the
+    corresponding OS/2 CodePage range bits.
+
+    Each codepage bit is enabled when a marker character for that codepage
+    is present in 'unicodes'; many bits additionally require the ASCII range
+    and/or specific box-drawing or symbol characters to be present (see
+    'hasAscii' and 'hasLineart' below).
+
+    Returns a new set of integer bit numbers; the set is empty when no
+    codepage is detected.
+
+    This is a direct translation of the FontForge implementation:
+    https://github.com/fontforge/fontforge/blob/7b2c074/fontforge/tottf.c#L3158
+    """
+    bits = set()
+    # range() excludes its end value, so this requires 0x20 through 0x7D;
+    # U+007E ('~') is not required.
+    # NOTE(review): confirm whether leaving out 0x7E is intentional.
+    hasAscii = set(range(0x20, 0x7E)).issubset(unicodes)
+    # U+2524, a box-drawing character; it is a precondition for most of the
+    # bits numbered 48 and above below.
+    hasLineart = ord("┤") in unicodes
+
+    # Each branch keys on one marker character, and because this is a single
+    # if/elif chain at most one branch runs per codepoint.
+    for uni in unicodes:
+        if uni == ord("Þ") and hasAscii:
+            bits.add(0)  # Latin 1
+        elif uni == ord("Ľ") and hasAscii:
+            bits.add(1)  # Latin 2: Eastern Europe
+            if hasLineart:
+                bits.add(58)  # Latin 2
+        elif uni == ord("Б"):
+            bits.add(2)  # Cyrillic
+            if ord("Ѕ") in unicodes and hasLineart:
+                bits.add(57)  # IBM Cyrillic
+            if ord("╜") in unicodes and hasLineart:
+                bits.add(49)  # MS-DOS Russian
+        elif uni == ord("Ά"):
+            bits.add(3)  # Greek
+            if hasLineart and ord("½") in unicodes:
+                bits.add(48)  # IBM Greek
+            if hasLineart and ord("√") in unicodes:
+                bits.add(60)  # Greek, former 437 G
+        elif uni == ord("İ") and hasAscii:
+            bits.add(4)  # Turkish
+            if hasLineart:
+                bits.add(56)  # IBM turkish
+        elif uni == ord("א"):
+            bits.add(5)  # Hebrew
+            if hasLineart and ord("√") in unicodes:
+                bits.add(53)  # Hebrew
+        elif uni == ord("ر"):
+            bits.add(6)  # Arabic
+            if ord("√") in unicodes:
+                bits.add(51)  # Arabic
+            if hasLineart:
+                bits.add(61)  # Arabic; ASMO 708
+        elif uni == ord("ŗ") and hasAscii:
+            bits.add(7)  # Windows Baltic
+            if hasLineart:
+                bits.add(59)  # MS-DOS Baltic
+        elif uni == ord("₫") and hasAscii:
+            bits.add(8)  # Vietnamese
+        elif uni == ord("ๅ"):
+            bits.add(16)  # Thai
+        elif uni == ord("エ"):
+            bits.add(17)  # JIS/Japan
+        elif uni == ord("ㄅ"):
+            bits.add(18)  # Chinese: Simplified
+        elif uni == ord("ㄱ"):
+            bits.add(19)  # Korean wansung
+        elif uni == ord("央"):
+            bits.add(20)  # Chinese: Traditional
+        elif uni == ord("곴"):
+            bits.add(21)  # Korean Johab
+        elif uni == ord("♥") and hasAscii:
+            bits.add(30)  # OEM Character Set
+        # TODO: Symbol bit has a special meaning (check the spec), we need
+        # to confirm if this is wanted by default. Disabled branch, written
+        # in terms of this function's 'uni' and 'bits' variables:
+        # elif 0xF000 <= uni <= 0xF0FF:
+        #    bits.add(31)  # Symbol Character Set
+        elif uni == ord("þ") and hasAscii and hasLineart:
+            bits.add(54)  # MS-DOS Icelandic
+        elif uni == ord("╚") and hasAscii:
+            bits.add(62)  # WE/Latin 1
+            bits.add(63)  # US
+        # The three MS-DOS codepages below share the same preconditions:
+        # ASCII, the box-drawing marker and the square root sign.
+        elif hasAscii and hasLineart and ord("√") in unicodes:
+            if uni == ord("Å"):
+                bits.add(50)  # MS-DOS Nordic
+            elif uni == ord("é"):
+                bits.add(52)  # MS-DOS Canadian French
+            elif uni == ord("õ"):
+                bits.add(55)  # MS-DOS Portuguese
+
+    # Depends on two characters rather than on the codepoint being visited,
+    # so it is checked once outside the loop.
+    if hasAscii and ord("‰") in unicodes and ord("∑") in unicodes:
+        bits.add(29)  # Macintosh Character Set (US Roman)
+
+    return bits
+
+
 if __name__ == "__main__":
    import doctest, sys

--- a/Tests/ttLib/tables/O_S_2f_2_test.py
+++ b/Tests/ttLib/tables/O_S_2f_2_test.py
@ -4,6 +4,18 @@ import unittest


 class OS2TableTest(unittest.TestCase):
+    @staticmethod
+    def makeOS2_cmap(mapping):
+        """Build a TTFont holding a new 'OS/2' table and a new 'cmap' table.
+
+        The 'cmap' table gets a single subtable created with newSubtable(4)
+        (platformID 3, platEncID 1, language 0) whose cmap is 'mapping'.
+
+        Returns the (font, OS/2 table, cmap table) triple.
+        """
+        font = TTFont()
+        os2 = newTable("OS/2")
+        font["OS/2"] = os2
+        cmap = newTable("cmap")
+        font["cmap"] = cmap
+        subtable = getTableModule("cmap").CmapSubtable.newSubtable(4)
+        subtable.platformID = 3
+        subtable.platEncID = 1
+        subtable.language = 0
+        subtable.cmap = mapping
+        cmap.tables = [subtable]
+        return font, os2, cmap
+
    def test_getUnicodeRanges(self):
        table = table_O_S_2f_2()
        table.ulUnicodeRange1 = 0xFFFFFFFF
@ -27,14 +39,9 @@ class OS2TableTest(unittest.TestCase):
            table.setUnicodeRanges([-1, 127, 255])

    def test_recalcUnicodeRanges(self):
-        font = TTFont()
-        font["OS/2"] = os2 = newTable("OS/2")
-        font["cmap"] = cmap = newTable("cmap")
-        st = getTableModule("cmap").CmapSubtable.newSubtable(4)
-        st.platformID, st.platEncID, st.language = 3, 1, 0
-        st.cmap = {0x0041: "A", 0x03B1: "alpha", 0x0410: "Acyr"}
-        cmap.tables = []
-        cmap.tables.append(st)
+        font, os2, cmap = self.makeOS2_cmap(
+            {0x0041: "A", 0x03B1: "alpha", 0x0410: "Acyr"}
+        )
        os2.setUnicodeRanges({0, 1, 9})
        # 'pruneOnly' will clear any bits for which there's no intersection:
        # bit 1 ('Latin 1 Supplement'), in this case. However, it won't set
@ -43,7 +50,7 @@ class OS2TableTest(unittest.TestCase):
        # try again with pruneOnly=False: bit 7 is now set.
        self.assertEqual(os2.recalcUnicodeRanges(font), {0, 7, 9})
        # add a non-BMP char from 'Mahjong Tiles' block (bit 122)
-        st.cmap[0x1F000] = "eastwindtile"
+        cmap.tables[0].cmap[0x1F000] = "eastwindtile"
        # the bit 122 and the special bit 57 ('Non Plane 0') are also enabled
        self.assertEqual(os2.recalcUnicodeRanges(font), {0, 7, 9, 57, 122})

@ -55,6 +62,49 @@ class OS2TableTest(unittest.TestCase):
            (set(range(123)) - {9, 57, 122}),
        )

+    def test_getCodePageRanges(self):
+        """With both fields all-ones, every bit 0-63 must be reported."""
+        table = table_O_S_2f_2()
+        table.ulCodePageRange1 = 0xFFFFFFFF
+        table.ulCodePageRange2 = 0xFFFFFFFF
+        bits = table.getCodePageRanges()
+        # range(64), not range(63): the highest bit (63) must be checked too.
+        for i in range(64):
+            self.assertIn(i, bits)
+
+    def test_setCodePageRanges(self):
+        """All 64 bits round-trip through set/get; other values are rejected."""
+        table = table_O_S_2f_2()
+        table.ulCodePageRange1 = table.ulCodePageRange2 = 0
+        allBits = set(range(64))
+        table.setCodePageRanges(allBits)
+        self.assertEqual(table.getCodePageRanges(), allBits)
+        # each of these lies outside 0-63 and must be refused
+        for invalid in (-1, 64, 255):
+            with self.assertRaises(ValueError):
+                table.setCodePageRanges([invalid])
+
+    def test_recalcCodePageRanges(self):
+        """Exercise recalcCodePageRanges() with and without 'pruneOnly'."""
+        font, os2, cmap = self.makeOS2_cmap(
+            {ord("A"): "A", ord("Ά"): "Alphatonos", ord("Б"): "Be"}
+        )
+        os2.setCodePageRanges({0, 2, 9})
+        subtable = cmap.tables[0]
+
+        # pruneOnly=True drops every CodePage that has no supporting
+        # characters in the cmap and does not enable new ones.
+        pruned = os2.recalcCodePageRanges(font, pruneOnly=True)
+        self.assertEqual(pruned, {2})
+
+        # The default (pruneOnly=False) also enables CodePages that were not
+        # set initially.
+        recalculated = os2.recalcCodePageRanges(font)
+        self.assertEqual(recalculated, {2, 3})
+
+        # A Korean character enables CodePage 21 (Korean Johab).
+        subtable.cmap[ord("곴")] = "goss"
+        withKorean = os2.recalcCodePageRanges(font)
+        self.assertEqual(withKorean, {2, 3, 21})
+
+        # An empty cmap still yields CodePage 0 (Latin 1) as the fallback.
+        subtable.cmap = {}
+        fallback = os2.recalcCodePageRanges(font)
+        self.assertEqual(fallback, {0})
+

 if __name__ == "__main__":
    import sys