[tfmLib] New library for reading TFM files

New library for reading TeX Font Metrics files. Does not support writing them back. Fixes https://github.com/fonttools/fonttools/issues/2352
2021-07-05 02:13:09 +02:00 · 2021-07-05 02:13:09 +02:00 · a7ac2de5cd
commit a7ac2de5cd
parent 383e70fc39
6 changed files with 550 additions and 0 deletions
--- a/Lib/fontTools/tfmLib.py
+++ b/Lib/fontTools/tfmLib.py
@ -0,0 +1,460 @@
+"""Module for reading TFM (TeX Font Metrics) files.
+
+The TFM format is described in the TFtoPL WEB source code, whose typeset form
+can be found on `CTAN <http://mirrors.ctan.org/info/knuth-pdf/texware/tftopl.pdf>`_.
+
+	>>> from fontTools.tfmLib import TFM
+	>>> tfm = TFM("Tests/tfmLib/data/cmr10.tfm")
+	>>>
+	>>> # Accessing an attribute gets you metadata.
+	>>> tfm.checksum
+	1274110073
+	>>> tfm.designsize
+	10.0
+	>>> tfm.codingscheme
+	'TeX text'
+	>>> tfm.family
+	'CMR'
+	>>> tfm.seven_bit_safe_flag
+	False
+	>>> tfm.face
+	234
+	>>> tfm.extraheader
+	{}
+	>>> tfm.fontdimens
+	{'SLANT': 0.0, 'SPACE': 0.33333396911621094, 'STRETCH': 0.16666698455810547, 'SHRINK': 0.11111164093017578, 'XHEIGHT': 0.4305553436279297, 'QUAD': 1.0000028610229492, 'EXTRASPACE': 0.11111164093017578}
+	>>> # Accessing a character gets you its metrics.
+	>>> # “width” is always available, other metrics are available only when
+	>>> # applicable. All values are relative to “designsize”.
+	>>> tfm.chars[ord("g")]
+	{'width': 0.5000019073486328, 'height': 0.4305553436279297, 'depth': 0.1944446563720703, 'italic': 0.013888359069824219}
+	>>> # Kerning and ligatures can be accessed as well.
+	>>> tfm.kerning[ord("c")]
+	{104: -0.02777862548828125, 107: -0.02777862548828125}
+	>>> tfm.ligatures[ord("f")]
+	{105: ('LIG', 12), 102: ('LIG', 11), 108: ('LIG', 13)}
+"""
+
+from types import SimpleNamespace
+
+from fontTools.misc.sstruct import calcsize, unpack, unpack2
+
+# sstruct layout of the size preamble that opens every TFM file: twelve
+# big-endian 16-bit fields. All counts are in 32-bit words, not bytes.
+SIZES_FORMAT = """
+    >
+    lf: h    # length of the entire file, in words
+    lh: h    # length of the header data, in words
+    bc: h    # smallest character code in the font
+    ec: h    # largest character code in the font
+    nw: h    # number of words in the width table
+    nh: h    # number of words in the height table
+    nd: h    # number of words in the depth table
+    ni: h    # number of words in the italic correction table
+    nl: h    # number of words in the ligature/kern table
+    nk: h    # number of words in the kern table
+    ne: h    # number of words in the extensible character table
+    np: h    # number of font parameter words
+"""
+
+# Size of the preamble in bytes, as computed by sstruct.
+SIZES_SIZE = calcsize(SIZES_FORMAT)
+
+# sstruct fixed-point spec (12 integer bits, 20 fraction bits) used for every
+# dimension value read by this module: designsize, metrics, kerns, parameters.
+FIXED_FORMAT = "12.20F"
+
+# The header formats are cumulative: each one embeds the previous format and
+# appends more fields. The reader picks the longest one that fits in the
+# header length announced by the file.
+HEADER_FORMAT1 = f"""
+    >
+    checksum:            L
+    designsize:          {FIXED_FORMAT}
+"""
+
+HEADER_FORMAT2 = f"""
+    {HEADER_FORMAT1}
+    codingscheme:        40p
+"""
+
+HEADER_FORMAT3 = f"""
+    {HEADER_FORMAT2}
+    family:              20p
+"""
+
+HEADER_FORMAT4 = f"""
+    {HEADER_FORMAT3}
+    seven_bit_safe_flag: ?
+    ignored:             x
+    ignored:             x
+    face:                B
+"""
+
+# Byte sizes of the four header variants above.
+HEADER_SIZE1 = calcsize(HEADER_FORMAT1)
+HEADER_SIZE2 = calcsize(HEADER_FORMAT2)
+HEADER_SIZE3 = calcsize(HEADER_FORMAT3)
+HEADER_SIZE4 = calcsize(HEADER_FORMAT4)
+
+# One ligature/kern instruction: four unsigned bytes.
+LIG_KERN_COMMAND = """
+    >
+    skip_byte: B
+    next_char: B
+    op_byte: B
+    remainder: B
+"""
+
+# Names given to the first seven font parameters, in file order.
+BASE_PARAMS = [
+    "SLANT",
+    "SPACE",
+    "STRETCH",
+    "SHRINK",
+    "XHEIGHT",
+    "QUAD",
+    "EXTRASPACE",
+]
+
+# Names given to parameters 8..22 when the coding scheme starts with
+# "TEX MATH SY" (compared case-insensitively by the reader).
+MATHSY_PARAMS = [
+    "NUM1",
+    "NUM2",
+    "NUM3",
+    "DENOM1",
+    "DENOM2",
+    "SUP1",
+    "SUP2",
+    "SUP3",
+    "SUB1",
+    "SUB2",
+    "SUPDROP",
+    "SUBDROP",
+    "DELIM1",
+    "DELIM2",
+    "AXISHEIGHT",
+]
+
+# Names given to parameters 8..13 when the coding scheme starts with
+# "TEX MATH EX" (compared case-insensitively by the reader).
+MATHEX_PARAMS = [
+    "DEFAULTRULETHICKNESS",
+    "BIGOPSPACING1",
+    "BIGOPSPACING2",
+    "BIGOPSPACING3",
+    "BIGOPSPACING4",
+    "BIGOPSPACING5",
+]
+
+# Possible values of TFM.fonttype.
+VANILLA = 0
+MATHSY = 1
+MATHEX = 2
+
+# NOTE(review): these three are not referenced by the TFM reader in this
+# module; presumably lig/kern reachability labels carried over from TFtoPL --
+# confirm before relying on them.
+UNREACHABLE = 0
+PASSTHROUGH = 1
+ACCESSABLE = 2
+
+# Values of the 2-bit tag stored in each char_info word; the tag decides how
+# the word's remainder byte is interpreted.
+NO_TAG = 0
+LIG_TAG = 1
+LIST_TAG = 2
+EXT_TAG = 3
+
+# Thresholds applied to a lig/kern instruction's skip_byte and op_byte.
+STOP_FLAG = 128
+KERN_FLAG = 128
+
+
+class TFMException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class TFM:
+    def __init__(self, file):
+        self._read(file)
+
+    def __repr__(self):
+        return (
+            f"<TFM"
+            f" for {self.family}"
+            f" in {self.codingscheme}"
+            f" at {self.designsize:g}pt>"
+        )
+
+    def _read(self, file):
+        if hasattr(file, "read"):
+            data = file.read()
+        else:
+            with open(file, "rb") as fp:
+                data = fp.read()
+
+        self._data = data
+
+        if len(data) < SIZES_SIZE:
+            raise TFMException("Too short input file")
+
+        sizes = SimpleNamespace()
+        unpack2(SIZES_FORMAT, data, sizes)
+
+        # Do some file structure sanity checks.
+        # TeX and TFtoPL do additional functional checks and might even correct
+        # “errors” in the input file, but we instead try to output the file as
+        # it is as long as it is parsable, even if the data make no sense.
+
+        if sizes.lf < 0:
+            raise TFMException("The file claims to have negative or zero length!")
+
+        if len(data) < sizes.lf * 4:
+            raise TFMException("The file has fewer bytes than it claims!")
+
+        for name, length in vars(sizes).items():
+            if length < 0:
+                raise TFMException("The subfile size: '{name}' is negative!")
+
+        if sizes.lh < 2:
+            raise TFMException(f"The header length is only {sizes.lh}!")
+
+        if sizes.bc > sizes.ec + 1 or sizes.ec > 255:
+            raise TFMException(
+                f"The character code range {sizes.bc}..{sizes.ec} is illegal!"
+            )
+
+        if sizes.nw == 0 or sizes.nh == 0 or sizes.nd == 0 or sizes.ni == 0:
+            raise TFMException("Incomplete subfiles for character dimensions!")
+
+        if sizes.ne > 256:
+            raise TFMException(f"There are {ne} extensible recipes!")
+
+        if sizes.lf != (
+            6
+            + sizes.lh
+            + (sizes.ec - sizes.bc + 1)
+            + sizes.nw
+            + sizes.nh
+            + sizes.nd
+            + sizes.ni
+            + sizes.nl
+            + sizes.nk
+            + sizes.ne
+            + sizes.np
+        ):
+            raise TFMException("Subfile sizes don’t add up to the stated total")
+
+        # Subfile offsets, used in the helper function below. These all are
+        # 32-bit word offsets not 8-bit byte offsets.
+        char_base = 6 + sizes.lh - sizes.bc
+        width_base = char_base + sizes.ec + 1
+        height_base = width_base + sizes.nw
+        depth_base = height_base + sizes.nh
+        italic_base = depth_base + sizes.nd
+        lig_kern_base = italic_base + sizes.ni
+        kern_base = lig_kern_base + sizes.nl
+        exten_base = kern_base + sizes.nk
+        param_base = exten_base + sizes.ne
+
+        # Helper functions for accessing individual data. If this looks
+        # nonidiomatic Python, I blame the effect of reading the literate WEB
+        # documentation of TFtoPL.
+        def char_info(c):
+            return 4 * (char_base + c)
+
+        def width_index(c):
+            return data[char_info(c)]
+
+        def noneexistent(c):
+            return c < sizes.bc or c > sizes.ec or width_index(c) == 0
+
+        def height_index(c):
+            return data[char_info(c) + 1] // 16
+
+        def depth_index(c):
+            return data[char_info(c) + 1] % 16
+
+        def italic_index(c):
+            return data[char_info(c) + 2] // 4
+
+        def tag(c):
+            return data[char_info(c) + 2] % 4
+
+        def remainder(c):
+            return data[char_info(c) + 3]
+
+        def width(c):
+            r = 4 * (width_base + width_index(c))
+            return read_fixed(r, "v")["v"]
+
+        def height(c):
+            r = 4 * (height_base + height_index(c))
+            return read_fixed(r, "v")["v"]
+
+        def depth(c):
+            r = 4 * (depth_base + depth_index(c))
+            return read_fixed(r, "v")["v"]
+
+        def italic(c):
+            r = 4 * (italic_base + italic_index(c))
+            return read_fixed(r, "v")["v"]
+
+        def exten(c):
+            return 4 * (exten_base + remainder(c))
+
+        def lig_step(i):
+            return 4 * (lig_kern_base + i)
+
+        def lig_kern_command(i):
+            command = SimpleNamespace()
+            unpack2(LIG_KERN_COMMAND, data[i:], command)
+            return command
+
+        def kern(i):
+            r = 4 * (kern_base + i)
+            return read_fixed(r, "v")["v"]
+
+        def param(i):
+            return 4 * (param_base + i)
+
+        def read_fixed(index, key, obj=None):
+            ret = unpack2(f">;{key}:{FIXED_FORMAT}", data[index:], obj)
+            return ret[0]
+
+        # Set all attributes to empty values regardless of the header size.
+        unpack(HEADER_FORMAT4, [0] * HEADER_SIZE4, self)
+
+        offset = 24
+        length = sizes.lh * 4
+        self.extraheader = {}
+        if length >= HEADER_SIZE4:
+            rest = unpack2(HEADER_FORMAT4, data[offset:], self)[1]
+            if self.face < 18:
+                s = self.face % 2
+                b = self.face // 2
+                self.face = "MBL"[b % 3] + "RI"[s] + "RCE"[b // 3]
+            for i in range(sizes.lh - HEADER_SIZE4 // 4):
+                rest = unpack2(f">;HEADER{i + 18}:l", rest, self.extraheader)[1]
+        elif length >= HEADER_SIZE3:
+            unpack2(HEADER_FORMAT3, data[offset:], self)
+        elif length >= HEADER_SIZE2:
+            unpack2(HEADER_FORMAT2, data[offset:], self)
+        elif length >= HEADER_SIZE1:
+            unpack2(HEADER_FORMAT1, data[offset:], self)
+
+        self.fonttype = VANILLA
+        scheme = self.codingscheme.upper()
+        if scheme.startswith("TEX MATH SY"):
+            self.fonttype = MATHSY
+        elif scheme.startswith("TEX MATH EX"):
+            self.fonttype = MATHEX
+
+        self.fontdimens = {}
+        for i in range(sizes.np):
+            name = f"PARAMETER{i+1}"
+            if i <= 6:
+                name = BASE_PARAMS[i]
+            elif self.fonttype == MATHSY and i <= 21:
+                name = MATHSY_PARAMS[i - 7]
+            elif self.fonttype == MATHEX and i <= 12:
+                name = MATHEX_PARAMS[i - 7]
+            read_fixed(param(i), name, self.fontdimens)
+
+        lig_kern_map = {}
+        self.right_boundary_char = None
+        self.left_boundary_char = None
+        if sizes.nl > 0:
+            cmd = lig_kern_command(lig_step(0))
+            if cmd.skip_byte == 255:
+                self.right_boundary_char = cmd.next_char
+
+            cmd = lig_kern_command(lig_step((sizes.nl - 1)))
+            if cmd.skip_byte == 255:
+                self.left_boundary_char = 256
+                r = 256 * cmd.op_byte + cmd.remainder
+                lig_kern_map[self.left_boundary_char] = r
+
+        self.chars = {}
+        for c in range(sizes.bc, sizes.ec + 1):
+            if width_index(c) > 0:
+                self.chars[c] = info = {}
+                info["width"] = width(c)
+                if height_index(c) > 0:
+                    info["height"] = height(c)
+                if depth_index(c) > 0:
+                    info["depth"] = depth(c)
+                if italic_index(c) > 0:
+                    info["italic"] = italic(c)
+                char_tag = tag(c)
+                if char_tag == NO_TAG:
+                    pass
+                elif char_tag == LIG_TAG:
+                    lig_kern_map[c] = remainder(c)
+                elif char_tag == LIST_TAG:
+                    info["nextlarger"] = remainder(c)
+                elif char_tag == EXT_TAG:
+                    info["varchar"] = varchar = {}
+                    for i in range(4):
+                        part = data[exten(c) + i]
+                        if i == 3 or part > 0:
+                            name = "rep"
+                            if i == 0:
+                                name = "top"
+                            elif i == 1:
+                                name = "mid"
+                            elif i == 2:
+                                name = "bot"
+                            if noneexistent(part):
+                                varchar[name] = c
+                            else:
+                                varchar[name] = part
+
+        self.ligatures = {}
+        self.kerning = {}
+        for c, i in sorted(lig_kern_map.items()):
+            cmd = lig_kern_command(lig_step(i))
+            if cmd.skip_byte > STOP_FLAG:
+                i = 256 * cmd.op_byte + cmd.remainder
+
+            while i < sizes.nl:
+                cmd = lig_kern_command(lig_step(i))
+                if cmd.skip_byte > STOP_FLAG:
+                    pass
+                else:
+                    if cmd.op_byte >= KERN_FLAG:
+                        r = 256 * (cmd.op_byte - KERN_FLAG) + cmd.remainder
+                        self.kerning.setdefault(c, {})[cmd.next_char] = kern(r)
+                    else:
+                        r = cmd.op_byte
+                        if r == 4 or (r > 7 and r != 11):
+                            # Ligature step with nonstandard code, we output
+                            # the code verbatim.
+                            lig = r
+                        else:
+                            lig = ""
+                            if r % 4 > 1:
+                                lig += "/"
+                            lig += "LIG"
+                            if r % 2 != 0:
+                                lig += "/"
+                            while r > 3:
+                                lig += ">"
+                                r -= 4
+                        self.ligatures.setdefault(c, {})[cmd.next_char] = (
+                            lig,
+                            cmd.remainder,
+                        )
+
+                if cmd.skip_byte >= STOP_FLAG:
+                    break
+                i += cmd.skip_byte + 1
+
+
+if __name__ == "__main__":
+    import sys
+
+    tfm = TFM(sys.argv[1])
+    print(
+        "\n".join(
+            x
+            for x in [
+                f"tfm.checksum={tfm.checksum}",
+                f"tfm.designsize={tfm.designsize}",
+                f"tfm.codingscheme={tfm.codingscheme}",
+                f"tfm.fonttype={tfm.fonttype}",
+                f"tfm.family={tfm.family}",
+                f"tfm.seven_bit_safe_flag={tfm.seven_bit_safe_flag}",
+                f"tfm.face={tfm.face}",
+                f"tfm.extraheader={tfm.extraheader}",
+                f"tfm.fontdimens={tfm.fontdimens}",
+                f"tfm.right_boundary_char={tfm.right_boundary_char}",
+                f"tfm.left_boundary_char={tfm.left_boundary_char}",
+                f"tfm.kerning={tfm.kerning}",
+                f"tfm.ligatures={tfm.ligatures}",
+                f"tfm.chars={tfm.chars}",
+            ]
+        )
+    )
+    print(tfm)
--- a/Tests/tfmLib/data/cmex10.tfm
+++ b/Tests/tfmLib/data/cmex10.tfm
--- a/Tests/tfmLib/data/cmr10.tfm
+++ b/Tests/tfmLib/data/cmr10.tfm
--- a/Tests/tfmLib/data/cmsy10.tfm
+++ b/Tests/tfmLib/data/cmsy10.tfm
--- a/Tests/tfmLib/data/dummy-space.tfm
+++ b/Tests/tfmLib/data/dummy-space.tfm
--- a/Tests/tfmLib/tfmLib_test.py
+++ b/Tests/tfmLib/tfmLib_test.py
@ -0,0 +1,90 @@
+import glob
+import os
+
+import pytest
+
+from fontTools import tfmLib
+
+# Directory with the sample .tfm files, located next to this test module.
+DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
+
+
+@pytest.mark.parametrize("path", glob.glob(f"{DATA_DIR}/cm*.tfm"))
+def test_read(path):
+    tfm = tfmLib.TFM(path)
+    assert tfm.designsize == 10.0
+    assert tfm.fontdimens
+    assert len(tfm.fontdimens) >= 7
+    assert tfm.extraheader == {}
+    assert tfm.right_boundary_char is None
+    assert tfm.left_boundary_char is None
+    assert len(tfm.chars) == 128
+
+
+def test_read_boundary_char():
+    path = os.path.join(DATA_DIR, "dummy-space.tfm")
+    tfm = tfmLib.TFM(path)
+    assert tfm.right_boundary_char == 1
+    assert tfm.left_boundary_char == 256
+
+
+def test_read_fontdimens_vanilla():
+    path = os.path.join(DATA_DIR, "cmr10.tfm")
+    tfm = tfmLib.TFM(path)
+    assert tfm.fontdimens == {
+        "SLANT": 0.0,
+        "SPACE": 0.33333396911621094,
+        "STRETCH": 0.16666698455810547,
+        "SHRINK": 0.11111164093017578,
+        "XHEIGHT": 0.4305553436279297,
+        "QUAD": 1.0000028610229492,
+        "EXTRASPACE": 0.11111164093017578,
+    }
+
+
+def test_read_fontdimens_mathex():
+    path = os.path.join(DATA_DIR, "cmex10.tfm")
+    tfm = tfmLib.TFM(path)
+    assert tfm.fontdimens == {
+        "SLANT": 0.0,
+        "SPACE": 0.0,
+        "STRETCH": 0.0,
+        "SHRINK": 0.0,
+        "XHEIGHT": 0.4305553436279297,
+        "QUAD": 1.0000028610229492,
+        "EXTRASPACE": 0.0,
+        "DEFAULTRULETHICKNESS": 0.03999900817871094,
+        "BIGOPSPACING1": 0.11111164093017578,
+        "BIGOPSPACING2": 0.16666698455810547,
+        "BIGOPSPACING3": 0.19999980926513672,
+        "BIGOPSPACING4": 0.6000003814697266,
+        "BIGOPSPACING5": 0.10000038146972656,
+    }
+
+
+def test_read_fontdimens_mathsy():
+    path = os.path.join(DATA_DIR, "cmsy10.tfm")
+    tfm = tfmLib.TFM(path)
+    assert tfm.fontdimens == {
+        "SLANT": 0.25,
+        "SPACE": 0.0,
+        "STRETCH": 0.0,
+        "SHRINK": 0.0,
+        "XHEIGHT": 0.4305553436279297,
+        "QUAD": 1.0000028610229492,
+        "EXTRASPACE": 0.0,
+        "NUM1": 0.6765079498291016,
+        "NUM2": 0.39373207092285156,
+        "NUM3": 0.44373130798339844,
+        "DENOM1": 0.6859512329101562,
+        "DENOM2": 0.34484100341796875,
+        "SUP1": 0.41289234161376953,
+        "SUP2": 0.36289215087890625,
+        "SUP3": 0.28888893127441406,
+        "SUB1": 0.14999961853027344,
+        "SUB2": 0.24721717834472656,
+        "SUBDROP": 0.05000019073486328,
+        "SUPDROP": 0.3861083984375,
+        "DELIM1": 2.3899993896484375,
+        "DELIM2": 1.010000228881836,
+        "AXISHEIGHT": 0.25,
+    }