subset/svg: support resolving cross-references and pruning elements

With this I can finally follow xlink:href and url(#...) sort of references within the SVG doc and subset the elements accordingly so that only those that are reachable from the initial set of glyph elements are kept.
2021-11-12 17:04:03 +00:00 · 2021-11-12 17:04:03 +00:00 · a4df567718
commit a4df567718
parent 05e6d577cd
1 changed files with 162 additions and 53 deletions
--- a/Lib/fontTools/subset/svg.py
+++ b/Lib/fontTools/subset/svg.py
@ -1,9 +1,14 @@
+from __future__ import annotations
+
 import re
-from itertools import groupby
+from itertools import chain, count, groupby
+from typing import Dict, Iterable, Iterator, List, Set, Tuple

 try:
    from lxml import etree
 except ModuleNotFoundError:
+    # lxml is required for subsetting SVG, but we prefer to delay the import error
+    # until subset_glyphs() is called (i.e. if font to subset has an 'SVG ' table)
    etree = None

 from fontTools import ttLib
@ -15,21 +20,117 @@ __all__ = ["subset_glyphs"]

 GID_RE = re.compile("^glyph(\d+)$")

+NAMESPACES = {
+    "svg": "http://www.w3.org/2000/svg",
+    "xlink": "http://www.w3.org/1999/xlink",
+}
+XLINK_HREF = f'{{{NAMESPACES["xlink"]}}}href'

-XLINK_HREF = "{http://www.w3.org/1999/xlink}href"
+if etree is not None:
+    # it's more efficient to compile XPath objects upfront than calling the same
+    # Element.xpath() several times
+    find_svg_elements_with_id = etree.XPath(".//svg:*[@id]", namespaces=NAMESPACES)
+
+    # We currently support xlink:href (as used by <use> and gradient templates),
+    # and local url(#...) links found in fill or clip-path attributes
+    # TODO(anthrotype): Check we aren't missing other supported kinds of reference
+    find_svg_elements_with_references = etree.XPath(
+        ".//svg:*[ "
+        "starts-with(@xlink:href, '#') "
+        "or starts-with(@fill, 'url(#') "
+        "or starts-with(@clip-path, 'url(#') "
+        "or contains(@style, ':url(#') "
+        "]",
+        namespaces=NAMESPACES,
+    )
+
+    find_svg_elements_with_glyph_href = etree.XPath(
+        ".//svg:*[starts-with(@xlink:href, '#glyph')]", namespaces=NAMESPACES
+    )


-def remap_glyph_ids(svg, glyph_index_map):
+def group_elements_by_id(tree: etree.Element) -> Dict[str, etree.Element]:
+    return {el.attrib["id"]: el for el in find_svg_elements_with_id(tree)}
+
+
+def parse_css_declarations(style_attr: str) -> Dict[str, str]:
+    # https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute/style
+    # https://developer.mozilla.org/en-US/docs/Web/CSS/Syntax#css_declarations
+    result = {}
+    for declaration in style_attr.split(";"):
+        if declaration.count(":") == 1:
+            property_name, value = declaration.split(":")
+            property_name = property_name.strip()
+            result[property_name] = value.strip()
+        elif declaration.strip():
+            raise ValueError(f"Invalid CSS declaration syntax: {declaration}")
+    return result
+
+
+def iter_referenced_ids(tree: etree.Element) -> Iterator[str]:
+    # Yield all the ids that can be reached via references from within this element tree
+    for el in chain([tree], find_svg_elements_with_references(tree)):
+        if XLINK_HREF in el.attrib:
+            href = el.attrib[XLINK_HREF]
+            if href.startswith("#"):
+                ref_id = href[1:]
+                assert ref_id
+                yield ref_id
+
+        attrs = el.attrib
+        if "style" in attrs:
+            attrs = {**attrs, **parse_css_declarations(el.attrib["style"])}
+        for attr in ("fill", "clip-path"):
+            if attr in attrs:
+                value = attrs[attr]
+                if value.startswith("url(#") and value.endswith(")"):
+                    ref_id = value[5:-1]
+                    assert ref_id
+                    yield ref_id
+
+
+def closure_element_ids(
+    elements: Dict[str, etree.Element], element_ids: Set[str]
+) -> None:
+    # Expand the initial subset of element ids to include ids that can be reached
+    # via references from the initial set.
+    unvisited = element_ids
+    while unvisited:
+        referenced: Set[str] = set()
+        for el_id in unvisited:
+            if el_id not in elements:
+                # ignore dangling reference; not our job to validate svg
+                continue
+            referenced.update(iter_referenced_ids(elements[el_id]))
+        referenced -= element_ids
+        element_ids.update(referenced)
+        unvisited = referenced
+
+
+def subset_elements(el: etree.Element, ids: Set[str]) -> bool:
+    # Keep elements if their id is in the subset, or any of their children's id is.
+    # Drop elements whose id is not in the subset, and either have no children,
+    # or all their children are being dropped.
+    el_id = el.attrib.get("id")
+    if el_id is not None and el_id in ids:
+        # if id is in the set, don't recurse; keep whole subtree
+        return True
+    keep = False
+    for e in el:
+        keep |= subset_elements(e, ids)
+    if keep:
+        return True
+    el.getparent().remove(el)
+    return False
+
+
+def remap_glyph_ids(
+    elements: Dict[str, etree.Element], glyph_index_map: Dict[int, int]
+) -> Dict[str, str]:
+    # Given {old_gid: new_gid} map, rename all elements containing id="glyph{gid}"
+    # special attributes
    id_map = {}
-    href_elements = []
-    for el in svg.iter("*"):
-        # we'll rename these later
-        if XLINK_HREF in el.attrib and el.attrib[XLINK_HREF].startswith("#"):
-            href_elements.append(el)
-
-        el_id = el.attrib.get("id")
-        if el_id is None:
-            continue
+    for el_id, el in elements.items():
        m = GID_RE.match(el_id)
        if not m:
            continue
@ -40,34 +141,31 @@ def remap_glyph_ids(svg, glyph_index_map):
                continue
            new_id = f"glyph{new_index}"
        else:
-            # If the old id is missing, the element correspond to a glyph that was
+            # If the old index is missing, the element correspond to a glyph that was
            # excluded from the font's subset.
-            # For now we keep it around, renamed to avoid clashes with the new GID
-            # (though technically there could still be clashes even after we insert
-            # a tilde at the beginning, e.g. '~glyphXXX' is still a valid id...).
-            # TODO Figure out how to best prune the SVG document of unused elements.
-            # https://github.com/fonttools/fonttools/issues/534
-            new_id = f"~{el_id}"
+            # We rename it to avoid clashes with the new GIDs or other element ids.
+            new_id = f".{el_id}"
+            n = count(1)
+            while new_id in elements:
+                new_id = f"{new_id}.{next(n)}"

        id_map[el_id] = new_id
        el.attrib["id"] = new_id

-    if not id_map:
-        return
+    return id_map

-    # update xlink:href="#..." that refer to the old id to point to the new one
-    for el in href_elements:
-        ref = el.attrib[XLINK_HREF]
-        # we only care about local #fragment identifiers
-        assert ref.startswith("#")
-        old_id = ref[1:]
+
+def update_glyph_href_links(svg: etree.Element, id_map: Dict[str, str]) -> None:
+    # update all xlink:href="#glyph..." attributes to point to the new glyph ids
+    for el in find_svg_elements_with_glyph_href(svg):
+        old_id = el.attrib[XLINK_HREF][1:]
        if old_id in id_map:
            new_id = id_map[old_id]
            el.attrib[XLINK_HREF] = f"#{new_id}"


-def ranges(ints):
-    """Yield (min, max) ranges of consecutive integers from the input set"""
+def ranges(ints: Iterable[int]) -> Iterator[Tuple[int, int]]:
+    # Yield (min, max) ranges of consecutive integers from the input set
    sorted_ints = sorted(set(ints))
    # to group together consecutive ints, we use as 'key' the difference
    # between their index in the (sorted) list and themselves, which stays
@ -78,43 +176,54 @@ def ranges(ints):


@_add_method(ttLib.getTableClass("SVG "))
-def subset_glyphs(self, s):
+def subset_glyphs(self, s) -> bool:
    if etree is None:
        raise ModuleNotFoundError("No module named 'lxml', required to subset SVG")

-    # ordered list of glyph names (before subsetting)
-    glyph_order = s.orig_glyph_order
+    # glyph names (before subsetting)
+    glyph_order: List[str] = s.orig_glyph_order
    # map from glyph names to original glyph indices
-    rev_orig_glyph_map = s.reverseOrigGlyphMap
+    rev_orig_glyph_map: Dict[str, int] = s.reverseOrigGlyphMap
    # map from original to new glyph indices (after subsetting)
-    glyph_index_map = s.glyph_index_map
+    glyph_index_map: Dict[int, int] = s.glyph_index_map

-    new_docs = []
-    for doc, start_gid, end_gid in self.docList:
-        old_glyphs = {glyph_order[i] for i in range(start_gid, end_gid + 1)}
-        new_glyphs = old_glyphs.intersection(s.glyphs)
-        if not new_glyphs:
+    new_docs: List[Tuple[bytes, int, int]] = []
+    for doc, start, end in self.docList:
+
+        glyphs = {glyph_order[i] for i in range(start, end + 1)}.intersection(s.glyphs)
+        if not glyphs:
            # no intersection: we can drop the whole record
            continue

-        # NOTE If new_glyphs != old_glyphs, there's only a partial intersection: i.e.
-        # we'll likely end up with unused garbage until we figure out how to prune
-        # the unused refereces from the SVG doc.
+        svg = etree.fromstring(
+            doc,
+            parser=etree.XMLParser(
+                # Disable libxml2 security restrictions to support very deep trees.
+                # Without this we would get an error like this:
+                # `lxml.etree.XMLSyntaxError: internal error: Huge input lookup`
+                # when parsing big fonts e.g. noto-emoji-picosvg.ttf.
+                huge_tree=True,
+                # ignore blank text as it's not meaningful in OT-SVG; it also prevents
+                # dangling tail text after removing an element when pretty_print=True
+                remove_blank_text=True,
+            ),
+        )

-        # NOTE huge_tree=True disables security restrictions and support very deep trees
-        # and very long text content. Without it I would get an error like this:
-        # `lxml.etree.XMLSyntaxError: internal error: Huge input lookup`
-        # when parsing noto-emoji-picosvg.svg from googlefonts/color-fonts.
-        # This is lxml-only API, won't work with built-in ElementTree...
-        svg = etree.fromstring(doc, parser=etree.XMLParser(huge_tree=True))
+        elements = group_elements_by_id(svg)
+        gids = {rev_orig_glyph_map[g] for g in glyphs}
+        element_ids = {f"glyph{i}" for i in gids}
+        closure_element_ids(elements, element_ids)
+        subset_elements(svg, element_ids)

-        remap_glyph_ids(svg, glyph_index_map)
+        if not s.options.retain_gids:
+            id_map = remap_glyph_ids(elements, glyph_index_map)
+            update_glyph_href_links(svg, id_map)

-        new_doc = etree.tostring(svg)
+        new_doc = etree.tostring(svg, pretty_print=s.options.pretty_svg)

-        new_gids = {glyph_index_map[rev_orig_glyph_map[g]] for g in new_glyphs}
-        for start_gid, end_gid in ranges(new_gids):
-            new_docs.append((new_doc, start_gid, end_gid))
+        new_gids = (glyph_index_map[i] for i in gids)
+        for start, end in ranges(new_gids):
+            new_docs.append((new_doc, start, end))

    self.docList = new_docs