subset/svg: support resolving cross-references and pruning elements
With this I can finally follow xlink:href and url(#...) references within the SVG document and subset its elements accordingly, so that only the elements reachable from the initial set of glyph elements are kept.
This commit is contained in:
parent
05e6d577cd
commit
a4df567718
@ -1,9 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from itertools import groupby
|
||||
from itertools import chain, count, groupby
|
||||
from typing import Dict, Iterable, Iterator, List, Set, Tuple
|
||||
|
||||
try:
|
||||
from lxml import etree
|
||||
except ModuleNotFoundError:
|
||||
# lxml is required for subsetting SVG, but we prefer to delay the import error
|
||||
# until subset_glyphs() is called (i.e. if font to subset has an 'SVG ' table)
|
||||
etree = None
|
||||
|
||||
from fontTools import ttLib
|
||||
@ -15,21 +20,117 @@ __all__ = ["subset_glyphs"]
|
||||
|
||||
# Matches element ids of the form "glyph<GID>" and captures the decimal glyph
# index. A raw string is used so that "\d" reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
GID_RE = re.compile(r"^glyph(\d+)$")
|
||||
|
||||
NAMESPACES = {
|
||||
"svg": "http://www.w3.org/2000/svg",
|
||||
"xlink": "http://www.w3.org/1999/xlink",
|
||||
}
|
||||
XLINK_HREF = f'{{{NAMESPACES["xlink"]}}}href'
|
||||
|
||||
XLINK_HREF = "{http://www.w3.org/1999/xlink}href"
|
||||
if etree is not None:
|
||||
# it's more efficient to compile XPath objects upfront than calling the same
|
||||
# Element.xpath() several times
|
||||
find_svg_elements_with_id = etree.XPath(".//svg:*[@id]", namespaces=NAMESPACES)
|
||||
|
||||
# We currently support xlink:href (as used by <use> and gradient templates),
|
||||
# and local url(#...) links found in fill or clip-path attributes
|
||||
# TODO(anthrotype): Check we aren't missing other supported kinds of reference
|
||||
find_svg_elements_with_references = etree.XPath(
|
||||
".//svg:*[ "
|
||||
"starts-with(@xlink:href, '#') "
|
||||
"or starts-with(@fill, 'url(#') "
|
||||
"or starts-with(@clip-path, 'url(#') "
|
||||
"or contains(@style, ':url(#') "
|
||||
"]",
|
||||
namespaces=NAMESPACES,
|
||||
)
|
||||
|
||||
find_svg_elements_with_glyph_href = etree.XPath(
|
||||
".//svg:*[starts-with(@xlink:href, '#glyph')]", namespaces=NAMESPACES
|
||||
)
|
||||
|
||||
|
||||
def remap_glyph_ids(svg, glyph_index_map):
|
||||
def group_elements_by_id(tree: etree.Element) -> Dict[str, etree.Element]:
    """Index the elements found by ``find_svg_elements_with_id`` by their id.

    Returns a dict mapping each ``id`` attribute value to its element. If the
    same id occurs more than once, the element encountered last wins.
    """
    by_id = {}
    for element in find_svg_elements_with_id(tree):
        by_id[element.attrib["id"]] = element
    return by_id
|
||||
|
||||
|
||||
def parse_css_declarations(style_attr: str) -> Dict[str, str]:
    """Parse the text of a ``style`` attribute into a ``{property: value}`` dict.

    Declarations are separated by ";" and each one is split on its *first*
    ":" only, so values that themselves contain a colon (for example
    ``fill:url(http://example.com/x#a)``) are accepted. Property names and
    values are stripped of surrounding whitespace; blank declarations (such
    as the one produced by a trailing ";") are ignored.

    Raises:
        ValueError: if a non-blank declaration contains no ":" at all.
    """
    # https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute/style
    # https://developer.mozilla.org/en-US/docs/Web/CSS/Syntax#css_declarations
    result = {}
    for declaration in style_attr.split(";"):
        property_name, colon, value = declaration.partition(":")
        if colon:
            result[property_name.strip()] = value.strip()
        elif declaration.strip():
            raise ValueError(f"Invalid CSS declaration syntax: {declaration}")
    return result
|
||||
|
||||
|
||||
def iter_referenced_ids(tree: etree.Element) -> Iterator[str]:
    """Yield the ids referenced from *tree* itself or from elements beneath it.

    Two kinds of local reference are recognised:

    * ``xlink:href="#some-id"``
    * ``url(#some-id)`` as the value of ``fill`` or ``clip-path``, given
      either as an attribute or as a declaration inside ``style``.

    Ids may be yielded more than once. Empty references (``href="#"`` or
    ``url(#)``) are skipped rather than treated as an error.

    Raises:
        ValueError: propagated from ``parse_css_declarations`` when a
            ``style`` attribute cannot be parsed.
    """
    for el in chain([tree], find_svg_elements_with_references(tree)):
        href = el.attrib.get(XLINK_HREF)
        if href is not None and href.startswith("#"):
            ref_id = href[1:]
            if ref_id:
                yield ref_id

        # Copy into a plain dict so that the style declarations can be merged
        # on top of the attributes without touching the element itself.
        attrs = dict(el.attrib)
        if "style" in attrs:
            attrs.update(parse_css_declarations(attrs["style"]))
        for attr in ("fill", "clip-path"):
            value = attrs.get(attr)
            if value is None:
                continue
            if value.startswith("url(#") and value.endswith(")"):
                # strip the leading 'url(#' and the trailing ')'
                ref_id = value[5:-1]
                if ref_id:
                    yield ref_id
|
||||
|
||||
|
||||
def closure_element_ids(
    elements: Dict[str, etree.Element], element_ids: Set[str]
) -> None:
    """Grow *element_ids* in place to its closure under references.

    Starting from the ids already in *element_ids*, repeatedly look up each
    id in *elements*, collect the ids yielded by ``iter_referenced_ids`` for
    that element, and add the ones not seen yet, until a pass finds nothing
    new. Ids that have no entry in *elements* are kept in the set but are
    not followed.
    """
    frontier = element_ids
    while frontier:
        discovered: Set[str] = set()
        # dangling references are ignored; it's not our job to validate the svg
        for el in (elements[i] for i in frontier if i in elements):
            discovered.update(iter_referenced_ids(el))
        # only ids we have never seen before need another pass
        discovered.difference_update(element_ids)
        element_ids.update(discovered)
        frontier = discovered
|
||||
|
||||
|
||||
def subset_elements(el: etree.Element, ids: Set[str]) -> bool:
    """Prune *el*'s subtree down to the elements whose id is in *ids*.

    An element is kept, together with its whole subtree, when its own id is
    in *ids*. Otherwise it is kept only if at least one of its children is
    kept. An element that is not kept is removed from its parent.

    Returns True if *el* was kept and False otherwise. When *el* has no
    parent (e.g. it is the root element) there is nothing to detach it from,
    so it is left in place and False is returned for the caller to act upon.
    """
    el_id = el.attrib.get("id")
    if el_id is not None and el_id in ids:
        # if id is in the set, don't recurse; keep whole subtree
        return True

    # Iterate over a snapshot of the children, since the recursive calls may
    # remove them from `el`. The results are collected in a list (instead of
    # feeding a generator to any()) so that every child gets visited and
    # pruned even after one of them has already been found worth keeping.
    kept = [subset_elements(child, ids) for child in list(el)]
    if any(kept):
        return True

    parent = el.getparent()
    if parent is not None:
        parent.remove(el)
    return False
|
||||
|
||||
|
||||
def remap_glyph_ids(
|
||||
elements: Dict[str, etree.Element], glyph_index_map: Dict[int, int]
|
||||
) -> Dict[str, str]:
|
||||
# Given {old_gid: new_gid} map, rename all elements containing id="glyph{gid}"
|
||||
# special attributes
|
||||
id_map = {}
|
||||
href_elements = []
|
||||
for el in svg.iter("*"):
|
||||
# we'll rename these later
|
||||
if XLINK_HREF in el.attrib and el.attrib[XLINK_HREF].startswith("#"):
|
||||
href_elements.append(el)
|
||||
|
||||
el_id = el.attrib.get("id")
|
||||
if el_id is None:
|
||||
continue
|
||||
for el_id, el in elements.items():
|
||||
m = GID_RE.match(el_id)
|
||||
if not m:
|
||||
continue
|
||||
@ -40,34 +141,31 @@ def remap_glyph_ids(svg, glyph_index_map):
|
||||
continue
|
||||
new_id = f"glyph{new_index}"
|
||||
else:
|
||||
# If the old id is missing, the element correspond to a glyph that was
|
||||
# If the old index is missing, the element corresponds to a glyph that was
|
||||
# excluded from the font's subset.
|
||||
# For now we keep it around, renamed to avoid clashes with the new GID
|
||||
# (though technically there could still be clashes even after we insert
|
||||
# a tilde at the beginning, e.g. '~glyphXXX' is still a valid id...).
|
||||
# TODO Figure out how to best prune the SVG document of unused elements.
|
||||
# https://github.com/fonttools/fonttools/issues/534
|
||||
new_id = f"~{el_id}"
|
||||
# We rename it to avoid clashes with the new GIDs or other element ids.
|
||||
new_id = f".{el_id}"
|
||||
n = count(1)
|
||||
while new_id in elements:
|
||||
new_id = f"{new_id}.{next(n)}"
|
||||
|
||||
id_map[el_id] = new_id
|
||||
el.attrib["id"] = new_id
|
||||
|
||||
if not id_map:
|
||||
return
|
||||
return id_map
|
||||
|
||||
# update xlink:href="#..." that refer to the old id to point to the new one
|
||||
for el in href_elements:
|
||||
ref = el.attrib[XLINK_HREF]
|
||||
# we only care about local #fragment identifiers
|
||||
assert ref.startswith("#")
|
||||
old_id = ref[1:]
|
||||
|
||||
def update_glyph_href_links(svg: etree.Element, id_map: Dict[str, str]) -> None:
    """Retarget ``xlink:href`` attributes according to *id_map*.

    For each element returned by ``find_svg_elements_with_glyph_href``, the
    id after the leading "#" is looked up in *id_map* (old id -> new id); if
    present, the attribute is rewritten in place to "#<new id>". Links whose
    target is not in the map are left untouched.
    """
    for link in find_svg_elements_with_glyph_href(svg):
        # drop the leading '#' to get the bare element id
        target = link.attrib[XLINK_HREF][1:]
        try:
            renamed = id_map[target]
        except KeyError:
            continue
        link.attrib[XLINK_HREF] = f"#{renamed}"
|
||||
|
||||
|
||||
def ranges(ints):
|
||||
"""Yield (min, max) ranges of consecutive integers from the input set"""
|
||||
def ranges(ints: Iterable[int]) -> Iterator[Tuple[int, int]]:
|
||||
# Yield (min, max) ranges of consecutive integers from the input set
|
||||
sorted_ints = sorted(set(ints))
|
||||
# to group together consecutive ints, we use as 'key' the difference
|
||||
# between their index in the (sorted) list and themselves, which stays
|
||||
@ -78,43 +176,54 @@ def ranges(ints):
|
||||
|
||||
|
||||
@_add_method(ttLib.getTableClass("SVG "))
|
||||
def subset_glyphs(self, s):
|
||||
def subset_glyphs(self, s) -> bool:
|
||||
if etree is None:
|
||||
raise ModuleNotFoundError("No module named 'lxml', required to subset SVG")
|
||||
|
||||
# ordered list of glyph names (before subsetting)
|
||||
glyph_order = s.orig_glyph_order
|
||||
# glyph names (before subsetting)
|
||||
glyph_order: List[str] = s.orig_glyph_order
|
||||
# map from glyph names to original glyph indices
|
||||
rev_orig_glyph_map = s.reverseOrigGlyphMap
|
||||
rev_orig_glyph_map: Dict[str, int] = s.reverseOrigGlyphMap
|
||||
# map from original to new glyph indices (after subsetting)
|
||||
glyph_index_map = s.glyph_index_map
|
||||
glyph_index_map: Dict[int, int] = s.glyph_index_map
|
||||
|
||||
new_docs = []
|
||||
for doc, start_gid, end_gid in self.docList:
|
||||
old_glyphs = {glyph_order[i] for i in range(start_gid, end_gid + 1)}
|
||||
new_glyphs = old_glyphs.intersection(s.glyphs)
|
||||
if not new_glyphs:
|
||||
new_docs: List[Tuple[bytes, int, int]] = []
|
||||
for doc, start, end in self.docList:
|
||||
|
||||
glyphs = {glyph_order[i] for i in range(start, end + 1)}.intersection(s.glyphs)
|
||||
if not glyphs:
|
||||
# no intersection: we can drop the whole record
|
||||
continue
|
||||
|
||||
# NOTE If new_glyphs != old_glyphs, there's only a partial intersection: i.e.
|
||||
# we'll likely end up with unused garbage until we figure out how to prune
|
||||
# the unused references from the SVG doc.
|
||||
svg = etree.fromstring(
|
||||
doc,
|
||||
parser=etree.XMLParser(
|
||||
# Disable libxml2 security restrictions to support very deep trees.
|
||||
# Without this we would get an error like this:
|
||||
# `lxml.etree.XMLSyntaxError: internal error: Huge input lookup`
|
||||
# when parsing big fonts e.g. noto-emoji-picosvg.ttf.
|
||||
huge_tree=True,
|
||||
# ignore blank text as it's not meaningful in OT-SVG; it also prevents
|
||||
# dangling tail text after removing an element when pretty_print=True
|
||||
remove_blank_text=True,
|
||||
),
|
||||
)
|
||||
|
||||
# NOTE huge_tree=True disables security restrictions and support very deep trees
|
||||
# and very long text content. Without it I would get an error like this:
|
||||
# `lxml.etree.XMLSyntaxError: internal error: Huge input lookup`
|
||||
# when parsing noto-emoji-picosvg.svg from googlefonts/color-fonts.
|
||||
# This is lxml-only API, won't work with built-in ElementTree...
|
||||
svg = etree.fromstring(doc, parser=etree.XMLParser(huge_tree=True))
|
||||
elements = group_elements_by_id(svg)
|
||||
gids = {rev_orig_glyph_map[g] for g in glyphs}
|
||||
element_ids = {f"glyph{i}" for i in gids}
|
||||
closure_element_ids(elements, element_ids)
|
||||
subset_elements(svg, element_ids)
|
||||
|
||||
remap_glyph_ids(svg, glyph_index_map)
|
||||
if not s.options.retain_gids:
|
||||
id_map = remap_glyph_ids(elements, glyph_index_map)
|
||||
update_glyph_href_links(svg, id_map)
|
||||
|
||||
new_doc = etree.tostring(svg)
|
||||
new_doc = etree.tostring(svg, pretty_print=s.options.pretty_svg)
|
||||
|
||||
new_gids = {glyph_index_map[rev_orig_glyph_map[g]] for g in new_glyphs}
|
||||
for start_gid, end_gid in ranges(new_gids):
|
||||
new_docs.append((new_doc, start_gid, end_gid))
|
||||
new_gids = (glyph_index_map[i] for i in gids)
|
||||
for start, end in ranges(new_gids):
|
||||
new_docs.append((new_doc, start, end))
|
||||
|
||||
self.docList = new_docs
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user