Clean up svgDoc compression

This is based on bungeman's https://github.com/fonttools/fonttools/pull/2627

Previously, an entire `SVG ` table would be marked as compressed if any
of the decoded SVG documents in it were compressed. Then on encoding all
SVG documents would be considered for compression. The XML format had no
means to indicate if compression was desired.

Instead, mark each svgDoc with its own compression status. When decoding,
mark the svgDoc as compressed if its data was compressed. When encoding,
try to compress the svgDoc if it is marked as compressed. In the XML
format the data itself is always written uncompressed, but an optional
`compressed` boolean attribute (defaults to false) indicates that the
svgDoc should be compressed when encoded.

We also try to make sure that older code that relies on docList containing
sequences of three items (doc, startGID, endGID) will continue to work
without modification.
This commit is contained in:
Cosimo Lupo 2022-06-09 13:02:07 +01:00
parent a367e8acf5
commit be623e257f
2 changed files with 55 additions and 16 deletions

View File

@ -14,6 +14,7 @@ except ModuleNotFoundError:
from fontTools import ttLib from fontTools import ttLib
from fontTools.subset.util import _add_method from fontTools.subset.util import _add_method
from fontTools.ttLib.tables.S_V_G_ import SVGDocument
__all__ = ["subset_glyphs"] __all__ = ["subset_glyphs"]
@ -201,10 +202,12 @@ def subset_glyphs(self, s) -> bool:
# map from original to new glyph indices (after subsetting) # map from original to new glyph indices (after subsetting)
glyph_index_map: Dict[int, int] = s.glyph_index_map glyph_index_map: Dict[int, int] = s.glyph_index_map
new_docs: List[Tuple[bytes, int, int]] = [] new_docs: List[SVGDocument] = []
for doc, start, end in self.docList: for doc in self.docList:
glyphs = {glyph_order[i] for i in range(start, end + 1)}.intersection(s.glyphs) glyphs = {
glyph_order[i] for i in range(doc.startGlyphID, doc.endGlyphID + 1)
}.intersection(s.glyphs)
if not glyphs: if not glyphs:
# no intersection: we can drop the whole record # no intersection: we can drop the whole record
continue continue
@ -212,7 +215,7 @@ def subset_glyphs(self, s) -> bool:
svg = etree.fromstring( svg = etree.fromstring(
# encode because fromstring dislikes xml encoding decl if input is str. # encode because fromstring dislikes xml encoding decl if input is str.
# SVG xml encoding must be utf-8 as per OT spec. # SVG xml encoding must be utf-8 as per OT spec.
doc.encode("utf-8"), doc.data.encode("utf-8"),
parser=etree.XMLParser( parser=etree.XMLParser(
# Disable libxml2 security restrictions to support very deep trees. # Disable libxml2 security restrictions to support very deep trees.
# Without this we would get an error like this: # Without this we would get an error like this:
@ -241,7 +244,7 @@ def subset_glyphs(self, s) -> bool:
new_gids = (glyph_index_map[i] for i in gids) new_gids = (glyph_index_map[i] for i in gids)
for start, end in ranges(new_gids): for start, end in ranges(new_gids):
new_docs.append((new_doc, start, end)) new_docs.append(SVGDocument(new_doc, start, end, doc.compressed))
self.docList = new_docs self.docList = new_docs

View File

@ -17,9 +17,11 @@ The XML format is:
</SVG> </SVG>
""" """
from fontTools.misc.textTools import bytesjoin, strjoin, tobytes, tostr from fontTools.misc.textTools import bytesjoin, safeEval, strjoin, tobytes, tostr
from fontTools.misc import sstruct from fontTools.misc import sstruct
from . import DefaultTable from . import DefaultTable
from collections.abc import Sequence
from dataclasses import dataclass, astuple
from io import BytesIO from io import BytesIO
import struct import struct
import logging import logging
@ -75,15 +77,18 @@ class table_S_V_G_(DefaultTable.DefaultTable):
start = entry.svgDocOffset + subTableStart start = entry.svgDocOffset + subTableStart
end = start + entry.svgDocLength end = start + entry.svgDocLength
doc = data[start:end] doc = data[start:end]
compressed = False
if doc.startswith(b"\x1f\x8b"): if doc.startswith(b"\x1f\x8b"):
import gzip import gzip
bytesIO = BytesIO(doc) bytesIO = BytesIO(doc)
with gzip.GzipFile(None, "r", fileobj=bytesIO) as gunzipper: with gzip.GzipFile(None, "r", fileobj=bytesIO) as gunzipper:
doc = gunzipper.read() doc = gunzipper.read()
self.compressed = True
del bytesIO del bytesIO
compressed = True
doc = tostr(doc, "utf_8") doc = tostr(doc, "utf_8")
self.docList.append( [doc, entry.startGlyphID, entry.endGlyphID] ) self.docList.append(
SVGDocument(doc, entry.startGlyphID, entry.endGlyphID, compressed)
)
def compile(self, ttFont): def compile(self, ttFont):
version = 0 version = 0
@ -96,9 +101,13 @@ class table_S_V_G_(DefaultTable.DefaultTable):
entryList.append(datum) entryList.append(datum)
curOffset = len(datum) + doc_index_entry_format_0Size*numEntries curOffset = len(datum) + doc_index_entry_format_0Size*numEntries
seenDocs = {} seenDocs = {}
for doc, startGlyphID, endGlyphID in self.docList: allCompressed = getattr(self, "compressed", False)
docBytes = tobytes(doc, encoding="utf_8") for i, doc in enumerate(self.docList):
if getattr(self, "compressed", False) and not docBytes.startswith(b"\x1f\x8b"): if isinstance(doc, (list, tuple)):
doc = SVGDocument(*doc)
self.docList[i] = doc
docBytes = tobytes(doc.data, encoding="utf_8")
if (allCompressed or doc.compressed) and not docBytes.startswith(b"\x1f\x8b"):
import gzip import gzip
bytesIO = BytesIO() bytesIO = BytesIO()
with gzip.GzipFile(None, "w", fileobj=bytesIO) as gzipper: with gzip.GzipFile(None, "w", fileobj=bytesIO) as gzipper:
@ -115,7 +124,7 @@ class table_S_V_G_(DefaultTable.DefaultTable):
curOffset += docLength curOffset += docLength
seenDocs[docBytes] = docOffset seenDocs[docBytes] = docOffset
docList.append(docBytes) docList.append(docBytes)
entry = struct.pack(">HHLL", startGlyphID, endGlyphID, docOffset, docLength) entry = struct.pack(">HHLL", doc.startGlyphID, doc.endGlyphID, docOffset, docLength)
entryList.append(entry) entryList.append(entry)
entryList.extend(docList) entryList.extend(docList)
svgDocData = bytesjoin(entryList) svgDocData = bytesjoin(entryList)
@ -127,10 +136,16 @@ class table_S_V_G_(DefaultTable.DefaultTable):
return data return data
def toXML(self, writer, ttFont): def toXML(self, writer, ttFont):
for doc, startGID, endGID in self.docList: for i, doc in enumerate(self.docList):
writer.begintag("svgDoc", startGlyphID=startGID, endGlyphID=endGID) if isinstance(doc, (list, tuple)):
doc = SVGDocument(*doc)
self.docList[i] = doc
attrs = {"startGlyphID": doc.startGlyphID, "endGlyphID": doc.endGlyphID}
if doc.compressed:
attrs["compressed"] = 1
writer.begintag("svgDoc", **attrs)
writer.newline() writer.newline()
writer.writecdata(doc) writer.writecdata(doc.data)
writer.newline() writer.newline()
writer.endtag("svgDoc") writer.endtag("svgDoc")
writer.newline() writer.newline()
@ -143,7 +158,8 @@ class table_S_V_G_(DefaultTable.DefaultTable):
doc = doc.strip() doc = doc.strip()
startGID = int(attrs["startGlyphID"]) startGID = int(attrs["startGlyphID"])
endGID = int(attrs["endGlyphID"]) endGID = int(attrs["endGlyphID"])
self.docList.append( [doc, startGID, endGID] ) compressed = bool(safeEval(attrs.get("compressed", "0")))
self.docList.append(SVGDocument(doc, startGID, endGID, compressed))
else: else:
log.warning("Unknown %s %s", name, content) log.warning("Unknown %s %s", name, content)
@ -157,3 +173,23 @@ class DocumentIndexEntry(object):
def __repr__(self): def __repr__(self):
return "startGlyphID: %s, endGlyphID: %s, svgDocOffset: %s, svgDocLength: %s" % (self.startGlyphID, self.endGlyphID, self.svgDocOffset, self.svgDocLength) return "startGlyphID: %s, endGlyphID: %s, svgDocOffset: %s, svgDocLength: %s" % (self.startGlyphID, self.endGlyphID, self.svgDocOffset, self.svgDocLength)
@dataclass
class SVGDocument(Sequence):
    """A single SVG document entry of the SVG table's ``docList``.

    Fields are available as named attributes. For backward compatibility the
    instance also behaves like a read-only sequence of exactly three items,
    ``(data, startGlyphID, endGlyphID)``, so it can be indexed, iterated and
    unpacked like the 3-item lists that ``docList`` used to contain.
    """

    data: str
    startGlyphID: int
    endGlyphID: int
    # Not part of the sequence view; only reachable as `doc.compressed`.
    compressed: bool = False

    # Previously, the SVG table's docList attribute contained lists of 3 items:
    # [doc, startGlyphID, endGlyphID]; later, we added a `compressed` attribute.
    # For backward compatibility with code that depends on them being sequences
    # of fixed length=3, we subclass the Sequence abstract base class and pretend
    # only the first three items are present. 'compressed' is only accessible via
    # named attribute lookup like regular dataclasses: i.e. `doc.compressed`,
    # not `doc[3]`.
    def __getitem__(self, index):
        """Return the item or slice of ``(data, startGlyphID, endGlyphID)``.

        ``index`` is applied to a plain 3-tuple, so ints, negative ints and
        slices work as they do for tuples, and an out-of-range int raises
        ``IndexError`` (which is also what terminates iteration).
        """
        # Build the 3-tuple directly instead of going through
        # dataclasses.astuple(), which would copy every field (including
        # `compressed`) on each access only to slice the extra one off again.
        return (self.data, self.startGlyphID, self.endGlyphID)[index]

    def __len__(self):
        """Return 3: the sequence view never exposes `compressed`."""
        return 3