[docs] Fully document the cmap table (#2454)

* Fully document the cmap table

* Try not to completely break cmap handling while writing docs, Simon
This commit is contained in:
Simon Cozens 2021-12-02 15:32:03 +00:00 committed by GitHub
parent 0f03e6529a
commit f887389d59
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 91 additions and 15 deletions

View File

@ -1,7 +1,6 @@
``cmap``: Character to Glyph Index Mapping Table ``cmap``: Character to Glyph Index Mapping Table
------------------------------------------------ ------------------------------------------------
.. automodule:: fontTools.ttLib.tables._c_m_a_p .. autoclass:: fontTools.ttLib.tables._c_m_a_p.table__c_m_a_p
:inherited-members:
:members: .. autoclass:: fontTools.ttLib.tables._c_m_a_p.CmapSubtable
:undoc-members:

View File

@ -23,8 +23,52 @@ def _make_map(font, chars, gids):
return cmap return cmap
class table__c_m_a_p(DefaultTable.DefaultTable): class table__c_m_a_p(DefaultTable.DefaultTable):
"""Character to Glyph Index Mapping Table
This class represents the `cmap <https://docs.microsoft.com/en-us/typography/opentype/spec/cmap>`_
table, which maps between input characters (in Unicode or other system encodings)
and glyphs within the font. The ``cmap`` table contains one or more subtables
which determine the mapping of of characters to glyphs across different platforms
and encoding systems.
``table__c_m_a_p`` objects expose an accessor ``.tables`` which provides access
to the subtables, although it is normally easier to retrieve individual subtables
through the utility methods described below. To add new subtables to a font,
first determine the subtable format (if in doubt use format 4 for glyphs within
the BMP, format 12 for glyphs outside the BMP, and format 14 for Unicode Variation
Sequences) construct subtable objects with ``CmapSubtable.newSubtable(format)``,
and append them to the ``.tables`` list.
Within a subtable, the mapping of characters to glyphs is provided by the ``.cmap``
attribute.
Example::
cmap4_0_3 = CmapSubtable.newSubtable(4)
cmap4_0_3.platformID = 0
cmap4_0_3.platEncID = 3
cmap4_0_3.language = 0
cmap4_0_3.cmap = { 0xC1: "Aacute" }
cmap = newTable("cmap")
cmap.tableVersion = 0
cmap.tables = [cmap4_0_3]
"""
def getcmap(self, platformID, platEncID): def getcmap(self, platformID, platEncID):
"""Returns the first subtable which matches the given platform and encoding.
Args:
platformID (int): The platform ID. Use 0 for Unicode, 1 for Macintosh
(deprecated for new fonts), 2 for ISO (deprecated) and 3 for Windows.
encodingID (int): Encoding ID. Interpretation depends on the platform ID.
See the OpenType specification for details.
Returns:
An object which is a subclass of :py:class:`CmapSubtable` if a matching
subtable is found within the font, or ``None`` otherwise.
"""
for subtable in self.tables: for subtable in self.tables:
if (subtable.platformID == platformID and if (subtable.platformID == platformID and
subtable.platEncID == platEncID): subtable.platEncID == platEncID):
@ -32,13 +76,22 @@ class table__c_m_a_p(DefaultTable.DefaultTable):
return None # not found return None # not found
def getBestCmap(self, cmapPreferences=((3, 10), (0, 6), (0, 4), (3, 1), (0, 3), (0, 2), (0, 1), (0, 0))): def getBestCmap(self, cmapPreferences=((3, 10), (0, 6), (0, 4), (3, 1), (0, 3), (0, 2), (0, 1), (0, 0))):
"""Return the 'best' unicode cmap dictionary available in the font, """Returns the 'best' Unicode cmap dictionary available in the font
or None, if no unicode cmap subtable is available. or ``None``, if no Unicode cmap subtable is available.
By default it will search for the following (platformID, platEncID) By default it will search for the following (platformID, platEncID)
pairs: pairs in order::
(3, 10), (0, 6), (0, 4), (3, 1), (0, 3), (0, 2), (0, 1), (0, 0)
This can be customized via the cmapPreferences argument. (3, 10), # Windows Unicode full repertoire
(0, 6), # Unicode full repertoire (format 13 subtable)
(0, 4), # Unicode 2.0 full repertoire
(3, 1), # Windows Unicode BMP
(0, 3), # Unicode 2.0 BMP
(0, 2), # Unicode ISO/IEC 10646
(0, 1), # Unicode 1.1
(0, 0) # Unicode 1.0
This order can be customized via the ``cmapPreferences`` argument.
""" """
for platformID, platEncID in cmapPreferences: for platformID, platEncID in cmapPreferences:
cmapSubtable = self.getcmap(platformID, platEncID) cmapSubtable = self.getcmap(platformID, platEncID)
@ -47,12 +100,20 @@ class table__c_m_a_p(DefaultTable.DefaultTable):
return None # None of the requested cmap subtables were found return None # None of the requested cmap subtables were found
def buildReversed(self): def buildReversed(self):
"""Returns a reverse cmap such as {'one':{0x31}, 'A':{0x41,0x391}}. """Builds a reverse mapping dictionary
Iterates over all Unicode cmap tables and returns a dictionary mapping
glyphs to sets of codepoints, such as::
{
'one': {0x31}
'A': {0x41,0x391}
}
The values are sets of Unicode codepoints because The values are sets of Unicode codepoints because
some fonts map different codepoints to the same glyph. some fonts map different codepoints to the same glyph.
For example, U+0041 LATIN CAPITAL LETTER A and U+0391 For example, ``U+0041 LATIN CAPITAL LETTER A`` and ``U+0391
GREEK CAPITAL LETTER ALPHA are sometimes the same glyph. GREEK CAPITAL LETTER ALPHA`` are sometimes the same glyph.
""" """
result = {} result = {}
for subtable in self.tables: for subtable in self.tables:
@ -140,6 +201,16 @@ class table__c_m_a_p(DefaultTable.DefaultTable):
class CmapSubtable(object): class CmapSubtable(object):
"""Base class for all cmap subtable formats.
Subclasses which handle the individual subtable formats are named
``cmap_format_0``, ``cmap_format_2`` etc. Use :py:meth:`getSubtableClass`
to retrieve the concrete subclass, or :py:meth:`newSubtable` to get a
new subtable object for a given format.
The object exposes a ``.cmap`` attribute, which contains a dictionary mapping
character codepoints to glyph names.
"""
@staticmethod @staticmethod
def getSubtableClass(format): def getSubtableClass(format):
@ -148,7 +219,8 @@ class CmapSubtable(object):
@staticmethod @staticmethod
def newSubtable(format): def newSubtable(format):
"""Return a new instance of a subtable for format.""" """Return a new instance of a subtable for the given format
."""
subtableClass = CmapSubtable.getSubtableClass(format) subtableClass = CmapSubtable.getSubtableClass(format)
return subtableClass(format) return subtableClass(format)
@ -156,6 +228,9 @@ class CmapSubtable(object):
self.format = format self.format = format
self.data = None self.data = None
self.ttFont = None self.ttFont = None
self.platformID = None #: The platform ID of this subtable
self.platEncID = None #: The encoding ID of this subtable (interpretation depends on ``platformID``)
self.language = None #: The language ID of this subtable (Macintosh platform only)
def __getattr__(self, attr): def __getattr__(self, attr):
# allow lazy decompilation of subtables. # allow lazy decompilation of subtables.
@ -193,20 +268,22 @@ class CmapSubtable(object):
def getEncoding(self, default=None): def getEncoding(self, default=None):
"""Returns the Python encoding name for this cmap subtable based on its platformID, """Returns the Python encoding name for this cmap subtable based on its platformID,
platEncID, and language. If encoding for these values is not known, by default platEncID, and language. If encoding for these values is not known, by default
None is returned. That can be overriden by passing a value to the default ``None`` is returned. That can be overridden by passing a value to the ``default``
argument. argument.
Note that if you want to choose a "preferred" cmap subtable, most of the time Note that if you want to choose a "preferred" cmap subtable, most of the time
self.isUnicode() is what you want as that one only returns true for the modern, ``self.isUnicode()`` is what you want as that one only returns true for the modern,
commonly used, Unicode-compatible triplets, not the legacy ones. commonly used, Unicode-compatible triplets, not the legacy ones.
""" """
return getEncoding(self.platformID, self.platEncID, self.language, default) return getEncoding(self.platformID, self.platEncID, self.language, default)
def isUnicode(self): def isUnicode(self):
"""Returns true if the characters are interpreted as Unicode codepoints."""
return (self.platformID == 0 or return (self.platformID == 0 or
(self.platformID == 3 and self.platEncID in [0, 1, 10])) (self.platformID == 3 and self.platEncID in [0, 1, 10]))
def isSymbol(self): def isSymbol(self):
"""Returns true if the subtable is for the Symbol encoding (3,0)"""
return self.platformID == 3 and self.platEncID == 0 return self.platformID == 3 and self.platEncID == 0
def _writeCodes(self, codes, writer): def _writeCodes(self, codes, writer):