[cantarell-fonts] Add script to test font coverage against character sets.



commit 091805588902f8f14c08df3ea84c245d1c1c9d2c
Author: Nikolaus Waxweiler <madigens gmail com>
Date:   Fri Jan 15 17:36:47 2016 +0100

    Add script to test font coverage against character sets.

 scripts/Makefile.am      |    3 +-
 scripts/test-coverage.py |   64 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 1 deletions(-)
---
diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 332ad03..cfb5dd8 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -1,5 +1,6 @@
 noinst_DATA = \
-       generate.sh
+       generate.sh \
+       test-coverage.py
 
 EXTRA_DIST = $(noinst_DATA)
 
diff --git a/scripts/test-coverage.py b/scripts/test-coverage.py
new file mode 100755
index 0000000..e4a25c8
--- /dev/null
+++ b/scripts/test-coverage.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+#
+# Test the Unicode coverage of one or more font files against lists of Unicode
+# code points, currently the Adobe Latin 4 (precomposed) and Cyrillic 2 lists.
+#
+# https://adobe-type-tools.github.io/adobe-latin-charsets/
+# https://adobe-type-tools.github.io/adobe-cyrillic-charsets/
+
+import os
+import argparse
+from fontTools.ttLib import TTFont
+from urllib.request import urlopen
+
+parser = argparse.ArgumentParser()
+parser.add_argument("fonts", nargs='+',
+                    help="One or more font files (.otf/.ttf) you want to test for coverage.")
+args = parser.parse_args()
+
+charset_list = [
+"https://adobe-type-tools.github.io/adobe-latin-charsets/adobe-latin-4-precomposed.txt";,
+"https://adobe-type-tools.github.io/adobe-cyrillic-charsets/adobe-cyrillic-2.txt";
+]
+
+for charset in charset_list:
+  charset_table = {}
+
+  # Parse charset file into charset_table.
+  with urlopen(charset) as c:
+    # Split table manually and slice off header.
+    raw_table = c.read().decode().split("\n")[1:]
+
+    # We care only about the first column, which holds the code point as a
+    # hex string, and the fourth column (index 3), which holds its plain
+    # English name. The hex string must be converted from e.g. "20AE" to the
+    # int 8366.
+    for raw_line in raw_table:
+      if raw_line: # Skip empty lines.
+        sliced_line = raw_line.split("\t")
+        charset_table[int(sliced_line[0], 16)] = sliced_line[3] # { 8366: "TUGRIK SIGN", ... }
+
+  charset_set = frozenset(charset_table.keys())
+
+  # Now compare each given font against this charset.
+  for font_file in args.fonts:
+    font = TTFont(font_file)
+
+    # A font can contain multiple cmap subtables that map Unicode code points
+    # (e.g. U+0020) to glyph names ("space"). We want the code points from all
+    # Unicode subtables, flattened into one (unique) set.
+    codepoints = [[y[0] for y in x.cmap.items()]
+                  for x in font['cmap'].tables if x.isUnicode()]
+    codepoints_set = frozenset([item for sublist in codepoints
+                               for item in sublist])
+    missing_codepoints = charset_set.difference(codepoints_set)
+
+    if missing_codepoints:
+      font_filename = os.path.basename(font_file)
+      charset_filename = charset.rpartition("/")[-1]
+      print("\n" + font_filename + " is missing these code points from " + charset_filename + ":")
+
+      for m in missing_codepoints:
+        print("U+" + format(m, "04X") + " " + charset_table[m])
+
+    font.close()
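
For anyone who wants to play with the idea without the HTTP fetch, here is a
minimal sketch (not part of the commit) that checks a single font against a
locally saved charset file. It assumes a fontTools release that provides
cmap.getBestCmap() and the same tab-separated layout as the Adobe charset
files above (code point in the first column, character name in the fourth);
the file names are placeholders.

#!/usr/bin/env python3
# Minimal sketch: compare one font against one locally saved Adobe charset
# file. Assumes cmap.getBestCmap() is available and that the charset file
# keeps the code point in column 1 and the character name in column 4.

import sys
from fontTools.ttLib import TTFont

def coverage_gaps(font_path, charset_path):
    # Read the tab-separated charset table, skipping the header line.
    with open(charset_path, encoding="utf-8") as f:
        rows = [line.split("\t") for line in f.read().splitlines()[1:] if line]
    wanted = {int(row[0], 16): row[3] for row in rows}

    # getBestCmap() merges the preferred Unicode cmap subtable into a single
    # {code point: glyph name} dict, so no manual flattening is needed.
    font = TTFont(font_path)
    covered = set(font["cmap"].getBestCmap().keys())
    font.close()

    return {cp: name for cp, name in wanted.items() if cp not in covered}

if __name__ == "__main__":
    # e.g.: ./coverage-sketch.py SomeFont.otf adobe-latin-4-precomposed.txt
    for cp, name in sorted(coverage_gaps(sys.argv[1], sys.argv[2]).items()):
        print("U+{:04X} {}".format(cp, name))

The committed script itself is run against one or more built fonts, e.g.
./scripts/test-coverage.py Cantarell-Regular.otf Cantarell-Bold.otf (the font
file names here are placeholders); it fetches the charsets over HTTP instead
of reading them from disk.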

