[gnome-keysign: 30/75] gpgkey: Make the UID "safe" for direct consumption



commit 4feef12c8cac584eeb70bbbd35c263807fb1980c
Author: Tobias Mueller <muelli cryptobitch de>
Date:   Sun Jul 23 10:29:50 2017 +0200

    gpgkey: Make the UID "safe" for direct consumption
    
    If we have a key with a non-UTF-8 UID, gpgme will return a proper string
    with surrogates.  These surrogates encode the undecodable bytes in a
    way that allows us to recover the original bytes later, should we
    need them.  We don't, though, because we never have to call gpg with
    the exact byte sequence again.  Hence we can make the UID safe for
    display centrally, i.e. in the gpgkey.UID class.

 keysign/KeyPresent.py    |  5 ++---
 keysign/gpgkey.py        | 39 ++++++++++++++++++++++++++++++++-------
 keysign/keyconfirm.py    |  5 ++---
 keysign/keylistwidget.py |  3 +--
 keysign/util.py          | 22 ----------------------
 tests/test_gpgmeh.py     | 12 ++++++++----
 tests/test_uids.py       | 13 +++++++------
 7 files changed, 52 insertions(+), 47 deletions(-)
---
diff --git a/keysign/KeyPresent.py b/keysign/KeyPresent.py
index 9ec7717..d5177d0 100644
--- a/keysign/KeyPresent.py
+++ b/keysign/KeyPresent.py
@@ -40,7 +40,7 @@ if  __name__ == "__main__" and __package__ is None:
 from .__init__ import __version__
 from .gpgmh import get_usable_keys
 from .QRCode import QRImage
-from .util import format_fingerprint, glib_markup_escape_rencoded_text
+from .util import format_fingerprint
 
 
 
@@ -96,8 +96,7 @@ class KeyPresentWidget(Gtk.Widget):
         self.key_id_label.set_markup(
             format_fingerprint(key.fingerprint).replace('\n', '  '))
         self.uids_label.set_markup("\n".join(
-                                        [glib_markup_escape_rencoded_text(
-                                            uid.uid.decode('utf-8', 'replace'))
+                                        [GLib.markup_escape_text(uid.uid)
                                         for uid
                                         in key.uidslist]))
         self.fingerprint_label.set_markup(format_fingerprint(key.fingerprint))
diff --git a/keysign/gpgkey.py b/keysign/gpgkey.py
index 95034c6..9b872dd 100644
--- a/keysign/gpgkey.py
+++ b/keysign/gpgkey.py
@@ -23,6 +23,22 @@ import warnings
 
 log = logging.getLogger(__name__)
 
+
+def to_valid_utf8_string(s, errors='replace', replacement='?'):
+    """Takes a string and returns a valid utf8 encodable string
+
+    Not every Python string is utf-8 encodable.
+    Take 'fo\udcf6e\udce9ba <foo@bma.d>' for example.
+    This function replaces undecodable characters with a '?'
+    """
+    try:
+        safe = s.encode('utf-8', errors=errors).decode('utf-8', errors=errors)
+    except UnicodeDecodeError:
+        # This is the Python 2 way...
+        safe = s.decode('utf-8', errors=errors).replace(u"\uFFFD", replacement)
+    return safe
+
+
 def parse_uid(uid, errors='replace'):
     """Parses a GnuPG UID into it's name, comment, and email component
     
@@ -113,7 +129,7 @@ class Key(namedtuple("Key", ["expiry", "fingerprint", "uidslist"])):
 
     @classmethod
     def from_gpgme(cls, key):
-        "Creates a new Key from an existing monkeysign key"
+        "Creates a new Key from an existing gpgme key"
         uids = [UID.from_gpgme(uid) for uid in  key.uids]
         expiry = parse_expiry(key.subkeys[0].expires)
         fingerprint = key.fpr
@@ -130,23 +146,32 @@ class UID(namedtuple("UID", "expiry uid name comment email")):
         # We expect to get raw bytes.
         # While RFC4880 demands UTF-8 encoded data,
         # real-life has produced non UTF-8 keys...
-        rawuid = uid.uid
+        rawuid = to_valid_utf8_string(uid.uid).encode('utf-8')
         log.debug("UidStr (%d): %r", len(rawuid), rawuid)
         name, comment, email = parse_uid(rawuid)
         expiry = parse_expiry(uid.expire)
 
-        return cls(expiry, rawuid, name, comment, email)
+        return cls(expiry, rawuid.decode('utf-8'),
+                   name, comment, email)
 
     @classmethod
     def from_gpgme(cls, uid):
-        "Creates a new UID from a monkeysign key"
+        "Creates a new UID from a gpgme UID"
         # Weird. I would expect the uid to be raw bytes,
         # because how would gpgme know what encoding to apply?
         # Also, you can have invalid encodings.
-        rawuid = uid.uid.encode('utf-8', 'replace')
-        name = uid.name
+        # Turns out, that Python strings can be encoded according to PEP 383
+        # which basically encodes invalid bytes as 0xDC80 + byte.
+        # That's the "surrogateescape" error handler available in Python 3.
+        # Here, we don't care about that, though. We are in the user facing
+        # abstraction for a UID. As such, we ensure that it can be rendered.
+        # So we take the string we get from gpgme and convert it to
+        # a string that can be encoded as UTF-8.
+        log.debug("UID from gpgme: %r", uid.uid)
+        rawuid = to_valid_utf8_string(uid.uid)
+        name = to_valid_utf8_string(uid.name)
         comment = '' # FIXME: uid.comment
-        email = uid.email
+        email = to_valid_utf8_string(uid.email)
         expiry = None  #  FIXME: Maybe UIDs don't expire themselves but via the binding signature
 
         return cls(expiry, rawuid, name, comment, email)
diff --git a/keysign/keyconfirm.py b/keysign/keyconfirm.py
index 3283ba6..b5c6d0e 100644
--- a/keysign/keyconfirm.py
+++ b/keysign/keyconfirm.py
@@ -47,7 +47,7 @@ if  __name__ == "__main__" and __package__ is None:
 
 from .gpgmh import get_usable_keys
 from .scan_barcode import ScalingImage
-from .util import format_fingerprint, glib_markup_escape_rencoded_text
+from .util import format_fingerprint
 
 log = logging.getLogger(__name__)
 
@@ -69,8 +69,7 @@ def format_key_header(fpr, length='2048', creation_time=None):
 def format_uidslist(uidslist):
     result = ""
     for uid in uidslist:
-        uidstr = glib_markup_escape_rencoded_text(
-            uid.uid.decode('utf-8', 'replace'))
+        uidstr = GLib.markup_escape_text(uid.uid)
         result += ("{}\n".format(uidstr))
 
     return result
diff --git a/keysign/keylistwidget.py b/keysign/keylistwidget.py
index fe85d38..e23a3e6 100644
--- a/keysign/keylistwidget.py
+++ b/keysign/keylistwidget.py
@@ -22,7 +22,6 @@ if  __name__ == "__main__" and __package__ is None:
     __package__ = str('keysign')
 
 from .gpgmh import get_usable_keys
-from .util import glib_markup_escape_rencoded_text
 
 log = logging.getLogger(__name__)
 
@@ -48,7 +47,7 @@ class ListBoxRowWithKey(Gtk.ListBoxRow):
                           for k in items}
         log.info("format dicT: %r", format_dict)
         d = {k: (log.debug("handling kv: %r %r", k, v),
-                  glib_markup_escape_rencoded_text(
+                  GLib.markup_escape_text(
                     "{}".format(v)))[1]
              for k, v in format_dict.items()}
         log.info("Formatting UID %r", d)
diff --git a/keysign/util.py b/keysign/util.py
index 89ac2e6..565ff6b 100644
--- a/keysign/util.py
+++ b/keysign/util.py
@@ -31,8 +31,6 @@ except ImportError:
 
 import requests
 
-from gi.repository import GLib
-
 from .gpgmh import fingerprint_from_keydata
 from .gpgmh import sign_keydata_and_encrypt
 
@@ -222,23 +220,3 @@ def download_key_http(address, port):
     data = requests.get(url.geturl(), timeout=5).content
     log.debug("finished downloading %d bytes", len(data))
     return data
-
-
-def glib_markup_escape_rencoded_text(s, errors='replace'):
-    """Calls GLib.markup_escape and the re-encoded text.
-    The re-encoding is for getting rid of surrogates in unicode strings.
-    Those surrogates appear when the UID contains non UTF-8 bytes, e.g.
-    latin1. gpgme will return a unicode string with those surrogates.
-    Because surrogates cannot be encoded as utf-8, we replace the
-    errornous bytes (with '?').  You can control that behaviour via the
-    errors parameter.
-    You better pass a string here that we can `encode` in first place.
-    """
-    log.debug('markup rencode escape %s %r (%r)', type(s), s, errors)
-    encoded = s.encode('utf-8', errors)
-    decoded = encoded.decode('utf-8')
-    log.debug('Decoded: %r', decoded)
-    replaced = decoded.replace('\ufffd', '?')
-    escaped = GLib.markup_escape_text(replaced)
-    log.debug('escaped: %r', escaped)
-    return escaped
diff --git a/tests/test_gpgmeh.py b/tests/test_gpgmeh.py
index c05a811..01ed433 100644
--- a/tests/test_gpgmeh.py
+++ b/tests/test_gpgmeh.py
@@ -437,12 +437,15 @@ class TestSignAndEncrypt:
 
         sigs_before = [s for l in get_signatures_for_uids_on_key(sender,
                                     key).values() for s in l]
+        # FIXME: Refactor this a little bit.
+        # We have duplication of code with the other test below.
         for uid, uid_enc in zip(uids_before, uid_encrypted):
+            uid_enc_str = uid_enc[0].uid
             # The test doesn't work so well, because comments
             # are not rendered :-/
             # assert_equals(uid, uid_enc[0])
-            assert_in(uid.name, uid_enc[0].uid)
-            assert_in(uid.email, uid_enc[0].uid)
+            assert_in(uid.name, uid_enc_str)
+            assert_in(uid.email, uid_enc_str)
             ciphertext = uid_enc[1]
             log.debug("Decrypting %r", ciphertext)
             plaintext, result, vrfy = sender.decrypt(ciphertext)
@@ -497,9 +500,10 @@ class TestSignAndEncrypt:
         sigs_before = [s for l in get_signatures_for_uids_on_key(sender,
                                     sender_key).values() for s in l]
         for uid, uid_enc in zip(uids_before, uid_encrypted):
+            uid_enc_str = uid_enc[0].uid
             # FIXME: assert_equals(uid, uid_enc[0])
-            assert_in(uid.name, uid_enc[0].uid)
-            assert_in(uid.email, uid_enc[0].uid)
+            assert_in(uid.name, uid_enc_str)
+            assert_in(uid.email, uid_enc_str)
             ciphertext = uid_enc[1]
             log.debug("Decrypting %r", ciphertext)
             plaintext, result, vrfy = sender.decrypt(ciphertext)
diff --git a/tests/test_uids.py b/tests/test_uids.py
index 7c6c961..506f188 100644
--- a/tests/test_uids.py
+++ b/tests/test_uids.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
-"""We want our customs UID wrapper to return raw bytes for the raw UID
-but decoded strings for email, name, and comment component.
+"""We want our custom UID wrapper to return encodable and displayable
+strings, rather than raw bytes, for the raw UID, email, name,
+and comment component.
 """
 from __future__ import unicode_literals
 
@@ -25,18 +26,18 @@ class FakeMKSUID:
 def test_mks_utf8_uid():
     "The normal case"
     uid = FakeMKSUID()
-    uid.uid = b'foo bar <foo bar com>'
+    uid.uid = 'foo bar <foo bar com>'
     u = gpgkey.UID.from_monkeysign(uid)
     assert_string(u.name)
     assert_string(u.comment)
     assert_string(u.email)
-    assert_bytes(u.uid)
+    assert_string(u.uid)
 
 def test_mks_latin_uid():
     uid = FakeMKSUID()
-    uid.uid = b"fo\xf6\x65\xe9\x62a"
+    uid.uid = 'fo\udcf6e\udce9ba <foo@bma.d>'
     u = gpgkey.UID.from_monkeysign(uid)
     assert_string(u.name)
     assert_string(u.comment)
     assert_string(u.email)
-    assert_bytes(u.uid)
+    assert_string(u.uid)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]