Fix unicode-aware truncation of EAS event attrs

bengotow · bengotow · commit 6c3ea6367618 · 2016-09-01T11:29:58.000-07:00
Summary: The EAS event class was trying to be unicode-aware, but only ran unicode-safe truncation when the string was /already/ unicode-encoded. Creating an EAS event with a Location: `aกกกกก (+250 more)` caused it to break the non-unicode input in an invalid way. I changed the convenience method to support unicode and non-unicode input, and consolidated it with other unicode-aware trimming code elsewhere. It should now be used everywhere we truncate string input. Fixes https://sentry.nylas.com/sentry/sync-prod/group/52270/ and possibly others. Test Plan: Updated tests Reviewers: drew, spang Reviewed By: spang Subscribers: khamidou, jenkins Differential Revision: https://phab.nylas.com/D3247
diff --git a/inbox/models/category.py b/inbox/models/category.py
@@ -14,22 +14,14 @@
 from inbox.models.constants import MAX_INDEXABLE_LENGTH
 from nylas.logging import get_logger
 from inbox.util.misc import fs_folder_path
+from inbox.util.encoding import unicode_safe_truncate
 log = get_logger()
 
 EPOCH = datetime.utcfromtimestamp(0)
 
 
 def sanitize_name(name):
-    # g_label may not have unicode type (in particular for a numeric
-    # label, e.g. '42'), so coerce to unicode.
-    if not isinstance(name, unicode):
-        name = str(name).decode('utf-8', 'ignore')
-
-    # Remove trailing whitespace, truncate (due to MySQL limitations).
-    name = name.rstrip()
-    if len(name) > MAX_INDEXABLE_LENGTH:
-        name = name[:MAX_INDEXABLE_LENGTH]
-    return name
+    return unicode_safe_truncate(name, MAX_INDEXABLE_LENGTH)
 
 
 class CategoryNameString(StringWithTransform):
diff --git a/inbox/models/contact.py b/inbox/models/contact.py
@@ -8,6 +8,7 @@
 from inbox.models.base import MailSyncBase
 from inbox.models.message import Message
 from inbox.models.namespace import Namespace
+from inbox.util.encoding import unicode_safe_truncate
 
 
 class Contact(MailSyncBase, HasRevisions, HasPublicID, HasEmailAddress,
@@ -45,7 +46,9 @@ class Contact(MailSyncBase, HasRevisions, HasPublicID, HasEmailAddress,
 
     @validates('raw_data')
     def validate_length(self, key, value):
-        return value if value is None else value[:MAX_TEXT_LENGTH]
+        if value is None:
+            return None
+        return unicode_safe_truncate(value, MAX_TEXT_LENGTH)
 
     @property
     def versioned_relationships(self):
diff --git a/inbox/models/event.py b/inbox/models/event.py
@@ -18,23 +18,26 @@
 from inbox.models.message import Message
 from inbox.models.when import Time, TimeSpan, Date, DateSpan
 from email.utils import parseaddr
-from inbox.util.encoding import unicode_truncate
+from inbox.util.encoding import unicode_safe_truncate
 
 from nylas.logging import get_logger
 log = get_logger()
 
+EVENT_STATUSES = ["confirmed", "tentative", "cancelled"]
+
 TITLE_MAX_LEN = 1024
 LOCATION_MAX_LEN = 255
 RECURRENCE_MAX_LEN = 255
 REMINDER_MAX_LEN = 255
 OWNER_MAX_LEN = 1024
-_LENGTHS = {'location': LOCATION_MAX_LEN,
-            'owner': OWNER_MAX_LEN,
-            'recurrence': MAX_TEXT_LENGTH,
-            'reminders': REMINDER_MAX_LEN,
-            'title': TITLE_MAX_LEN,
-            'raw_data': MAX_TEXT_LENGTH}
-EVENT_STATUSES = ["confirmed", "tentative", "cancelled"]
+MAX_LENS = {
+    'location': LOCATION_MAX_LEN,
+    'owner': OWNER_MAX_LEN,
+    'recurrence': MAX_TEXT_LENGTH,
+    'reminders': REMINDER_MAX_LEN,
+    'title': TITLE_MAX_LEN,
+    'raw_data': MAX_TEXT_LENGTH
+}
 
 
 def time_parse(x):
@@ -144,11 +147,9 @@ class Event(MailSyncBase, HasRevisions, HasPublicID, UpdatedAtMixin,
     @validates('reminders', 'recurrence', 'owner', 'location', 'title',
                'raw_data')
     def validate_length(self, key, value):
-        max_len = _LENGTHS[key]
-        if isinstance(value, unicode):
-            return value if value is None else unicode_truncate(value, max_len)
-        else:
-            return value if value is None else value[:max_len]
+        if value is None:
+            return None
+        return unicode_safe_truncate(value, MAX_LENS[key])
 
     @property
     def when(self):
diff --git a/inbox/models/message.py b/inbox/models/message.py
@@ -28,6 +28,9 @@
 from inbox.models.category import Category
 
 from inbox.sqlalchemy_ext.util import MAX_MYSQL_INTEGER
+from inbox.util.encoding import unicode_safe_truncate
+
+SNIPPET_LENGTH = 191
 
 
 def _trim_filename(s, namespace_id, max_len=255):
@@ -127,7 +130,6 @@ def categories_changes(self, has_changes):
 
     _compacted_body = Column(LONGBLOB, nullable=True)
     snippet = Column(String(191), nullable=False)
-    SNIPPET_LENGTH = 191
 
     # this might be a mail-parsing bug, or just a message from a bad client
     decode_error = Column(Boolean, server_default=false(), nullable=False,
@@ -201,8 +203,7 @@ def sanitize_subject(self, key, value):
         # contains null bytes.
         if value is None:
             return
-        if len(value) > 255:
-            value = value[:255]
+        value = unicode_safe_truncate(value, 255)
         value = value.replace('\0', '')
         return value
 
@@ -455,7 +456,7 @@ def calculate_html_snippet(self, text):
         return self.calculate_plaintext_snippet(text)
 
     def calculate_plaintext_snippet(self, text):
-        return ' '.join(text.split())[:self.SNIPPET_LENGTH]
+        return unicode_safe_truncate(' '.join(text.split()), SNIPPET_LENGTH)
 
     @property
     def body(self):
diff --git a/inbox/models/mixins.py b/inbox/models/mixins.py
@@ -7,6 +7,7 @@
 from inbox.sqlalchemy_ext.util import Base36UID, generate_public_id, ABCMixin
 from inbox.models.constants import MAX_INDEXABLE_LENGTH
 from inbox.util.addr import canonicalize_address
+from inbox.util.encoding import unicode_safe_truncate
 
 
 class HasRevisions(ABCMixin):
@@ -126,11 +127,11 @@ def email_address(cls):
 
     @email_address.setter
     def email_address(self, value):
+        # Silently truncate if necessary. In practice, this may be too
+        # long if somebody put a super-long email into their contacts by
+        # mistake or something.
         if value is not None:
-            # Silently truncate if necessary. In practice, this may be too
-            # long if somebody put a super-long email into their contacts by
-            # mistake or something.
-            value = value[:MAX_INDEXABLE_LENGTH]
+            value = unicode_safe_truncate(value, MAX_INDEXABLE_LENGTH)
         self._raw_address = value
         self._canonicalized_address = canonicalize_address(value)
 
diff --git a/inbox/util/encoding.py b/inbox/util/encoding.py
@@ -18,14 +18,11 @@ def base36decode(number):
     return int(number, 36)
 
 
-# From: http://stackoverflow.com/a/1820949
-# Quick and dirty hack to truncate a unicode string
-# on a codepoint boundary.
-def unicode_truncate(s, new_length):
-    assert isinstance(s, unicode)
-    encoded = s.encode('utf-8')[:new_length]
-
-    # This assumes that we've been able to decode the string
-    # to unicode in the first place, so any errors would be
-    # caused by the truncation.
-    return encoded.decode('utf-8', 'ignore')
+def unicode_safe_truncate(s, max_length):
+    """
+    Implements unicode-safe truncation and trims whitespace for a given input
+    string, number or unicode string.
+    """
+    if not isinstance(s, unicode):
+        s = str(s).decode('utf-8', 'ignore')
+    return s.rstrip()[:max_length]
diff --git a/tests/events/test_util.py b/tests/events/test_util.py
@@ -42,8 +42,8 @@ def test_removed_participants():
 
 
 def test_unicode_event_truncation(db, default_account):
-    emoji_str = u"".join([u"😁" for i in range(256)])
-    title = "".join(["a" for i in range(2048)])
+    emoji_str = u"".join([u"😁" for i in range(300)])
+    title = "".join(["a" for i in range(2000)])
 
     e = Event(raw_data='',
               busy=True,
@@ -61,8 +61,8 @@ def test_unicode_event_truncation(db, default_account):
     db.session.add(e)
     db.session.commit()
 
-    # Original location had 256 emoji chars. Emoji in utf-8 are
-    # 4 bytes in length. The field is at most 255 chars, so
-    # 255 / 4 = 63.
-    assert len(e.location) == 63
+    # Both location and title should be properly truncated to their max lengths.
+    # It's ok to have N unicode characters in a VARCHAR(N) field because
+    # the column is utf8-encoded.
+    assert len(e.location) == 255
     assert len(e.title) == 1024