Skip to content

Commit 6c3ea63

Browse files
committedSep 1, 2016
Fix unicode-aware truncation of EAS event attrs
Summary: The EAS event class was trying to be unicode-aware, but only ran unicode-safe truncation when the string was /already/ unicode-encoded. Creating an EAS event with a Location: `aกกกกก (+250 more)` caused it to break the non-unicode input in an invalid way. I changed the conveinece method to support unicode and non-unicode input, and consolidated it with other unicode-aware trimming code elsewhere. It should now be used everywhere we truncate string input. Fixes https://sentry.nylas.com/sentry/sync-prod/group/52270/ and possibly others. Test Plan: Updated tests Reviewers: drew, spang Reviewed By: spang Subscribers: khamidou, jenkins Differential Revision: https://phab.nylas.com/D3247
1 parent c6e5470 commit 6c3ea63

File tree

7 files changed

+44
-49
lines changed

7 files changed

+44
-49
lines changed
 

Diff for: ‎inbox/models/category.py

+2-10
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,14 @@
1414
from inbox.models.constants import MAX_INDEXABLE_LENGTH
1515
from nylas.logging import get_logger
1616
from inbox.util.misc import fs_folder_path
17+
from inbox.util.encoding import unicode_safe_truncate
1718
log = get_logger()
1819

1920
EPOCH = datetime.utcfromtimestamp(0)
2021

2122

2223
def sanitize_name(name):
23-
# g_label may not have unicode type (in particular for a numeric
24-
# label, e.g. '42'), so coerce to unicode.
25-
if not isinstance(name, unicode):
26-
name = str(name).decode('utf-8', 'ignore')
27-
28-
# Remove trailing whitespace, truncate (due to MySQL limitations).
29-
name = name.rstrip()
30-
if len(name) > MAX_INDEXABLE_LENGTH:
31-
name = name[:MAX_INDEXABLE_LENGTH]
32-
return name
24+
return unicode_safe_truncate(name, MAX_INDEXABLE_LENGTH)
3325

3426

3527
class CategoryNameString(StringWithTransform):

Diff for: ‎inbox/models/contact.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from inbox.models.base import MailSyncBase
99
from inbox.models.message import Message
1010
from inbox.models.namespace import Namespace
11+
from inbox.util.encoding import unicode_safe_truncate
1112

1213

1314
class Contact(MailSyncBase, HasRevisions, HasPublicID, HasEmailAddress,
@@ -45,7 +46,9 @@ class Contact(MailSyncBase, HasRevisions, HasPublicID, HasEmailAddress,
4546

4647
@validates('raw_data')
4748
def validate_length(self, key, value):
48-
return value if value is None else value[:MAX_TEXT_LENGTH]
49+
if value is None:
50+
return None
51+
return unicode_safe_truncate(value, MAX_TEXT_LENGTH)
4952

5053
@property
5154
def versioned_relationships(self):

Diff for: ‎inbox/models/event.py

+14-13
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,26 @@
1818
from inbox.models.message import Message
1919
from inbox.models.when import Time, TimeSpan, Date, DateSpan
2020
from email.utils import parseaddr
21-
from inbox.util.encoding import unicode_truncate
21+
from inbox.util.encoding import unicode_safe_truncate
2222

2323
from nylas.logging import get_logger
2424
log = get_logger()
2525

26+
EVENT_STATUSES = ["confirmed", "tentative", "cancelled"]
27+
2628
TITLE_MAX_LEN = 1024
2729
LOCATION_MAX_LEN = 255
2830
RECURRENCE_MAX_LEN = 255
2931
REMINDER_MAX_LEN = 255
3032
OWNER_MAX_LEN = 1024
31-
_LENGTHS = {'location': LOCATION_MAX_LEN,
32-
'owner': OWNER_MAX_LEN,
33-
'recurrence': MAX_TEXT_LENGTH,
34-
'reminders': REMINDER_MAX_LEN,
35-
'title': TITLE_MAX_LEN,
36-
'raw_data': MAX_TEXT_LENGTH}
37-
EVENT_STATUSES = ["confirmed", "tentative", "cancelled"]
33+
MAX_LENS = {
34+
'location': LOCATION_MAX_LEN,
35+
'owner': OWNER_MAX_LEN,
36+
'recurrence': MAX_TEXT_LENGTH,
37+
'reminders': REMINDER_MAX_LEN,
38+
'title': TITLE_MAX_LEN,
39+
'raw_data': MAX_TEXT_LENGTH
40+
}
3841

3942

4043
def time_parse(x):
@@ -144,11 +147,9 @@ class Event(MailSyncBase, HasRevisions, HasPublicID, UpdatedAtMixin,
144147
@validates('reminders', 'recurrence', 'owner', 'location', 'title',
145148
'raw_data')
146149
def validate_length(self, key, value):
147-
max_len = _LENGTHS[key]
148-
if isinstance(value, unicode):
149-
return value if value is None else unicode_truncate(value, max_len)
150-
else:
151-
return value if value is None else value[:max_len]
150+
if value is None:
151+
return None
152+
return unicode_safe_truncate(value, MAX_LENS[key])
152153

153154
@property
154155
def when(self):

Diff for: ‎inbox/models/message.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
from inbox.models.category import Category
2929

3030
from inbox.sqlalchemy_ext.util import MAX_MYSQL_INTEGER
31+
from inbox.util.encoding import unicode_safe_truncate
32+
33+
SNIPPET_LENGTH = 191
3134

3235

3336
def _trim_filename(s, namespace_id, max_len=255):
@@ -127,7 +130,6 @@ def categories_changes(self, has_changes):
127130

128131
_compacted_body = Column(LONGBLOB, nullable=True)
129132
snippet = Column(String(191), nullable=False)
130-
SNIPPET_LENGTH = 191
131133

132134
# this might be a mail-parsing bug, or just a message from a bad client
133135
decode_error = Column(Boolean, server_default=false(), nullable=False,
@@ -201,8 +203,7 @@ def sanitize_subject(self, key, value):
201203
# contains null bytes.
202204
if value is None:
203205
return
204-
if len(value) > 255:
205-
value = value[:255]
206+
value = unicode_safe_truncate(value, 255)
206207
value = value.replace('\0', '')
207208
return value
208209

@@ -455,7 +456,7 @@ def calculate_html_snippet(self, text):
455456
return self.calculate_plaintext_snippet(text)
456457

457458
def calculate_plaintext_snippet(self, text):
458-
return ' '.join(text.split())[:self.SNIPPET_LENGTH]
459+
return unicode_safe_truncate(' '.join(text.split()), SNIPPET_LENGTH)
459460

460461
@property
461462
def body(self):

Diff for: ‎inbox/models/mixins.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from inbox.sqlalchemy_ext.util import Base36UID, generate_public_id, ABCMixin
88
from inbox.models.constants import MAX_INDEXABLE_LENGTH
99
from inbox.util.addr import canonicalize_address
10+
from inbox.util.encoding import unicode_safe_truncate
1011

1112

1213
class HasRevisions(ABCMixin):
@@ -126,11 +127,11 @@ def email_address(cls):
126127

127128
@email_address.setter
128129
def email_address(self, value):
130+
# Silently truncate if necessary. In practice, this may be too
131+
# long if somebody put a super-long email into their contacts by
132+
# mistake or something.
129133
if value is not None:
130-
# Silently truncate if necessary. In practice, this may be too
131-
# long if somebody put a super-long email into their contacts by
132-
# mistake or something.
133-
value = value[:MAX_INDEXABLE_LENGTH]
134+
value = unicode_safe_truncate(value, MAX_INDEXABLE_LENGTH)
134135
self._raw_address = value
135136
self._canonicalized_address = canonicalize_address(value)
136137

Diff for: ‎inbox/util/encoding.py

+8-11
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,11 @@ def base36decode(number):
1818
return int(number, 36)
1919

2020

21-
# From: http://stackoverflow.com/a/1820949
22-
# Quick and dirty hack to truncate a unicode string
23-
# on a codepoint boundary.
24-
def unicode_truncate(s, new_length):
25-
assert isinstance(s, unicode)
26-
encoded = s.encode('utf-8')[:new_length]
27-
28-
# This assumes that we've been able to decode the string
29-
# to unicode in the first place, so any errors would be
30-
# caused by the truncation.
31-
return encoded.decode('utf-8', 'ignore')
21+
def unicode_safe_truncate(s, max_length):
22+
"""
23+
Implements unicode-safe truncation and trims whitespace for a given input
24+
string, number or unicode string.
25+
"""
26+
if not isinstance(s, unicode):
27+
s = str(s).decode('utf-8', 'ignore')
28+
return s.rstrip()[:max_length]

Diff for: ‎tests/events/test_util.py

+6-6
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ def test_removed_participants():
4242

4343

4444
def test_unicode_event_truncation(db, default_account):
45-
emoji_str = u"".join([u"😁" for i in range(256)])
46-
title = "".join(["a" for i in range(2048)])
45+
emoji_str = u"".join([u"😁" for i in range(300)])
46+
title = "".join(["a" for i in range(2000)])
4747

4848
e = Event(raw_data='',
4949
busy=True,
@@ -61,8 +61,8 @@ def test_unicode_event_truncation(db, default_account):
6161
db.session.add(e)
6262
db.session.commit()
6363

64-
# Original location had 256 emoji chars. Emoji in utf-8 are
65-
# 4 bytes in length. The field is at most 255 chars, so
66-
# 255 / 4 = 63.
67-
assert len(e.location) == 63
64+
# Both location and title should be properly truncated to their max lengths.
65+
# It's ok to have N unicode characters in a VARCHAR(N) field because
66+
# the column is uft8-encoded.
67+
assert len(e.location) == 255
6868
assert len(e.title) == 1024

0 commit comments

Comments
 (0)
Please sign in to comment.