Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migrate to sacremoses and add toktok tokenizer #361

Merged
merged 2 commits into from
Aug 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requests
# Optional NLP tools
nltk
spacy
sacremoses
git+git://github.com/jekbradbury/revtok.git

# Documentation
Expand Down
9 changes: 8 additions & 1 deletion test/data/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_get_tokenizer(self):
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

# Test Moses option. Test strings taken from NLTK doctests.
# Test Moses option.
# Note that internally, MosesTokenizer converts to unicode if applicable
moses_tokenizer = data.get_tokenizer("moses")
assert moses_tokenizer(test_str) == [
Expand All @@ -26,6 +26,13 @@ def test_get_tokenizer(self):
# Nonbreaking prefixes should tokenize the final period.
assert moses_tokenizer(six.text_type("abc def.")) == ["abc", "def", "."]

# Test Toktok option. Test strings taken from NLTK doctests.
# Note that internally, ToktokTokenizer converts to unicode if applicable
toktok_tokenizer = data.get_tokenizer("toktok")
assert toktok_tokenizer(test_str) == [
"A", "string", ",", "particularly", "one", "with", "slightly",
"complex", "punctuation", "."]

# Test that errors are raised for invalid input arguments.
with self.assertRaises(ValueError):
data.get_tokenizer(1)
Expand Down
18 changes: 12 additions & 6 deletions torchtext/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,22 @@ def get_tokenizer(tokenizer):
raise
elif tokenizer == "moses":
try:
from nltk.tokenize.moses import MosesTokenizer
from sacremoses import MosesTokenizer
moses_tokenizer = MosesTokenizer()
return moses_tokenizer.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at http://nltk.org for more information.")
print("Please install SacreMoses. "
"See the docs at https://github.com/alvations/sacremoses "
"for more information.")
raise
except LookupError:
print("Please install the necessary NLTK corpora. "
"See the docs at http://nltk.org for more information.")
elif tokenizer == "toktok":
try:
from nltk.tokenize.toktok import ToktokTokenizer
toktok = ToktokTokenizer()
return toktok.tokenize
except ImportError:
print("Please install NLTK. "
"See the docs at https://nltk.org for more information.")
raise
elif tokenizer == 'revtok':
try:
Expand Down