-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_tagger.py
163 lines (121 loc) · 4.33 KB
/
text_tagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#-*- coding: utf8 -*-
"""
Tagger for Spanish text and all the functions needed to work with it.
More info:
http://stackoverflow.com/questions/14732465/nltk-tagging-spanish-words-using-a-corpus
https://github.com/alvations/spaghetti-tagger
"""
from nltk import UnigramTagger, BigramTagger, TrigramTagger
import utilities
N_GRAM_NAMES = ["unigram", "bigram", "trigram"]
NOMWE_TEXT = "nomwe"
TAGGER_EXTENSION = '.tagger'
TAGGER_PATH = "Taggers/"
def train_tagger(corpus_name, corpus):
    """
    Train a unigram, a bigram and a trigram tagger and save each of them.

    Every tagger is built directly from ``corpus`` (no backoff tagger is
    set) and handed to ``utilities.save_pickle`` together with the name
    ``<corpus_name>_<n-gram name>``, TAGGER_EXTENSION and TAGGER_PATH.
    A progress line is printed after each tagger has been saved.

    Args:
        corpus_name: name of the corpus used to create the tagger; it is
            the prefix of the name every tagger is saved under
        corpus: corpus for creating the tagger (tagged sentences)
    """
    # (label printed to the user, tagger class), listed in the same order
    # as N_GRAM_NAMES so that zip() pairs each class with its name suffix.
    trainers = (
        ("UnigramTagger", UnigramTagger),
        ("BigramTagger", BigramTagger),
        ("TrigramTagger", TrigramTagger),
    )
    for ngram_name, (label, tagger_class) in zip(N_GRAM_NAMES, trainers):
        trained = tagger_class(corpus)
        utilities.save_pickle(trained, corpus_name + '_' + ngram_name,
                              TAGGER_EXTENSION, TAGGER_PATH)
        # Single-argument print: same output on Python 2, valid on Python 3.
        print("%s trained with %s" % (label, corpus_name))
# Function to unchunk corpus
def unchunk(corpus):
    """
    Split every Multi-Word Expression of a corpus into its single words.

    Only the first element of each token is used (the word); underscores in
    it are turned into spaces and the sentence is then re-split on
    whitespace, so the result holds plain word lists without tags.

    Args:
        corpus: iterable of sentences, each one a sequence of tokens whose
            first element is the word

    Returns:
        A list with one list of words per sentence.
    """
    return [
        " ".join(token[0].replace("_", " ") for token in sentence).split()
        for sentence in corpus
    ]
class Tagger(object):
    """
    Group of n-gram taggers trained with one corpus and stored on disk.

    Attributes:
        mwe: Indicates if we want to recognize Multi-Word Expressions as one token
        uni: UnigramTagger
        bi: BigramTagger
        tri: TrigramTagger
    """

    def __init__(self, name_tagger, corpus, mwe=True):
        """
        When initialized it will load all the taggers. They are:
            * UnigramTagger
            * BigramTagger
            * TrigramTagger
        If loading raises IOError it will create them, and save them.

        If Multi-Word Expressions are not allowed it is necessary to split
        them; the split sentences are then tagged again with the unigram
        tagger of the Multi-Word Expressions version, and that output is
        what the new taggers are trained with.

        Args:
            name_tagger: root part of the name of the tagger, like cess_esp
            corpus: corpus that will train the tagger; it must provide
                tagged_sents()
            mwe: It can allow Multi-Word Expressions
        """
        self.mwe = mwe

        # Keep the unsuffixed name: training the no-MWE taggers needs the
        # MWE tagger, which is stored under the plain root name.
        base_name = name_tagger
        if not mwe:
            name_tagger += '_' + NOMWE_TEXT

        # Set the names of the taggers like:
        #   cess_es_unigram.tagger, cess_es_bigram.tagger
        # or cess_es_nomwe_unigram.tagger, cess_es_nomwe_bigram.tagger
        complete_names = [name_tagger + '_' + x for x in N_GRAM_NAMES]

        # Try to load the taggers. Tagging a sample token checks that each
        # loaded object can actually be used as a tagger.
        try:
            for x in complete_names:
                utilities.load_pickle(x, TAGGER_EXTENSION, TAGGER_PATH).tag(['hola'])
        # If that does not work, create them. Only IOError triggers the
        # training; any other exception propagates to the caller.
        except IOError:
            print("\n*** First-time use of %s  taggers ***" % (name_tagger,))
            print("Training taggers ...")
            timer = utilities.Timer()

            if self.mwe:
                train_tagger(name_tagger, corpus.tagged_sents())
            else:
                # Without multiwords we need to split them
                cess_sents = unchunk(corpus.tagged_sents())
                # We need the MWE tagger to train. It is requested with the
                # unsuffixed name so the regular taggers are loaded (or
                # trained first if they do not exist yet).
                aux_tagger = Tagger(base_name, corpus, mwe=True)
                tagged_cess_nomwe = aux_tagger.uni.tag_sents(cess_sents)
                # name_tagger already ends with the no-MWE suffix, so the
                # files are saved under the names loaded below.
                train_tagger(name_tagger, tagged_cess_nomwe)

            print("\nAll taggers trained in %s seconds" % (timer.get_time(),))

        # Load tagger
        self.uni = utilities.load_pickle(complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
        self.bi = utilities.load_pickle(complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
        self.tri = utilities.load_pickle(complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)

        ### TODO: Set backoffs for every tagger ###
def cess_esp(mwe=True):
    """
    Build the tagger that is created with the CESS_ESP corpus.

    Args:
        mwe: It can allow Multi-Word Expressions

    Returns:
        A Tagger created with the root name 'cess_esp'.
    """
    # The corpus reader is imported here rather than at module level.
    from nltk.corpus import cess_esp as esp_corpus
    esp_tagger = Tagger('cess_esp', esp_corpus, mwe)
    return esp_tagger
def cess_cat(mwe=True):
    """
    Build the tagger that is created with the CESS_CAT corpus.

    Args:
        mwe: It can allow Multi-Word Expressions

    Returns:
        A Tagger created with the root name 'cess_cat'.
    """
    # The corpus reader is imported here rather than at module level.
    from nltk.corpus import cess_cat as cat_corpus
    cat_tagger = Tagger('cess_cat', cat_corpus, mwe)
    return cat_tagger
# Small demo, executed only when the file is run as a script (not imported)
if __name__ == '__main__':
    demo_tagger = cess_esp()
    sample_tokens = 'A la patata le gusta bailar ska .'.split()
    # Tag the sample sentence with the unigram tagger and show the result.
    print(demo_tagger.uni.tag(sample_tokens))