@@ -6,7 +6,7 @@
 import ivory.core.tokenize.TokenizerFactory;
 import ivory.sqe.retrieval.Constants;
 import ivory.sqe.retrieval.StructuredQuery;
-
+import java.util.regex.Pattern;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -19,17 +19,14 @@
 import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.Set;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-
 import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;
 import com.google.gson.JsonPrimitive;
-
 import edu.umd.cloud9.io.map.HMapSFW;
 import edu.umd.cloud9.io.pair.PairOfFloatInt;
 import edu.umd.cloud9.io.pair.PairOfStrings;
@@ -52,7 +49,8 @@ public class ProbabilisticStructuredQueryGenerator implements QueryGenerator {
   private float lexProbThreshold, cumProbThreshold;
   private boolean isDocStemmed, isStemming, H6, bigramSegment;
   private RetrievalEnvironment env;
-  private String queryLang, docLang;
+  private String queryLang, docLang, translateOnly;
+  private Pattern indriPuncPattern = Pattern.compile(".*\\p{Punct}.*");
 
   public ProbabilisticStructuredQueryGenerator() throws IOException {
     super();
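The new indriPuncPattern matches any candidate translation that contains at least one punctuation character; it is applied later in getTranslations to drop such terms when the query is emitted for Indri, presumably because punctuation-bearing tokens can trip up Indri's query parser. A self-contained illustration of how the pattern classifies terms (the class name and sample terms below are made up for demonstration):

import java.util.regex.Pattern;

// Illustrative only: exercises the same regex the patch adds.
public class IndriPuncFilterDemo {
  // Matches any term that contains at least one \p{Punct} character.
  private static final Pattern PUNCT = Pattern.compile(".*\\p{Punct}.*");

  public static void main(String[] args) {
    for (String term : new String[] {"maison", "d'accord", "u.s.", "1999"}) {
      boolean dropped = PUNCT.matcher(term).matches();
      System.out.println(term + " -> " + (dropped ? "dropped" : "kept"));
    }
  }
}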
@@ -73,6 +71,9 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     LOG.info("Stemmed stopword list file in query-language:" + conf.get(Constants.StemmedStopwordListQ));
     LOG.info("Stemmed stopword list file in doc-language:" + conf.get(Constants.StemmedStopwordListD));
 
+    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
+    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
+
     isDocStemmed = conf.getBoolean(Constants.IsDocStemmed, false);
     isStemming = conf.getBoolean(Constants.IsStemming, false);
     if (isStemming) {
@@ -81,10 +82,6 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
       defaultTokenizer = queryLangTokenizer;
     }
 
-
-    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
-    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
-
     if (isDocStemmed) {
       docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang, conf.get(Constants.DocTokenizerData), true, null, conf.get(Constants.StemmedStopwordListD), null);
     } else {
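Note: this relocation matters for correctness, not just tidiness. In the old ordering, `defaultTokenizer = queryLangTokenizer;` (old line 81) executed before queryLangTokenizer was created at old lines 85-86, so defaultTokenizer appears to have been left null; creating both query-language tokenizers at the top of init() (the +71,9 hunk above) restores the intended order.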
@@ -103,9 +100,11 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     }
     LOG.info("H6 = " + H6);
 
+    translateOnly = conf.get(Constants.TranslateOnly);
+
     // initialize environment to access index
     // skip this if we only want to translate query (i.e., no retrieval)
-    if (conf.get(Constants.TranslateOnly) == null) {
+    if (translateOnly == null) {
       try {
         env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
         env.initialize(true);
@@ -216,7 +215,7 @@ protected HMapSFW getTranslations(String query, String token, Set<PairOfStrings>
 
         // LOG.info("Pr("+eTerm+"|"+token+")="+probEF);
 
-        if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token, eTerm)))) {
+        if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && !("indri".equals(translateOnly) && indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token, eTerm)))) {
           // assuming our bilingual dictionary is learned from normally segmented text, but we want to use a bigram tokenizer for CLIR purposes,
           // then we need to convert the translations of each source token into a sequence of bigrams
           // we can distribute the translation probability equally to each bigram
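The trailing comments describe the bigram-segmentation case: each translation of a source token is re-segmented into character bigrams and its probability mass P(e|f) is divided evenly among them. A minimal sketch of that bookkeeping under the stated equal-split assumption (class and method names below are hypothetical, not Ivory API):

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch of splitting one translation into character bigrams
// and spreading its translation probability evenly across them.
public class BigramSpreadDemo {
  static Map<String, Float> toBigramWeights(String eTerm, float probEF) {
    Map<String, Float> weights = new HashMap<>();
    int n = eTerm.length() - 1;      // number of character bigrams
    if (n < 1) {                     // too short to segment; keep as-is
      weights.put(eTerm, probEF);
      return weights;
    }
    float share = probEF / n;        // equal share per bigram
    for (int i = 0; i < n; i++) {
      // accumulate, in case the same bigram occurs more than once
      weights.merge(eTerm.substring(i, i + 2), share, Float::sum);
    }
    return weights;
  }

  public static void main(String[] args) {
    // e.g., a 4-character term with P(e|f)=0.6 yields three bigrams
    // weighted 0.2 each: {AB=0.2, BC=0.2, CD=0.2}
    System.out.println(toBigramWeights("ABCD", 0.6f));
  }
}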