@@ -6,7 +6,7 @@
 import ivory.core.tokenize.TokenizerFactory;
 import ivory.sqe.retrieval.Constants;
 import ivory.sqe.retrieval.StructuredQuery;
-
+import java.util.regex.Pattern;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -19,17 +19,14 @@
 import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.Set;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-
 import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;
 import com.google.gson.JsonPrimitive;
-
 import edu.umd.cloud9.io.map.HMapSFW;
 import edu.umd.cloud9.io.pair.PairOfFloatInt;
 import edu.umd.cloud9.io.pair.PairOfStrings;
@@ -52,7 +49,8 @@ public class ProbabilisticStructuredQueryGenerator implements QueryGenerator {
   private float lexProbThreshold, cumProbThreshold;
   private boolean isDocStemmed, isStemming, H6, bigramSegment;
   private RetrievalEnvironment env;
-  private String queryLang, docLang;
+  private String queryLang, docLang, translateOnly;
+  private Pattern indriPuncPattern = Pattern.compile(".*\\p{Punct}.*");
 
   public ProbabilisticStructuredQueryGenerator() throws IOException {
     super();
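The new indriPuncPattern matches any candidate translation that contains at least one punctuation character; it is applied later in getTranslations to drop such terms when the query is emitted for Indri, presumably because punctuation-bearing tokens can trip up Indri's query parser. A self-contained illustration of how the pattern classifies terms (the class name and sample terms below are made up for demonstration):

import java.util.regex.Pattern;

// Illustrative only: exercises the same regex the patch adds.
public class IndriPuncFilterDemo {
  // Matches any term that contains at least one \p{Punct} character.
  private static final Pattern PUNCT = Pattern.compile(".*\\p{Punct}.*");

  public static void main(String[] args) {
    for (String term : new String[] {"maison", "d'accord", "u.s.", "1999"}) {
      boolean dropped = PUNCT.matcher(term).matches();
      System.out.println(term + " -> " + (dropped ? "dropped" : "kept"));
    }
  }
}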
@@ -73,6 +71,9 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     LOG.info("Stemmed stopword list file in query-language:" + conf.get(Constants.StemmedStopwordListQ));
     LOG.info("Stemmed stopword list file in doc-language:" + conf.get(Constants.StemmedStopwordListD));
 
+    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
+    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
+
     isDocStemmed = conf.getBoolean(Constants.IsDocStemmed, false);
     isStemming = conf.getBoolean(Constants.IsStemming, false);
     if (isStemming) {
@@ -81,10 +82,6 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
       defaultTokenizer = queryLangTokenizer;
     }
 
-
-    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
-    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
-
     if (isDocStemmed) {
       docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang, conf.get(Constants.DocTokenizerData), true, null, conf.get(Constants.StemmedStopwordListD), null);
     } else {
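Note: this relocation matters for correctness, not just tidiness. In the old ordering, `defaultTokenizer = queryLangTokenizer;` (old line 81) executed before queryLangTokenizer was created at old lines 85-86, so defaultTokenizer appears to have been left null; creating both query-language tokenizers at the top of init() (the +71,9 hunk above) restores the intended order.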
@@ -103,9 +100,11 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     }
     LOG.info("H6 = " + H6);
 
+    translateOnly = conf.get(Constants.TranslateOnly);
+
     // initialize environment to access index
     // skip this if we only want to translate query (i.e., no retrieval)
-    if (conf.get(Constants.TranslateOnly) == null) {
+    if (translateOnly == null) {
       try {
         env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
         env.initialize(true);
@@ -216,7 +215,7 @@ protected HMapSFW getTranslations(String query, String token, Set<PairOfStrings>
 
         // LOG.info("Pr("+eTerm+"|"+token+")="+probEF);
 
-        if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token, eTerm)))) {
+        if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && !("indri".equals(translateOnly) && indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token, eTerm)))) {
           // assuming our bilingual dictionary is learned from normally segmented text, but we want to use a bigram tokenizer for CLIR purposes,
           // then we need to convert the translations of each source token into a sequence of bigrams
           // we can distribute the translation probability equally to each bigram
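The trailing comments describe the bigram-segmentation case: each translation of a source token is re-segmented into character bigrams and its probability mass P(e|f) is divided evenly among them. A minimal sketch of that bookkeeping under the stated equal-split assumption (class and method names below are hypothetical, not Ivory API):

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch of splitting one translation into character bigrams
// and spreading its translation probability evenly across them.
public class BigramSpreadDemo {
  static Map<String, Float> toBigramWeights(String eTerm, float probEF) {
    Map<String, Float> weights = new HashMap<>();
    int n = eTerm.length() - 1;      // number of character bigrams
    if (n < 1) {                     // too short to segment; keep as-is
      weights.put(eTerm, probEF);
      return weights;
    }
    float share = probEF / n;        // equal share per bigram
    for (int i = 0; i < n; i++) {
      // accumulate, in case the same bigram occurs more than once
      weights.merge(eTerm.substring(i, i + 2), share, Float::sum);
    }
    return weights;
  }

  public static void main(String[] args) {
    // e.g., a 4-character term with P(e|f)=0.6 yields three bigrams
    // weighted 0.2 each: {AB=0.2, BC=0.2, CD=0.2}
    System.out.println(toBigramWeights("ABCD", 0.6f));
  }
}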