This repository was archived by the owner on May 6, 2018. It is now read-only.

Commit 9dd046d (1 parent: 14a2b7c)

Added new En->Ar vocab files without stemming on Arabic; a few fixes for when SQE is used with Indri.

File tree: 5 files changed, +11 -90 lines
3 binary files changed (contents not shown).

src/java/main/ivory/core/util/CLIRUtils.java

-79
@@ -1393,83 +1393,4 @@ public static void main(String[] args) throws Exception {
 
     return;
   }
-
-//	public static void main(String args[]){
-//		if(args.length != 10 && args.length != 11 && args.length != 5 && args.length != 7){
-//			printUsage();
-//		}
-//
-//		// experimental
-//		if(args.length == 7){
-//			CLIRUtils.combineTTables(args[0], args[1], args[2], args[3], args[4], args[5], args[6]);
-//			return;
-//		}
-//
-//
-//		// Read parameters
-//		float probThreshold = 0.9f;
-//		int numTrans = 15;
-//		if(args.length >= 10){
-//			try {
-//				probThreshold = Float.parseFloat(args[9]);
-//			} catch (NumberFormatException e) {
-//				e.printStackTrace();
-//			}
-//		}
-//
-//		if(args.length >= 11){
-//			try {
-//				numTrans = Integer.parseInt(args[10]);
-//			} catch (NumberFormatException e) {
-//				e.printStackTrace();
-//			}
-//		}
-//
-//		try {
-//			Configuration conf = new Configuration();
-//			FileSystem localFS = FileSystem.getLocal(conf);
-//
-//			// query mode
-//			if (args.length == 5) {
-//				String srcTerm = args[0], trgTerm = args[1];
-//				Vocab srcVocab = HadoopAlign.loadVocab(new Path(args[2]), localFS);
-//				Vocab trgVocab = HadoopAlign.loadVocab(new Path(args[3]), localFS);
-//				TTable_monolithic_IFAs src2trgProbs = new TTable_monolithic_IFAs(localFS, new Path(args[4]), true);
-//				System.out.println("Source vocab size: " + srcVocab.size());
-//				System.out.println("Target vocab size: " + trgVocab.size());
-//
-//				if (trgTerm.equals("ALL")) {
-//					int[] trgs = src2trgProbs.get(srcVocab.get(srcTerm)).getTranslations(0.0f);
-//					System.out.println(srcTerm + " has "+ trgs.length + " translations:");
-//					for (int i = 0; i < trgs.length; i++) {
-//						trgTerm = trgVocab.get(trgs[i]);
-//						System.out.println("Prob("+trgTerm+"|"+srcTerm+")="+src2trgProbs.get(srcVocab.get(srcTerm), trgVocab.get(trgTerm)));
-//					}
-//				}else {
-//					System.out.println("Prob("+trgTerm+"|"+srcTerm+")="+src2trgProbs.get(srcVocab.get(srcTerm), trgVocab.get(trgTerm)));
-//				}
-//				return;
-//			}
-//
-//			// create mode
-//			String lex_f2e = args[0];
-//			String lex_e2f = args[1];
-//			String type = args[2];
-//			logger.info("Type of input:" + type);
-//			if(type.equals("giza")){
-//				CLIRUtils.createTTableFromGIZA(lex_f2e, args[3], args[4], args[5], probThreshold, numTrans, localFS);
-//				CLIRUtils.createTTableFromGIZA(lex_e2f, args[6], args[7], args[8], probThreshold, numTrans, localFS);
-//			}else if(type.equals("berkeley")){
-//				CLIRUtils.createTTableFromBerkeleyAligner(lex_f2e, args[3], args[4], args[5], probThreshold, numTrans, localFS);
-//				CLIRUtils.createTTableFromBerkeleyAligner(lex_e2f, args[6], args[7], args[8], probThreshold, numTrans, localFS);
-//			}else if(type.equals("hooka")){
-//				CLIRUtils.createTTableFromHooka(lex_f2e, args[3], args[4], args[5], probThreshold, numTrans, localFS);
-//				CLIRUtils.createTTableFromHooka(lex_e2f, args[6], args[7], args[8], probThreshold, numTrans, localFS);
-//			}else{
-//				printUsage();
-//			}
-//		} catch (IOException e) {
-//			e.printStackTrace();
-//		}
-//	}
 }
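
Note: the deleted block was a commented-out, older copy of main() with a "query mode" for inspecting translation probabilities and a "create mode" for building translation tables; the live main() above it already covers this functionality. For reference, the following is a minimal sketch of the query-mode lookup the deleted comments performed, using the same Vocab, HadoopAlign, and TTable_monolithic_IFAs calls that appear in the deleted lines; imports are omitted, and the paths and the query term are placeholders, not files shipped with this commit.

// Sketch only: same calls as in the deleted comment block; paths and term are hypothetical.
Configuration conf = new Configuration();
FileSystem localFS = FileSystem.getLocal(conf);
Vocab srcVocab = HadoopAlign.loadVocab(new Path("vocab.src"), localFS);
Vocab trgVocab = HadoopAlign.loadVocab(new Path("vocab.trg"), localFS);
TTable_monolithic_IFAs src2trg = new TTable_monolithic_IFAs(localFS, new Path("ttable.src-trg"), true);

String srcTerm = "someSourceTerm";                        // hypothetical query term
int src = srcVocab.get(srcTerm);
for (int trg : src2trg.get(src).getTranslations(0.0f)) {  // every translation with prob > 0
  String trgTerm = trgVocab.get(trg);
  System.out.println("Prob(" + trgTerm + "|" + srcTerm + ") = " + src2trg.get(src, trg));
}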

src/java/main/ivory/sqe/querygenerator/ProbabilisticStructuredQueryGenerator.java

+11 -11
@@ -6,7 +6,7 @@
 import ivory.core.tokenize.TokenizerFactory;
 import ivory.sqe.retrieval.Constants;
 import ivory.sqe.retrieval.StructuredQuery;
-
+import java.util.regex.*;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -19,17 +19,14 @@
 import java.util.Map;
 import java.util.PriorityQueue;
 import java.util.Set;
-
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-
 import com.google.gson.JsonArray;
 import com.google.gson.JsonObject;
 import com.google.gson.JsonPrimitive;
-
 import edu.umd.cloud9.io.map.HMapSFW;
 import edu.umd.cloud9.io.pair.PairOfFloatInt;
 import edu.umd.cloud9.io.pair.PairOfStrings;
@@ -52,7 +49,8 @@ public class ProbabilisticStructuredQueryGenerator implements QueryGenerator {
   private float lexProbThreshold, cumProbThreshold;
   private boolean isDocStemmed, isStemming, H6, bigramSegment;
   private RetrievalEnvironment env;
-  private String queryLang, docLang;
+  private String queryLang, docLang, translateOnly;
+  private Pattern indriPuncPattern = Pattern.compile(".*\\p{Punct}.*");
 
   public ProbabilisticStructuredQueryGenerator() throws IOException {
     super();
@@ -73,6 +71,9 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     LOG.info("Stemmed stopword list file in query-language:" + conf.get(Constants.StemmedStopwordListQ));
     LOG.info("Stemmed stopword list file in doc-language:" + conf.get(Constants.StemmedStopwordListD));
 
+    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
+    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
+
     isDocStemmed = conf.getBoolean(Constants.IsDocStemmed, false);
     isStemming = conf.getBoolean(Constants.IsStemming, false);
     if (isStemming) {
@@ -81,10 +82,6 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
       defaultTokenizer = queryLangTokenizer;
     }
 
-
-    queryLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), false, conf.get(Constants.StopwordListQ), null, null);
-    queryLangTokenizerWithStemming = TokenizerFactory.createTokenizer(fs, conf, queryLang, conf.get(Constants.QueryTokenizerData), true, null, conf.get(Constants.StemmedStopwordListQ), null);
-
     if (isDocStemmed) {
       docLangTokenizer = TokenizerFactory.createTokenizer(fs, conf, docLang, conf.get(Constants.DocTokenizerData), true, null, conf.get(Constants.StemmedStopwordListD), null);
     } else {
@@ -103,9 +100,11 @@ public void init(FileSystem fs, Configuration conf) throws IOException {
     }
     LOG.info("H6 = " + H6);
 
+    translateOnly = conf.get(Constants.TranslateOnly);
+
     // initialize environment to access index
     // skip this if we only want to translate query (i.e., no retrieval)
-    if (conf.get(Constants.TranslateOnly) == null) {
+    if (translateOnly == null) {
       try {
         env = new RetrievalEnvironment(conf.get(Constants.IndexPath), fs);
         env.initialize(true);
@@ -216,7 +215,8 @@ protected HMapSFW getTranslations(String query, String token, Set<PairOfStrings>
 
       // LOG.info("Pr("+eTerm+"|"+token+")="+probEF);
 
-      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {
+      if (probEF > 0 && e > 0 && !docLangTokenizer.isStopWord(eTerm) && !(translateOnly.equals("indri") && indriPuncPattern.matcher(eTerm).matches()) && (pairsInSCFG == null || pairsInSCFG.contains(new PairOfStrings(token,eTerm)))) {
+        System.out.println(eTerm);
        // assuming our bilingual dictionary is learned from normally segmented text, but we want to use bigram tokenizer for CLIR purposes
        // then we need to convert the translations of each source token into a sequence of bigrams
        // we can distribute the translation probability equally to the each bigram
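
Reading the diff, the Indri-related fixes in this file amount to three things: the query-language tokenizers are now constructed before the isStemming block (so defaultTokenizer is assigned an already-built tokenizer), the TranslateOnly setting is cached in the new translateOnly field, and in getTranslations candidate translations containing punctuation are skipped when the output is an Indri query, since such tokens can break Indri query syntax. Below is a standalone sketch of that punctuation filter; the class name and candidate terms are made up for illustration, and it uses a null-safe "indri".equals(translateOnly) comparison, which would also cover retrieval mode, where translateOnly is null.

import java.util.regex.Pattern;

// Minimal sketch of the punctuation filter added in this commit.
public class IndriPuncFilterSketch {
  public static void main(String[] args) {
    Pattern indriPuncPattern = Pattern.compile(".*\\p{Punct}.*");
    String translateOnly = "indri";                       // value read from Constants.TranslateOnly in the real code
    String[] candidates = { "kitab", "u.s.", "can't" };   // hypothetical translation candidates

    for (String eTerm : candidates) {
      // Drop the term only when producing Indri queries and it contains any punctuation.
      boolean dropForIndri =
          "indri".equals(translateOnly) && indriPuncPattern.matcher(eTerm).matches();
      if (!dropForIndri) {
        System.out.println(eTerm);                        // prints only "kitab"
      }
    }
  }
}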

0 commit comments
