Adding *.java files from original repository

dasguptar · dasguptar · commit 10ec299bdeb7 · 2017-05-02T15:36:32.000+05:30
diff --git a/lib/CollapseUnaryTransformer.java b/lib/CollapseUnaryTransformer.java
@@ -0,0 +1,48 @@
+import java.util.List;
+
+import edu.stanford.nlp.ling.Label;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeTransformer;
+import edu.stanford.nlp.util.Generics;
+
+/**
+ * Tree transformer that squeezes out chains of unary nodes, keeping only
+ * the topmost node of each chain.  The Sentiment model cannot deal with
+ * unary nodes, so the result is a binary tree made up solely of binary
+ * nodes and preterminals.  The input tree is never modified: the result
+ * is built from freshly created nodes and labels.
+ *
+ * @author John Bauer
+ */
+public class CollapseUnaryTransformer implements TreeTransformer {
+  /**
+   * Returns a copy of the tree in which every unary chain is collapsed.
+   *
+   * @param tree the tree to transform
+   * @return a newly built tree; leaves and preterminals are deep-copied
+   */
+  @Override
+  public Tree transformTree(Tree tree) {
+    if (tree.isLeaf() || tree.isPreTerminal()) {
+      return tree.deepCopy();
+    }
+
+    Label topLabel = tree.label().labelFactory().newLabel(tree.label());
+
+    // Descend through single-child nodes, stopping once the only child is a leaf.
+    Tree[] kids = tree.children();
+    while (kids.length == 1) {
+      Tree onlyChild = kids[0];
+      if (onlyChild.isLeaf()) {
+        break;
+      }
+      kids = onlyChild.children();
+    }
+
+    List<Tree> collapsedKids = Generics.newArrayList();
+    for (int i = 0; i < kids.length; i++) {
+      collapsedKids.add(transformTree(kids[i]));
+    }
+    return tree.treeFactory().newTreeNode(topLabel, collapsedKids);
+  }
+}
diff --git a/lib/ConstituencyParse.java b/lib/ConstituencyParse.java
@@ -0,0 +1,234 @@
+import edu.stanford.nlp.process.WordTokenFactory;
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.util.StringUtils;
+import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
+import edu.stanford.nlp.trees.GrammaticalStructure;
+import edu.stanford.nlp.trees.GrammaticalStructureFactory;
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.Trees;
+import edu.stanford.nlp.trees.TreebankLanguagePack;
+import edu.stanford.nlp.trees.TypedDependency;
+
+import java.io.BufferedWriter;
+import java.io.FileWriter;
+import java.io.StringReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.HashMap;
+import java.util.Properties;
+import java.util.Scanner;
+
+public class ConstituencyParse {
+
+  private boolean tokenize;
+  private BufferedWriter tokWriter, parentWriter;
+  private LexicalizedParser parser;
+  private TreeBinarizer binarizer;
+  private CollapseUnaryTransformer transformer;
+  private GrammaticalStructureFactory gsf;
+
+  private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
+
+  public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
+    this.tokenize = tokenize;
+    if (tokPath != null) {
+      tokWriter = new BufferedWriter(new FileWriter(tokPath));
+    }
+    parentWriter = new BufferedWriter(new FileWriter(parentPath));
+    parser = LexicalizedParser.loadModel(PCFG_PATH);
+    binarizer = TreeBinarizer.simpleTreeBinarizer(
+      parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
+    transformer = new CollapseUnaryTransformer();
+
+    // set up to produce dependency representations from constituency trees
+    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
+    gsf = tlp.grammaticalStructureFactory();
+  }
+
+  public List<HasWord> sentenceToTokens(String line) {
+    List<HasWord> tokens = new ArrayList<>();
+    if (tokenize) {
+      PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), "");
+      for (Word label; tokenizer.hasNext(); ) {
+        tokens.add(tokenizer.next());
+      }
+    } else {
+      for (String word : line.split(" ")) {
+        tokens.add(new Word(word));
+      }
+    }
+
+    return tokens;
+  }
+
+  public Tree parse(List<HasWord> tokens) {
+    Tree tree = parser.apply(tokens);
+    return tree;
+  }
+
+  public int[] constTreeParents(Tree tree) {
+    Tree binarized = binarizer.transformTree(tree);
+    Tree collapsedUnary = transformer.transformTree(binarized);
+    Trees.convertToCoreLabels(collapsedUnary);
+    collapsedUnary.indexSpans();
+    List<Tree> leaves = collapsedUnary.getLeaves();
+    int size = collapsedUnary.size() - leaves.size();
+    int[] parents = new int[size];
+    HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
+
+    int idx = leaves.size();
+    int leafIdx = 0;
+    for (Tree leaf : leaves) {
+      Tree cur = leaf.parent(collapsedUnary); // go to preterminal
+      int curIdx = leafIdx++;
+      boolean done = false;
+      while (!done) {
+        Tree parent = cur.parent(collapsedUnary);
+        if (parent == null) {
+          parents[curIdx] = 0;
+          break;
+        }
+
+        int parentIdx;
+        int parentNumber = parent.nodeNumber(collapsedUnary);
+        if (!index.containsKey(parentNumber)) {
+          parentIdx = idx++;
+          index.put(parentNumber, parentIdx);
+        } else {
+          parentIdx = index.get(parentNumber);
+          done = true;
+        }
+
+        parents[curIdx] = parentIdx + 1;
+        cur = parent;
+        curIdx = parentIdx;
+      }
+    }
+
+    return parents;
+  }
+
+  // convert constituency parse to a dependency representation and return the
+  // parent pointer representation of the tree
+  public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
+    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
+    Collection<TypedDependency> tdl = gs.typedDependencies();
+    int len = tokens.size();
+    int[] parents = new int[len];
+    for (int i = 0; i < len; i++) {
+      // if a node has a parent of -1 at the end of parsing, then the node
+      // has no parent.
+      parents[i] = -1;
+    }
+
+    for (TypedDependency td : tdl) {
+      // let root have index 0
+      int child = td.dep().index();
+      int parent = td.gov().index();
+      parents[child - 1] = parent;
+    }
+
+    return parents;
+  }
+
+  public void printTokens(List<HasWord> tokens) throws IOException {
+    int len = tokens.size();
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < len - 1; i++) {
+      if (tokenize) {
+        sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
+      } else {
+        sb.append(tokens.get(i).word());
+      }
+      sb.append(' ');
+    }
+
+    if (tokenize) {
+      sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
+    } else {
+      sb.append(tokens.get(len - 1).word());
+    }
+
+    sb.append('\n');
+    tokWriter.write(sb.toString());
+  }
+
+  public void printParents(int[] parents) throws IOException {
+    StringBuilder sb = new StringBuilder();
+    int size = parents.length;
+    for (int i = 0; i < size - 1; i++) {
+      sb.append(parents[i]);
+      sb.append(' ');
+    }
+    sb.append(parents[size - 1]);
+    sb.append('\n');
+    parentWriter.write(sb.toString());
+  }
+
+  public void close() throws IOException {
+    if (tokWriter != null) tokWriter.close();
+    parentWriter.close();
+  }
+
+  public static void main(String[] args) throws Exception {
+    Properties props = StringUtils.argsToProperties(args);
+    if (!props.containsKey("parentpath")) {
+      System.err.println(
+        "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
+      System.exit(1);
+    }
+
+    // whether to tokenize input sentences
+    boolean tokenize = false;
+    if (props.containsKey("tokenize")) {
+      tokenize = true;
+    }
+
+    // whether to produce dependency trees from the constituency parse
+    boolean deps = false;
+    if (props.containsKey("deps")) {
+      deps = true;
+    }
+
+    String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
+    String parentPath = props.getProperty("parentpath");
+    ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);
+
+    Scanner stdin = new Scanner(System.in);
+    int count = 0;
+    long start = System.currentTimeMillis();
+    while (stdin.hasNextLine()) {
+      String line = stdin.nextLine();
+      List<HasWord> tokens = processor.sentenceToTokens(line);
+      Tree parse = processor.parse(tokens);
+
+      // produce parent pointer representation
+      int[] parents = deps ? processor.depTreeParents(parse, tokens)
+                           : processor.constTreeParents(parse);
+      
+      // print
+      if (tokPath != null) {
+        processor.printTokens(tokens);
+      }
+      processor.printParents(parents);
+
+      count++;
+      if (count % 1000 == 0) {
+        double elapsed = (System.currentTimeMillis() - start) / 1000.0;
+        System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
+      }
+    }
+
+    long totalTimeMillis = System.currentTimeMillis() - start;
+    System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
+      count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
+    processor.close();
+  }
+}
diff --git a/lib/DependencyParse.java b/lib/DependencyParse.java