Skip to content

Commit 10ec299

Browse files
committed
Adding *.java files from original repository
1 parent 8206abd commit 10ec299

File tree

3 files changed

+408
-0
lines changed

3 files changed

+408
-0
lines changed

lib/CollapseUnaryTransformer.java

+34
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,34 @@
1+
import java.util.List;
2+
3+
import edu.stanford.nlp.ling.Label;
4+
import edu.stanford.nlp.trees.Tree;
5+
import edu.stanford.nlp.trees.TreeTransformer;
6+
import edu.stanford.nlp.util.Generics;
7+
8+
/**
9+
* This transformer collapses chains of unary nodes so that the top
10+
* node is the only node left. The Sentiment model does not handle
11+
* unary nodes, so this simplifies them to make a binary tree consist
12+
* entirely of binary nodes and preterminals. A new tree with new
13+
* nodes and labels is returned; the original tree is unchanged.
14+
*
15+
* @author John Bauer
16+
*/
17+
public class CollapseUnaryTransformer implements TreeTransformer {
18+
public Tree transformTree(Tree tree) {
19+
if (tree.isPreTerminal() || tree.isLeaf()) {
20+
return tree.deepCopy();
21+
}
22+
23+
Label label = tree.label().labelFactory().newLabel(tree.label());
24+
Tree[] children = tree.children();
25+
while (children.length == 1 && !children[0].isLeaf()) {
26+
children = children[0].children();
27+
}
28+
List<Tree> processedChildren = Generics.newArrayList();
29+
for (Tree child : children) {
30+
processedChildren.add(transformTree(child));
31+
}
32+
return tree.treeFactory().newTreeNode(label, processedChildren);
33+
}
34+
}

lib/ConstituencyParse.java

+234
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,234 @@
1+
import edu.stanford.nlp.process.WordTokenFactory;
2+
import edu.stanford.nlp.ling.HasWord;
3+
import edu.stanford.nlp.ling.Word;
4+
import edu.stanford.nlp.ling.CoreLabel;
5+
import edu.stanford.nlp.process.PTBTokenizer;
6+
import edu.stanford.nlp.util.StringUtils;
7+
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
8+
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
9+
import edu.stanford.nlp.trees.GrammaticalStructure;
10+
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
11+
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
12+
import edu.stanford.nlp.trees.Tree;
13+
import edu.stanford.nlp.trees.Trees;
14+
import edu.stanford.nlp.trees.TreebankLanguagePack;
15+
import edu.stanford.nlp.trees.TypedDependency;
16+
17+
import java.io.BufferedWriter;
18+
import java.io.FileWriter;
19+
import java.io.StringReader;
20+
import java.io.IOException;
21+
import java.util.ArrayList;
22+
import java.util.Collection;
23+
import java.util.List;
24+
import java.util.HashMap;
25+
import java.util.Properties;
26+
import java.util.Scanner;
27+
28+
public class ConstituencyParse {
29+
30+
private boolean tokenize;
31+
private BufferedWriter tokWriter, parentWriter;
32+
private LexicalizedParser parser;
33+
private TreeBinarizer binarizer;
34+
private CollapseUnaryTransformer transformer;
35+
private GrammaticalStructureFactory gsf;
36+
37+
private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
38+
39+
public ConstituencyParse(String tokPath, String parentPath, boolean tokenize) throws IOException {
40+
this.tokenize = tokenize;
41+
if (tokPath != null) {
42+
tokWriter = new BufferedWriter(new FileWriter(tokPath));
43+
}
44+
parentWriter = new BufferedWriter(new FileWriter(parentPath));
45+
parser = LexicalizedParser.loadModel(PCFG_PATH);
46+
binarizer = TreeBinarizer.simpleTreeBinarizer(
47+
parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
48+
transformer = new CollapseUnaryTransformer();
49+
50+
// set up to produce dependency representations from constituency trees
51+
TreebankLanguagePack tlp = new PennTreebankLanguagePack();
52+
gsf = tlp.grammaticalStructureFactory();
53+
}
54+
55+
public List<HasWord> sentenceToTokens(String line) {
56+
List<HasWord> tokens = new ArrayList<>();
57+
if (tokenize) {
58+
PTBTokenizer<Word> tokenizer = new PTBTokenizer(new StringReader(line), new WordTokenFactory(), "");
59+
for (Word label; tokenizer.hasNext(); ) {
60+
tokens.add(tokenizer.next());
61+
}
62+
} else {
63+
for (String word : line.split(" ")) {
64+
tokens.add(new Word(word));
65+
}
66+
}
67+
68+
return tokens;
69+
}
70+
71+
public Tree parse(List<HasWord> tokens) {
72+
Tree tree = parser.apply(tokens);
73+
return tree;
74+
}
75+
76+
public int[] constTreeParents(Tree tree) {
77+
Tree binarized = binarizer.transformTree(tree);
78+
Tree collapsedUnary = transformer.transformTree(binarized);
79+
Trees.convertToCoreLabels(collapsedUnary);
80+
collapsedUnary.indexSpans();
81+
List<Tree> leaves = collapsedUnary.getLeaves();
82+
int size = collapsedUnary.size() - leaves.size();
83+
int[] parents = new int[size];
84+
HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();
85+
86+
int idx = leaves.size();
87+
int leafIdx = 0;
88+
for (Tree leaf : leaves) {
89+
Tree cur = leaf.parent(collapsedUnary); // go to preterminal
90+
int curIdx = leafIdx++;
91+
boolean done = false;
92+
while (!done) {
93+
Tree parent = cur.parent(collapsedUnary);
94+
if (parent == null) {
95+
parents[curIdx] = 0;
96+
break;
97+
}
98+
99+
int parentIdx;
100+
int parentNumber = parent.nodeNumber(collapsedUnary);
101+
if (!index.containsKey(parentNumber)) {
102+
parentIdx = idx++;
103+
index.put(parentNumber, parentIdx);
104+
} else {
105+
parentIdx = index.get(parentNumber);
106+
done = true;
107+
}
108+
109+
parents[curIdx] = parentIdx + 1;
110+
cur = parent;
111+
curIdx = parentIdx;
112+
}
113+
}
114+
115+
return parents;
116+
}
117+
118+
// convert constituency parse to a dependency representation and return the
119+
// parent pointer representation of the tree
120+
public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
121+
GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
122+
Collection<TypedDependency> tdl = gs.typedDependencies();
123+
int len = tokens.size();
124+
int[] parents = new int[len];
125+
for (int i = 0; i < len; i++) {
126+
// if a node has a parent of -1 at the end of parsing, then the node
127+
// has no parent.
128+
parents[i] = -1;
129+
}
130+
131+
for (TypedDependency td : tdl) {
132+
// let root have index 0
133+
int child = td.dep().index();
134+
int parent = td.gov().index();
135+
parents[child - 1] = parent;
136+
}
137+
138+
return parents;
139+
}
140+
141+
public void printTokens(List<HasWord> tokens) throws IOException {
142+
int len = tokens.size();
143+
StringBuilder sb = new StringBuilder();
144+
for (int i = 0; i < len - 1; i++) {
145+
if (tokenize) {
146+
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(i).word()));
147+
} else {
148+
sb.append(tokens.get(i).word());
149+
}
150+
sb.append(' ');
151+
}
152+
153+
if (tokenize) {
154+
sb.append(PTBTokenizer.ptbToken2Text(tokens.get(len - 1).word()));
155+
} else {
156+
sb.append(tokens.get(len - 1).word());
157+
}
158+
159+
sb.append('\n');
160+
tokWriter.write(sb.toString());
161+
}
162+
163+
public void printParents(int[] parents) throws IOException {
164+
StringBuilder sb = new StringBuilder();
165+
int size = parents.length;
166+
for (int i = 0; i < size - 1; i++) {
167+
sb.append(parents[i]);
168+
sb.append(' ');
169+
}
170+
sb.append(parents[size - 1]);
171+
sb.append('\n');
172+
parentWriter.write(sb.toString());
173+
}
174+
175+
public void close() throws IOException {
176+
if (tokWriter != null) tokWriter.close();
177+
parentWriter.close();
178+
}
179+
180+
public static void main(String[] args) throws Exception {
181+
Properties props = StringUtils.argsToProperties(args);
182+
if (!props.containsKey("parentpath")) {
183+
System.err.println(
184+
"usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
185+
System.exit(1);
186+
}
187+
188+
// whether to tokenize input sentences
189+
boolean tokenize = false;
190+
if (props.containsKey("tokenize")) {
191+
tokenize = true;
192+
}
193+
194+
// whether to produce dependency trees from the constituency parse
195+
boolean deps = false;
196+
if (props.containsKey("deps")) {
197+
deps = true;
198+
}
199+
200+
String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
201+
String parentPath = props.getProperty("parentpath");
202+
ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tokenize);
203+
204+
Scanner stdin = new Scanner(System.in);
205+
int count = 0;
206+
long start = System.currentTimeMillis();
207+
while (stdin.hasNextLine()) {
208+
String line = stdin.nextLine();
209+
List<HasWord> tokens = processor.sentenceToTokens(line);
210+
Tree parse = processor.parse(tokens);
211+
212+
// produce parent pointer representation
213+
int[] parents = deps ? processor.depTreeParents(parse, tokens)
214+
: processor.constTreeParents(parse);
215+
216+
// print
217+
if (tokPath != null) {
218+
processor.printTokens(tokens);
219+
}
220+
processor.printParents(parents);
221+
222+
count++;
223+
if (count % 1000 == 0) {
224+
double elapsed = (System.currentTimeMillis() - start) / 1000.0;
225+
System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
226+
}
227+
}
228+
229+
long totalTimeMillis = System.currentTimeMillis() - start;
230+
System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
231+
count, totalTimeMillis / 1000.0, totalTimeMillis / (double) count);
232+
processor.close();
233+
}
234+
}

0 commit comments

Comments
 (0)