Skip to content

Commit c26b25e

Browse files
committed
Add an Ssurgeon operation which adds an (English only) lemma to text
1 parent d302c63 commit c26b25e

File tree

3 files changed

+130
-0
lines changed

3 files changed

+130
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
import java.util.Objects;
4+
5+
import edu.stanford.nlp.international.Language;
6+
import edu.stanford.nlp.ling.CoreAnnotations;
7+
import edu.stanford.nlp.ling.IndexedWord;
8+
import edu.stanford.nlp.process.Morphology;
9+
import edu.stanford.nlp.semgraph.SemanticGraph;
10+
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
11+
12+
/**
13+
* Add the output of the English lemmatizer to the word in question.
14+
* Currently this only supports English! You can add a known lemma
15+
* for a different language by using EditNode and setting the lemma
16+
* attribute
17+
*
18+
* @author John Bauer
19+
*/
20+
public class Lemmatize extends SsurgeonEdit {
21+
public static final String LABEL = "lemmatize";
22+
23+
final String nodeName;
24+
final Morphology morphology;
25+
final Language language;
26+
27+
public Lemmatize(String nodeName, Language language) {
28+
if (nodeName == null) {
29+
throw new SsurgeonParseException("Cannot make a Lemmatize with no nodeName");
30+
}
31+
this.nodeName = nodeName;
32+
33+
if (language == Language.UniversalEnglish || language == Language.English) {
34+
this.language = Language.English;
35+
} else if (language == Language.Unknown) {
36+
// log something here?
37+
this.language = Language.English;
38+
} else {
39+
throw new SsurgeonParseException("Lemmatizing " + language + " is not supported");
40+
}
41+
42+
this.morphology = new Morphology();
43+
}
44+
45+
@Override
46+
public String toEditString() {
47+
StringBuilder buf = new StringBuilder();
48+
buf.append(LABEL); buf.append("\t");
49+
buf.append(Ssurgeon.NODENAME_ARG);buf.append(" ");
50+
buf.append(nodeName);
51+
return buf.toString();
52+
}
53+
54+
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
55+
IndexedWord word = sm.getNode(nodeName);
56+
if (word == null)
57+
return false;
58+
59+
String oldLemma = word.lemma();
60+
morphology.stem(word.backingLabel(), CoreAnnotations.LemmaAnnotation.class);
61+
String newLemma = word.lemma();
62+
boolean changed = !Objects.equals(oldLemma, newLemma);
63+
return changed;
64+
}
65+
}

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

+10
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
* <li> {@code reattachNamedEdge -edge edgename -gov gov -dep dep}
8484
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
8585
* <li> {@code editNode -node node ...attributes...}
86+
* <li> {@code lemmatize -node node}
8687
* <li> {@code combineMWT -node node -word word}
8788
* <li> {@code setRoots n1 (n2 n3 ...)}
8889
* <li> {@code mergeNodes n1 n2}
@@ -137,6 +138,10 @@
137138
* needs the ability to add or remove features without resetting the entire features map,
138139
* please file an issue on github.
139140
*</p><p>
141+
* {@code lemmatize} will put a lemma on a word.
142+
* {@code -node} is the node to edit.
143+
* This only works on English text.
144+
*</p><p>
140145
* {@code combineMWT} will add MWT attributes to a sequence of two or more words.
141146
* {@code -node} (repeated) is the nodes to edit.
142147
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
@@ -566,6 +571,11 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
566571
throw new SsurgeonParseException("Cannot make an EditNode out of " + argsBox.nodes.size() + " nodes. Please use exactly one -node");
567572
}
568573
return new EditNode(argsBox.nodes.get(0), argsBox.annotations, argsBox.updateMorphoFeatures);
574+
} else if (command.equalsIgnoreCase(Lemmatize.LABEL)) {
575+
if (argsBox.nodes.size() != 1) {
576+
throw new SsurgeonParseException("Cannot make a Lemmatize out of " + argsBox.nodes.size() + " nodes. Please use exactly one -node");
577+
}
578+
return new Lemmatize(argsBox.nodes.get(0), language);
569579
} else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
570580
if (argsBox.nodes.size() < 2) {
571581
throw new SsurgeonParseException("Cannot make a MergeNodes out of fewer than 2 nodes (got " + argsBox.nodes.size() + ")");

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

+55
Original file line numberDiff line numberDiff line change
@@ -1362,6 +1362,61 @@ public void checkAnnotationConversionErrors() {
13621362

13631363

13641364
/**
1365+
* Check that the edit which puts a lemma on a node redoes the lemma on the nodes it targets
1366+
*/
1367+
@Test
1368+
public void readXMLLemmatize() {
1369+
Ssurgeon inst = Ssurgeon.inst();
1370+
1371+
// use "dep" as the dependency so as to be language-agnostic in this test
1372+
String lemma = String.join(newline,
1373+
"<ssurgeon-pattern-list>",
1374+
" <ssurgeon-pattern>",
1375+
" <uid>38</uid>",
1376+
" <notes>Edit a node</notes>",
1377+
" <semgrex>" + XMLUtils.escapeXML("!{lemma:/.+/}=nolemma") + "</semgrex>",
1378+
" <edit-list>lemmatize -node nolemma</edit-list>",
1379+
" </ssurgeon-pattern>",
1380+
"</ssurgeon-pattern-list>");
1381+
List<SsurgeonPattern> patterns = inst.readFromString(lemma);
1382+
assertEquals(patterns.size(), 1);
1383+
SsurgeonPattern lemmatizeSsurgeon = patterns.get(0);
1384+
1385+
SemanticGraph sg = SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
1386+
for (IndexedWord word : sg.vertexSet()) {
1387+
assertNull(word.lemma());
1388+
}
1389+
SemanticGraph newSG = lemmatizeSsurgeon.iterate(sg).first;
1390+
String[] expectedLemmas = {"Jennifer", "have", "green", "antenna"};
1391+
for (IndexedWord word : newSG.vertexSet()) {
1392+
assertEquals(expectedLemmas[word.index() - 1], word.lemma());
1393+
}
1394+
1395+
// this version would bomb if lemmatize were not bomb-proof
1396+
lemma = String.join(newline,
1397+
"<ssurgeon-pattern-list>",
1398+
" <ssurgeon-pattern>",
1399+
" <uid>38</uid>",
1400+
" <notes>Edit a node</notes>",
1401+
" <semgrex>" + XMLUtils.escapeXML("{}=nolemma") + "</semgrex>",
1402+
" <edit-list>lemmatize -node nolemma</edit-list>",
1403+
" </ssurgeon-pattern>",
1404+
"</ssurgeon-pattern-list>");
1405+
patterns = inst.readFromString(lemma);
1406+
assertEquals(patterns.size(), 1);
1407+
lemmatizeSsurgeon = patterns.get(0);
1408+
1409+
sg = SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
1410+
for (IndexedWord word : sg.vertexSet()) {
1411+
assertNull(word.lemma());
1412+
}
1413+
newSG = lemmatizeSsurgeon.iterate(sg).first;
1414+
for (IndexedWord word : newSG.vertexSet()) {
1415+
assertEquals(expectedLemmas[word.index() - 1], word.lemma());
1416+
}
1417+
}
1418+
1419+
/**
13651420
* Check that a basic edit script works as expected
13661421
*/
13671422
@Test

0 commit comments

Comments
 (0)