Add an Ssurgeon operation which adds an (English only) lemma to text

AngledLuffa · AngledLuffa · commit c26b25e118db · 2023-12-05T08:10:33.000-08:00
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Lemmatize.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Lemmatize.java
@@ -0,0 +1,65 @@
+package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
+
+import java.util.Objects;
+
+import edu.stanford.nlp.international.Language;
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.process.Morphology;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
+
+/**
+ * Ssurgeon edit which runs the English lemmatizer on the named node
+ * and stores the result as that node's lemma.  Currently this only
+ * supports English!  You can add a known lemma for a different
+ * language by using EditNode and setting the lemma attribute.
+ *
+ * @author John Bauer
+ */
+public class Lemmatize extends SsurgeonEdit {
+  public static final String LABEL = "lemmatize";
+
+  final String nodeName;
+  final Morphology morphology;
+  final Language language;
+
+  /**
+   * @param nodeName name of the matched node to lemmatize
+   * @param language English, UniversalEnglish, or Unknown (treated as English)
+   * @throws SsurgeonParseException if nodeName is null or language is anything else
+   */
+  public Lemmatize(String nodeName, Language language) {
+    if (nodeName == null) {
+      throw new SsurgeonParseException("Cannot make a Lemmatize with no nodeName");
+    }
+    this.nodeName = nodeName;
+
+    // Unknown falls back to English, the only language supported here
+    if (language == Language.English || language == Language.UniversalEnglish
+        || language == Language.Unknown) {
+      this.language = Language.English;
+    } else {
+      throw new SsurgeonParseException("Lemmatizing " + language + " is not supported");
+    }
+    this.morphology = new Morphology();
+  }
+
+  /** Writes this edit as the label, a tab, the node argument flag, a space, and the node name. */
+  @Override
+  public String toEditString() {
+    return LABEL + "\t" + Ssurgeon.NODENAME_ARG + " " + nodeName;
+  }
+
+  /** Re-lemmatizes the named node in place; returns true only if its lemma changed. */
+  @Override
+  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
+    IndexedWord word = sm.getNode(nodeName);
+    if (word == null) {
+      return false;
+    }
+    String oldLemma = word.lemma();
+    morphology.stem(word.backingLabel(), CoreAnnotations.LemmaAnnotation.class);
+    return !Objects.equals(oldLemma, word.lemma());
+  }
+}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -83,6 +83,7 @@
  * <li> {@code reattachNamedEdge -edge edgename -gov gov -dep dep}
  * <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
  * <li> {@code editNode -node node ...attributes...}
+ * <li> {@code lemmatize -node node}
  * <li> {@code combineMWT -node node -word word}
  * <li> {@code setRoots n1 (n2 n3 ...)}
  * <li> {@code mergeNodes n1 n2}
@@ -137,6 +138,10 @@
  *   needs the ability to add or remove features without resetting the entire features map,
  *   please file an issue on github.
  *</p><p>
+ * {@code lemmatize} will put a lemma on a word.
+ * {@code -node} is the node to edit.
+ *   This only works on English text.
+ *</p><p>
  * {@code combineMWT} will add MWT attributes to a sequence of two or more words.
  * {@code -node} (repeated) is the nodes to edit.
  * {@code -word} is the optional text to use for the new MWT.  If not set, the words will be concatenated.
@@ -566,6 +571,11 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
           throw new SsurgeonParseException("Cannot make an EditNode out of " + argsBox.nodes.size() + " nodes.  Please use exactly one -node");
         }
         return new EditNode(argsBox.nodes.get(0), argsBox.annotations, argsBox.updateMorphoFeatures);
+      } else if (command.equalsIgnoreCase(Lemmatize.LABEL)) {
+        if (argsBox.nodes.size() != 1) {
+          throw new SsurgeonParseException("Cannot make a Lemmatize out of " + argsBox.nodes.size() + " nodes.  Please use exactly one -node");
+        }
+        return new Lemmatize(argsBox.nodes.get(0), language);
       } else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
         if (argsBox.nodes.size() < 2) {
           throw new SsurgeonParseException("Cannot make a MergeNodes out of fewer than 2 nodes (got " + argsBox.nodes.size() + ")");
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -1362,6 +1362,61 @@ public void checkAnnotationConversionErrors() {
 
 
   /**
+   * Check that the edit which puts a lemma on a node redoes the lemma on the nodes it targets
+   */
+  @Test
+  public void readXMLLemmatize() {
+    Ssurgeon inst = Ssurgeon.inst();
+
+    // this pattern only targets nodes which do not already have a lemma
+    String lemma = String.join(newline,
+                               "<ssurgeon-pattern-list>",
+                               "  <ssurgeon-pattern>",
+                               "    <uid>38</uid>",
+                               "    <notes>Edit a node</notes>",
+                               "    <semgrex>" + XMLUtils.escapeXML("!{lemma:/.+/}=nolemma") + "</semgrex>",
+                               "    <edit-list>lemmatize -node nolemma</edit-list>",
+                               "  </ssurgeon-pattern>",
+                               "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> patterns = inst.readFromString(lemma);
+    assertEquals(1, patterns.size());
+    SsurgeonPattern lemmatizeSsurgeon = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
+    for (IndexedWord word : sg.vertexSet()) {
+      assertNull(word.lemma());
+    }
+    SemanticGraph newSG = lemmatizeSsurgeon.iterate(sg).first;
+    String[] expectedLemmas = {"Jennifer", "have", "green", "antenna"};
+    for (IndexedWord word : newSG.vertexSet()) {
+      assertEquals(expectedLemmas[word.index() - 1], word.lemma());
+    }
+
+    // {} matches every node, lemma or not: this version would bomb if lemmatize were not bomb-proof
+    lemma = String.join(newline,
+                        "<ssurgeon-pattern-list>",
+                        "  <ssurgeon-pattern>",
+                        "    <uid>38</uid>",
+                        "    <notes>Edit a node</notes>",
+                        "    <semgrex>" + XMLUtils.escapeXML("{}=nolemma") + "</semgrex>",
+                        "    <edit-list>lemmatize -node nolemma</edit-list>",
+                        "  </ssurgeon-pattern>",
+                        "</ssurgeon-pattern-list>");
+    patterns = inst.readFromString(lemma);
+    assertEquals(1, patterns.size());
+    lemmatizeSsurgeon = patterns.get(0);
+
+    sg = SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
+    for (IndexedWord word : sg.vertexSet()) {
+      assertNull(word.lemma());
+    }
+    newSG = lemmatizeSsurgeon.iterate(sg).first;
+    for (IndexedWord word : newSG.vertexSet()) {
+      assertEquals(expectedLemmas[word.index() - 1], word.lemma());
+    }
+  }
+
+  /**
    * Check that a basic edit script works as expected
    */
   @Test