Skip to content

Commit 0660fa9

Browse files
committed
Add a two-word MergeNodes operation.
The MergeNodes operation allows arbitrary attributes to be set on the new word; if none are specified, it makes a best effort at putting a combined word & lemma on the new combined word. Would like to extend it to handle more than two words at once.
1 parent db0bd45 commit 0660fa9

File tree

4 files changed

+323
-5
lines changed

4 files changed

+323
-5
lines changed

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java

+10-5
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor
139139
}
140140
}
141141

142-
public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination) {
142+
/**
143+
* @param reverse if true, move the nodes in reverse order, from the highest index to the lowest. Pass true when moving indices up and false when moving indices down.
144+
*/
145+
public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination, boolean reverse) {
143146
// iterate first, then move, so that we don't screw up the graph while iterating
144147
List<IndexedWord> toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList());
145148
Collections.sort(toMove);
146-
Collections.reverse(toMove);
149+
if (reverse) {
150+
Collections.reverse(toMove);
151+
}
147152
for (IndexedWord word : toMove) {
148153
moveNode(sg, sm, word, destination.apply(word.index()));
149154
}
@@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
166171
// +2 to leave room: we will increase all other nodes with the
167172
// proper index, so we need +1 of room, then another +1 for
168173
// a temp place to put this node
169-
// TODO: when we implement updating the SemgrexMatcher,
170-
// this won't be necessary
174+
// TODO: we could theoretically put the new node in the right place
175+
// immediately and move the other nodes, but this is easier
171176
tempIndex = SemanticGraphUtils.maxIndex(sg) + 2;
172177

173178
if (position.equals("-")) {
@@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
203208
if (position != null && !position.equals("+")) {
204209
// the payoff for tempIndex == maxIndex + 2:
205210
// everything will be moved one higher, unless it's the new node
206-
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1);
211+
moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true);
207212
moveNode(sg, sm, newNode, newIndex);
208213
}
209214

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
2+
3+
import java.io.StringWriter;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
import java.util.Map;
7+
import java.util.TreeMap;
8+
9+
import edu.stanford.nlp.ling.CoreAnnotations;
10+
import edu.stanford.nlp.ling.CoreLabel;
11+
import edu.stanford.nlp.ling.IndexedWord;
12+
import edu.stanford.nlp.semgraph.SemanticGraph;
13+
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
14+
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
15+
16+
/**
17+
* Combines two words into one word
18+
*<br>
19+
* This requires one of the nodes to be the head of a phrase of the words,
20+
* and the dependent words can't have any extra edges in or out of that subgraph
21+
*<br>
22+
* The word and lemma will be the combination of the words, squished together.
23+
* Before and after will be updated to use the before and after of the endpoints of the subgraph
24+
*
25+
* @author John Bauer
26+
*/
27+
public class MergeNodes extends SsurgeonEdit {
28+
public static final String LABEL = "mergeNodes";
29+
final String name1;
30+
final String name2;
31+
final Map<String, String> attributes;
32+
33+
public MergeNodes(String name1, String name2, Map<String, String> attributes) {
34+
this.name1 = name1;
35+
this.name2 = name2;
36+
this.attributes = new TreeMap<>(attributes);
37+
}
38+
39+
/**
40+
* Emits a parseable instruction string.
41+
*/
42+
@Override
43+
public String toEditString() {
44+
StringWriter buf = new StringWriter();
45+
buf.write(LABEL); buf.write("\t");
46+
buf.write(name1); buf.write("\t");
47+
buf.write(name2);
48+
49+
// TODO: some attributes might need to be escaped!
50+
for (String key : attributes.keySet()) {
51+
buf.write("\t-");
52+
buf.write(key);
53+
buf.write(" ");
54+
buf.write(attributes.get(key));
55+
}
56+
57+
return buf.toString();
58+
}
59+
60+
/**
61+
* If the two named nodes are next to each other, and the edges of
62+
* the graph allow for it, squish the two words into one word
63+
*/
64+
@Override
65+
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
66+
IndexedWord node1 = sm.getNode(name1);
67+
IndexedWord node2 = sm.getNode(name2);
68+
69+
if (node1 == null || node2 == null) {
70+
return false;
71+
}
72+
73+
List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
74+
List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
75+
if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
76+
return false;
77+
}
78+
79+
// TODO: what about the case where the dep is or has copies?
80+
final IndexedWord head;
81+
final IndexedWord dep;
82+
83+
if (n1_to_n2.size() > 0) {
84+
head = node1;
85+
dep = node2;
86+
} else {
87+
head = node2;
88+
dep = node1;
89+
}
90+
91+
// If the dep has any edges that aren't between dep & head, abort
92+
// TODO: we could probably make it adjust edges with "dep" as source, instead
93+
for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
94+
if (e.getTarget() != head) {
95+
return false;
96+
}
97+
}
98+
for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
99+
if (e.getSource() != head) {
100+
return false;
101+
}
102+
}
103+
104+
IndexedWord left;
105+
IndexedWord right;
106+
if (node1.index() < node2.index()) {
107+
left = node1;
108+
right = node2;
109+
} else {
110+
left = node2;
111+
right = node1;
112+
}
113+
114+
CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
115+
if (newLabel.word() == null) {
116+
String newWord = left.word() + right.word();
117+
newLabel.setWord(newWord);
118+
}
119+
if (newLabel.value() == null) {
120+
newLabel.setValue(newLabel.word());
121+
}
122+
if (newLabel.lemma() == null) {
123+
String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
124+
newLabel.setLemma(newLemma);
125+
}
126+
// after() and before() return "" if null, so we need to use the CoreAnnotations directly
127+
if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
128+
newLabel.setAfter(right.after());
129+
}
130+
if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
131+
newLabel.setBefore(right.before());
132+
}
133+
134+
for (IndexedWord vertex : sg.vertexSet()) {
135+
if (vertex.index() == head.index()) {
136+
for (Class key : newLabel.keySet()) {
137+
Object value = newLabel.get(key);
138+
vertex.set(key, value);
139+
}
140+
}
141+
}
142+
143+
// copy the list so that deletion doesn't hurt the iterator
144+
// TODO: super fancy would be implementing iterator.remove()
145+
// on the Set returned by the SemanticGraph
146+
for (IndexedWord vertex : sg.vertexListSorted()) {
147+
if (vertex.index() == dep.index()) {
148+
sg.removeVertex(vertex);
149+
}
150+
}
151+
152+
// reindex everyone
153+
AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);
154+
155+
return true;
156+
}
157+
158+
}
159+

src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java

+13
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
8383
* <li> {@code editNode -node node ...attributes...}
8484
* <li> {@code setRoots n1 (n2 n3 ...)}
85+
* <li> {@code mergeNodes n1 n2 ...attributes...}
8586
* <li> {@code killAllIncomingEdges -node node}
8687
* <li> {@code deleteGraphFromNode -node node}
8788
* <li> {@code killNonRootedNodes}
@@ -134,6 +135,11 @@
134135
* This is best done in conjunction with other operations which actually manipulate the structure
135136
* of the graph, or the new root will weirdly have dependents and the graph will be incorrect.
136137
*</p><p>
138+
* {@code mergeNodes} will merge n1 and n2, assuming they are mergeable.
139+
* The nodes can be merged if one of the nodes is the head of a phrase
140+
* and the other node depends on the head. TODO: can make it process
141+
* more than two nodes at once.
142+
*</p><p>
137143
* {@code killAllIncomingEdges} deletes all edges to a node.
138144
* {@code -node} is the node to edit.
139145
* Note that this is the same as {@code removeEdge} with only the dependent set.
@@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
496502
String[] names = tuples1[1].split("\\s+");
497503
List<String> newRoots = Arrays.asList(names);
498504
return new SetRoots(newRoots);
505+
} else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
506+
String[] names = tuples1[1].split("\\s+", 3);
507+
if (names.length == 2 && attributeArgs.size() == 0) {
508+
return new MergeNodes(names[0], names[1], Collections.emptyMap());
509+
}
510+
final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs);
511+
return new MergeNodes(names[0], names[1], argsBox.annotations);
499512
} else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) {
500513
return new KillNonRootedNodes();
501514
}

test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java

+141
Original file line numberDiff line numberDiff line change
@@ -1063,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() {
10631063
}
10641064
}
10651065

1066+
/**
1067+
* Test a basic case of two nodes that should be merged
1068+
*<br>
1069+
* The indices should be changed as well
1070+
*/
1071+
@Test
1072+
public void readXMLMergeNodes() {
1073+
Ssurgeon inst = Ssurgeon.inst();
1074+
1075+
// Test the head word being the first word
1076+
String merge = String.join(newline,
1077+
"<ssurgeon-pattern-list>",
1078+
" <ssurgeon-pattern>",
1079+
" <uid>38</uid>",
1080+
" <notes>Merge two nodes that should not have been split</notes>",
1081+
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
1082+
" <edit-list>mergeNodes source punct</edit-list>",
1083+
" </ssurgeon-pattern>",
1084+
"</ssurgeon-pattern-list>");
1085+
List<SsurgeonPattern> patterns = inst.readFromString(merge);
1086+
assertEquals(patterns.size(), 1);
1087+
SsurgeonPattern mergeSsurgeon = patterns.get(0);
1088+
1089+
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1090+
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
1091+
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
1092+
assertEquals(expected, newSG);
1093+
IndexedWord prof = sg.getNodeByIndexSafe(3);
1094+
assertNotNull(prof);
1095+
assertEquals("prof.", prof.word());
1096+
assertEquals("prof.", prof.value());
1097+
assertNull(prof.lemma());
1098+
1099+
// Same test, but this time test merging the lemmas
1100+
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1101+
sg.getNodeByIndexSafe(3).setLemma("prof");
1102+
sg.getNodeByIndexSafe(4).setLemma(".");
1103+
newSG = mergeSsurgeon.iterate(sg).first;
1104+
assertEquals(expected, newSG);
1105+
prof = sg.getNodeByIndexSafe(3);
1106+
assertEquals("prof.", prof.lemma());
1107+
1108+
// Test the head word being the second word
1109+
merge = String.join(newline,
1110+
"<ssurgeon-pattern-list>",
1111+
" <ssurgeon-pattern>",
1112+
" <uid>38</uid>",
1113+
" <notes>Merge two nodes that should not have been split</notes>",
1114+
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
1115+
" <edit-list>mergeNodes source punct</edit-list>",
1116+
" </ssurgeon-pattern>",
1117+
"</ssurgeon-pattern-list>");
1118+
patterns = inst.readFromString(merge);
1119+
assertEquals(patterns.size(), 1);
1120+
mergeSsurgeon = patterns.get(0);
1121+
1122+
// Check what happens if the root of the phrase is on the right and the dep is on the left
1123+
// The words & lemmas should still hopefully be merged in order
1124+
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1125+
sg.getNodeByIndexSafe(3).setLemma(".");
1126+
assertEquals(".", sg.getNodeByIndexSafe(3).word());
1127+
sg.getNodeByIndexSafe(4).setLemma("prof");
1128+
newSG = mergeSsurgeon.iterate(sg).first;
1129+
expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
1130+
assertEquals(expected, newSG);
1131+
prof = newSG.getNodeByIndexSafe(3);
1132+
assertEquals(".prof", prof.word());
1133+
assertEquals(".prof", prof.lemma());
1134+
}
1135+
1136+
1137+
/**
1138+
* Test a basic case of two nodes that should be merged
1139+
*<br>
1140+
* The indices should be changed as well
1141+
*/
1142+
@Test
1143+
public void readXMLMergeNodesAttributes() {
1144+
Ssurgeon inst = Ssurgeon.inst();
1145+
1146+
// Test the head word being the first word
1147+
String merge = String.join(newline,
1148+
"<ssurgeon-pattern-list>",
1149+
" <ssurgeon-pattern>",
1150+
" <uid>38</uid>",
1151+
" <notes>Merge two nodes that should not have been split</notes>",
1152+
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
1153+
" <edit-list>mergeNodes source punct -word foo -lemma bar</edit-list>",
1154+
" </ssurgeon-pattern>",
1155+
"</ssurgeon-pattern-list>");
1156+
List<SsurgeonPattern> patterns = inst.readFromString(merge);
1157+
assertEquals(patterns.size(), 1);
1158+
SsurgeonPattern mergeSsurgeon = patterns.get(0);
1159+
1160+
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1161+
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
1162+
SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
1163+
assertEquals(expected, newSG);
1164+
IndexedWord prof = sg.getNodeByIndexSafe(3);
1165+
assertNotNull(prof);
1166+
assertEquals("foo", prof.word());
1167+
assertEquals("foo", prof.value());
1168+
assertEquals("bar", prof.lemma());
1169+
}
1170+
1171+
/**
1172+
* Test a basic case of two nodes that should be merged
1173+
*<br>
1174+
* The indices should be changed as well
1175+
*/
1176+
@Test
1177+
public void readXMLMergeNodesFailCases() {
1178+
Ssurgeon inst = Ssurgeon.inst();
1179+
1180+
// use "dep" as the dependency so as to be language-agnostic in this test
1181+
String merge = String.join(newline,
1182+
"<ssurgeon-pattern-list>",
1183+
" <ssurgeon-pattern>",
1184+
" <uid>38</uid>",
1185+
" <notes>Merge two nodes that should not have been split</notes>",
1186+
" <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
1187+
" <edit-list>mergeNodes source punct</edit-list>",
1188+
" </ssurgeon-pattern>",
1189+
"</ssurgeon-pattern-list>");
1190+
List<SsurgeonPattern> patterns = inst.readFromString(merge);
1191+
assertEquals(patterns.size(), 1);
1192+
SsurgeonPattern mergeSsurgeon = patterns.get(0);
1193+
1194+
// Add an extra edge from the punct we want to squash to somewhere else
1195+
// The graph should not be changed
1196+
SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1197+
SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
1198+
SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1199+
assertEquals(expected, newSG);
1200+
1201+
sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1202+
newSG = mergeSsurgeon.iterate(sg).first;
1203+
expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
1204+
assertEquals(expected, newSG);
1205+
}
1206+
10661207
/**
10671208
* The AddDep should update the matches in the SemgrexMatcher.
10681209
* If that isn't done correctly, then moving the words first

0 commit comments

Comments
 (0)