Skip to content

Commit cc7983e

Browse files
committed
Fix optional CoordinationPatterns - both conjunctions and disjunctions would infinite loop, as the matcher would accept a failed match even after a previous success (meaning it would always be willing to accept the same failed match again). #1375
1 parent 50f0985 commit cc7983e

File tree

4 files changed

+143
-4
lines changed

4 files changed

+143
-4
lines changed

src/edu/stanford/nlp/trees/tregex/CoordinationPattern.java

+45-4
Original file line numberDiff line numberDiff line change
@@ -131,22 +131,51 @@ void resetChildIter(Tree tree) {
131131
}
132132
}
133133

134+
@Override
135+
boolean isReset() {
136+
// if we partially or completely went through the node,
137+
// we are obviously not reset
138+
if (currChild != 0) {
139+
return false;
140+
}
141+
// if we're at the start, and the first child is not
142+
// initialized, we haven't done anything yet
143+
if (children[0] == null) {
144+
return true;
145+
}
146+
// otherwise, we may have initialized the child
147+
// on a previous time through, then reset it
148+
return (children[0].isReset());
149+
}
150+
134151
// find the next local match
135152
@Override
136153
public boolean matches() { // also known as "FUN WITH LOGIC"
137154
if (considerAll) {
138155
// these are the cases where all children must be considered to match
139156
if (currChild < 0) {
140-
// a past call to this node either got that it failed
157+
// A past call to this node either got that it failed
141158
// matching or that it was a negative match that succeeded,
142159
// which we only want to accept once
143-
return myNode.isOptional();
160+
// Note that in the case of isOptional nodes, we want to NOT
161+
// match again. The previous time through failed to match,
162+
// but was already returned as true because of isOptional().
163+
// If we match again, it will infinite loop because it
164+
// keeps "succeeding" at the optional match
165+
return false;
144166
}
145167

146168
// we must have happily reached the end of a match the last
147169
// time we were here
170+
// we track pastSuccess so that if we reach a failure in
171+
// an optional node after already succeeding, we don't return
172+
// another success, which would be a spurious extra match
173+
final boolean pastSuccess;
148174
if (currChild == children.length) {
149175
--currChild;
176+
pastSuccess = true;
177+
} else {
178+
pastSuccess = false;
150179
}
151180

152181
while (true) {
@@ -173,7 +202,7 @@ public boolean matches() { // also known as "FUN WITH LOGIC"
173202
// earlier location.
174203
--currChild;
175204
if (currChild < 0) {
176-
return myNode.isOptional();
205+
return myNode.isOptional() && !pastSuccess;
177206
}
178207
} else {
179208
// oops, this didn't work - negated disjunction version
@@ -188,6 +217,16 @@ public boolean matches() { // also known as "FUN WITH LOGIC"
188217
}
189218
}
190219
} else {
220+
// Track the first time through this loop
221+
// This will let us handle optional disjunctions
222+
// Note that we can't just check currChild, since it might be 0
223+
// for a match that already hit once on the first child
224+
// We also can't check that children[0] is not created, since
225+
// it might be created and then later reset if this node is
226+
// reached a second time after something higher in the tree
227+
// already matched
228+
final boolean firstTime = isReset();
229+
191230
// these are the cases where a single child node can make you match
192231
for (; currChild < children.length; currChild++) {
193232
if (children[currChild] == null) {
@@ -214,7 +253,9 @@ public boolean matches() { // also known as "FUN WITH LOGIC"
214253
children[resetChild].resetChildIter();
215254
}
216255
}
217-
return myNode.isOptional();
256+
// only accept an optional disjunction if this is the first time through
257+
// otherwise, we'd be accepting the same disjunction over and over
258+
return firstTime && myNode.isOptional();
218259
}
219260
}
220261

src/edu/stanford/nlp/trees/tregex/DescriptionPattern.java

+5
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,11 @@ private void resetChild() {
356356
}
357357
}
358358

359+
@Override
360+
boolean isReset() {
361+
return (treeNodeMatchCandidateIterator == null);
362+
}
363+
359364
/* goes to the next node in the tree that is a successful match to my description pattern.
360365
* This is the hotspot method in running tregex, but not clear how to make it faster. */
361366
// when finished = false; break; is called, it means I successfully matched.

src/edu/stanford/nlp/trees/tregex/TregexMatcher.java

+10
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,16 @@ void resetChildIter(Tree tree) {
9090
*/
9191
abstract void resetChildIter();
9292

93+
/**
94+
* Specifically useful for CoordinationPattern optional disjunctions. <br>
95+
* We want to know if the node has succeeded at least once already,
96+
* in which case we won't succeed in the case of a failure of all
97+
* remaining children. <br>
98+
* Checking this instead of keeping a variable will be slightly faster
99+
* for the standard use case of not using disjunctions
100+
*/
101+
abstract boolean isReset();
102+
93103
/**
94104
* Does the pattern match the tree? It's actually closer to java.util.regex's
95105
* "lookingAt" in that the root of the tree has to match the root of the pattern

test/src/edu/stanford/nlp/trees/tregex/TregexTest.java

+83
Original file line numberDiff line numberDiff line change
@@ -1571,6 +1571,89 @@ public void testOptionalToString() {
15711571
assertEquals("Root (A ?[< B | < C ])", pattern.toString());
15721572
}
15731573

1574+
/**
1575+
* Tests the subtree pattern, <code>&lt;...</code>, which checks for
1576+
* an exact subtree under our current tree, but test it as an optional relation.
1577+
*<br>
1578+
* Checks a bug reported by @tanloong (https://github.com/stanfordnlp/CoreNLP/issues/1375)
1579+
*<br>
1580+
* The optional subtree should only match exactly once,
1581+
* but its buggy form was an infinite loop
1582+
* (see CoordinationPattern for more notes on why)
1583+
*/
1584+
public void testOptionalSubtreePattern() {
1585+
runTest("A ?<... { B ; C ; D }", "(A (B 1) (C 2) (D 3))", "(A (B 1) (C 2) (D 3))");
1586+
}
1587+
1588+
/**
1589+
* The bug reported by @tanloong (https://github.com/stanfordnlp/CoreNLP/issues/1375)
1590+
* actually applied to <i>all</i> optional coordination patterns
1591+
*<br>
1592+
* Here we check that a simpler optional conjunction also fails
1593+
*/
1594+
public void testOptionalChild() {
1595+
runTest("A ?(< B <C)", "(A (B 1) (C 2) (D 3))", "(A (B 1) (C 2) (D 3))");
1596+
}
1597+
1598+
/**
1599+
* An optional coordination which doesn't hit should also match exactly once
1600+
*/
1601+
public void testOptionalChildMiss() {
1602+
runTest("A ?(< B < E)", "(A (B 1) (C 2) (D 3))", "(A (B 1) (C 2) (D 3))");
1603+
}
1604+
1605+
/**
1606+
* An optional disjunction coordination should match at least once,
1607+
* but not match any extra times just because of the optional
1608+
*/
1609+
public void testOptionalDisjunction() {
1610+
// this matches once as an optional, even though none of the children match
1611+
runTest("A ?[< E | < F]", "(A (B 1) (C 2) (D 3))", "(A (B 1) (C 2) (D 3))");
1612+
1613+
// this matches twice
1614+
runTest("A ?[< B | < C]", "(A (B 1) (C 2))", "(A (B 1) (C 2))", "(A (B 1) (C 2))");
1615+
// this matches once, since the (< E) is useless
1616+
runTest("A ?[< B | < E]", "(A (B 1) (C 2) (D 3))", "(A (B 1) (C 2) (D 3))");
1617+
// now it will match twice, since the B should match twice
1618+
runTest("A ?[< B | < E]", "(A (B 1) (C 2) (B 3))", "(A (B 1) (C 2) (B 3))", "(A (B 1) (C 2) (B 3))");
1619+
1620+
// check by hand that foo & bar are set as expected for the disjunction matches
1621+
// note that the order will be the order of the disjunction then subtrees,
1622+
// not sorted by the order of the subtrees
1623+
TregexPattern pattern = TregexPattern.compile("A ?[< B=foo | < C=bar]");
1624+
Tree tree = treeFromString("(A (B 1) (C 2) (B 3))");
1625+
TregexMatcher matcher = pattern.matcher(tree);
1626+
assertTrue(matcher.find());
1627+
assertEquals("(B 1)", matcher.getNode("foo").toString());
1628+
assertNull(matcher.getNode("bar"));
1629+
1630+
assertTrue(matcher.find());
1631+
assertEquals("(B 3)", matcher.getNode("foo").toString());
1632+
assertNull(matcher.getNode("bar"));
1633+
1634+
assertTrue(matcher.find());
1635+
assertNull(matcher.getNode("foo"));
1636+
assertEquals("(C 2)", matcher.getNode("bar").toString());
1637+
1638+
assertFalse(matcher.find());
1639+
1640+
// this example should also work if the same name is used
1641+
// for both of the children!
1642+
pattern = TregexPattern.compile("A ?[< B=foo | < C=foo]");
1643+
matcher = pattern.matcher(tree);
1644+
assertTrue(matcher.find());
1645+
assertEquals("(B 1)", matcher.getNode("foo").toString());
1646+
1647+
assertTrue(matcher.find());
1648+
assertEquals("(B 3)", matcher.getNode("foo").toString());
1649+
1650+
assertTrue(matcher.find());
1651+
assertEquals("(C 2)", matcher.getNode("foo").toString());
1652+
1653+
assertFalse(matcher.find());
1654+
1655+
}
1656+
15741657
/**
15751658
* A user supplied an example of a negated disjunction which went into an infinite loop.
15761659
* Apparently no one had ever used a negated disjunction of tree structures before!

0 commit comments

Comments
 (0)