Skip to content

Commit 9a86ece

Browse files
committed
Add a fake XSL node when converting constituency trees to dependencies for SD. This makes it easy to treat 'up to' as an MWE. #1363
1 parent 8c46648 commit 9a86ece

6 files changed

+27
-13
lines changed

src/edu/stanford/nlp/trees/CollinsHeadFinder.java

+2
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ public CollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid)
7272
nonTerminalInfo.put("TYPO", new String[][] {{"left"}}); // another crap rule, for Brown (Roger)
7373
nonTerminalInfo.put("EDITED", new String[][] {{"left"}}); // crap rule for Switchboard (if don't delete EDITED nodes)
7474
nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP
75+
// XSL is similar to XS, but is specifically for left-headed phrases
76+
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
7577
}
7678

7779
@Override

src/edu/stanford/nlp/trees/EnglishGrammaticalRelations.java

+8-12
Original file line numberDiff line numberDiff line change
@@ -980,16 +980,10 @@ private EnglishGrammaticalRelations() {}
980980
public static final GrammaticalRelation QUANTIFIER_MODIFIER =
981981
new GrammaticalRelation(Language.English, "quantmod", "quantifier modifier",
982982
MODIFIER, "QP", tregexCompiler,
983-
// RP is because sometimes "up" in "up to ___" gets tagged RP in PTB
984-
// this is probably a mistake - generally it is tagged IN
985-
// but sometimes the tagger follows suit
986-
// there are no conflicts elsewhere in the targets of a QP,
987-
// so there should be no need to specifically check for the phrase "up to" for `up_RP`
988-
"QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|RP=target",
989-
// TO is for the "to" in "up to ___"
990-
// TODO: but currently not working for up_IN to_IN foo_CD, since it wants to make TO the head of IN!
991-
"(QP < (TO=target < /^(?i:to)$/) < (__=up < /^(?i:up)$/)) : (=up $++ =target)");
992-
983+
// XS and XSL are included to match "up to" or similar phrases
984+
// after the QPTreeTransformer's operation
985+
"QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|XSL|RP=target"
986+
);
993987

994988
/**
995989
* The "noun compound modifier" grammatical relation. A noun compound
@@ -1275,15 +1269,17 @@ private EnglishGrammaticalRelations() {}
12751269
*/
12761270
public static final GrammaticalRelation MULTI_WORD_EXPRESSION =
12771271
new GrammaticalRelation(Language.English, "mwe", "multi-word expression",
1278-
MODIFIER, "PP|XS|ADVP|CONJP", tregexCompiler,
1272+
MODIFIER, "PP|XS|XSL|ADVP|CONJP", tregexCompiler,
12791273
"PP|XS < (IN|TO < as|of|at|to|in) < (JJ|IN|JJR|JJS|NN=target < such|because|Because|least|instead|due|Due|addition|to)",
12801274
"ADVP < (RB|IN < well) < (IN|RB|JJS=target < as)",
12811275
// TODO: perhaps the phrase "all but" is more like "all" and should have that as the head
12821276
"ADVP < (DT=target < all) < (CC < but)",
12831277
"CONJP < (RB < rather|well|instead) < (RB|IN=target < as|than|of)",
12841278
"CONJP < (IN < in) < (NN|TO=target < addition|to)",
12851279
// todo: note inconsistent head finding for "rather than"!
1286-
"XS < JJR|JJS=target" // more than, fewer than, well over -- maybe change some of these?
1280+
"XS < JJR|JJS=target", // more than, fewer than, well over -- maybe change some of these?
1281+
// XSL nodes currently only cover "up to"
1282+
"XSL < __=target"
12871283
);
12881284

12891285
/* mihai: this block needs to be uncommented to get the KBP 2010 system to work (due to the cached sentences using old code)

src/edu/stanford/nlp/trees/ModCollinsHeadFinder.java

+2
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ public ModCollinsHeadFinder(TreebankLanguagePack tlp) {
138138

139139
nonTerminalInfo.put("META", new String[][] {{"left"}}); // rule for OntoNotes, but maybe should just be deleted in TreeReader??
140140
nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
141+
// XSL is similar to XS, but is specifically for left-headed phrases
142+
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
141143
// nonTerminalInfo.put(null, new String[][] {{"left"}}); // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?
142144

143145
// todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.

src/edu/stanford/nlp/trees/QPTreeTransformer.java

+11-1
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,14 @@ public Tree transformTree(Tree t) {
6767
private static TsurgeonPattern flattenNPoverQPTsurgeon =
6868
Tsurgeon.parseOperation("[createSubtree QP left right] [excise left left] [excise right right]");
6969

70+
private static TregexPattern multiwordXSLTregex =
71+
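// captures "up to" -- NOTE(review): the word patterns below have no (?i) flag, so capitalized "Up to" is presumably not captured; confirm this is intended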
72+
// once "up to" is captured in the XSL, the following XS operation won't accidentally grab it
73+
TregexPattern.compile("QP < ( /^RB|IN|RP/=left < /^(?:up)$/ ) < ( /^IN|TO/=right < /^(?:to)$/ $- =left )");
74+
75+
private static TsurgeonPattern multiwordXSLTsurgeon =
76+
Tsurgeon.parseOperation("createSubtree XSL left right");
77+
7078
private static TregexPattern multiwordXSTregex =
7179
// TODO: should add NN and $ to the numeric expressions captured
7280
// NN is for words such as "half" which are probably misparsed
@@ -109,8 +117,10 @@ public Tree transformTree(Tree t) {
109117
*/
110118
public Tree QPtransform(Tree t) {
111119
t = Tsurgeon.processPattern(flattenNPoverQPTregex, flattenNPoverQPTsurgeon, t);
112-
if ( ! universalDependencies)
120+
if (!universalDependencies) {
121+
t = Tsurgeon.processPattern(multiwordXSLTregex, multiwordXSLTsurgeon, t);
113122
t = Tsurgeon.processPattern(multiwordXSTregex, multiwordXSTsurgeon, t);
123+
}
114124
t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
115125
t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
116126
return t;

src/edu/stanford/nlp/trees/SemanticHeadFinder.java

+2
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ private void ruleChanges() {
199199

200200
// add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
201201
nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
202+
// XSL is similar to XS, but is specifically for left-headed phrases
203+
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
202204

203205
// add a rule to deal with the CoNLL data
204206
nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});

src/edu/stanford/nlp/trees/UniversalSemanticHeadFinder.java

+2
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ private void ruleChanges() {
181181

182182
// add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
183183
nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
184+
// XSL is similar to XS, but is specifically for left-headed phrases
185+
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
184186

185187
// add a rule to deal with the CoNLL data
186188
nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});

0 commit comments

Comments
 (0)