Skip to content

Commit 5e57eab

Browse files
committed
Adding the PTB Corrector as an option reduces the total number of validation errors in the PTB-to-dependencies conversion by about 250. Oddly, this net change comes from removing 280 syntax errors while adding 40 morpho errors for aux verbs; presumably those should be fixable. Of course, there is always more that can be done - there are still 2622 errors left when using the converter. UniversalDependencies/docs#717
1 parent 1f11e3f commit 5e57eab

File tree

1 file changed

+8
-1
lines changed

1 file changed

+8
-1
lines changed

src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java

+8-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import edu.stanford.nlp.semgraph.SemanticGraph;
77
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
88
import edu.stanford.nlp.trees.*;
9+
import edu.stanford.nlp.trees.treebank.EnglishPTBTreebankCorrector;
910
import edu.stanford.nlp.util.Pair;
1011
import edu.stanford.nlp.util.PropertiesUtils;
1112
import edu.stanford.nlp.util.StringUtils;
@@ -229,6 +230,7 @@ private static void addSpaceAfter(SemanticGraph sg, String text, int graphIdx) {
229230
* {@code -textFile}: A file with text to be used as a guide for SpaceAfter (optional)<br>
230231
* {@code -outputRepresentation}: "basic" (default), "enhanced", or "enhanced++"<br>
231232
* {@code -combineMWTs}: "False" (default), "True" marks things like "it's" as MWT<br>
233+
* {@code -correctPTB}: "False" (default), "True" runs the PTB Corrector over the trees
232234
*/
233235
public static void main(String[] args) {
234236
Properties props = StringUtils.argsToProperties(args);
@@ -239,6 +241,7 @@ public static void main(String[] args) {
239241
boolean addFeatures = PropertiesUtils.getBool(props, "addFeatures", false);
240242
boolean combineMWTs = PropertiesUtils.getBool(props, "combineMWTs", false);
241243
boolean replaceLemmata = PropertiesUtils.getBool(props, "replaceLemmata", false);
244+
boolean correctPTB = PropertiesUtils.getBool(props, "correctPTB", false);
242245

243246
Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null;
244247

@@ -258,7 +261,7 @@ public static void main(String[] args) {
258261
System.err.println("No input file specified!");
259262
System.err.println();
260263
System.err.printf("Usage: java %s [-treeFile trees.tree | -conlluFile deptrees.conllu]" +
261-
" [-addFeatures] [-replaceLemmata] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n",
264+
" [-addFeatures] [-replaceLemmata] [-correctPTB] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n",
262265
UniversalDependenciesConverter.class.getCanonicalName());
263266
return;
264267
}
@@ -271,6 +274,7 @@ public static void main(String[] args) {
271274

272275
UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null;
273276
EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null;
277+
EnglishPTBTreebankCorrector ptbCorrector = (correctPTB) ? new EnglishPTBTreebankCorrector() : null;
274278

275279
CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();
276280

@@ -282,6 +286,9 @@ public static void main(String[] args) {
282286
if (treeFileName != null) {
283287
//add UPOS tags
284288
Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree();
289+
if (ptbCorrector != null) {
290+
tree = ptbCorrector.transformTree(tree);
291+
}
285292
Tree uposTree = UniversalPOSMapper.mapTree(tree);
286293
List<Label> uposLabels = uposTree.preTerminalYield();
287294
for (IndexedWord token: sg.vertexListSorted()) {

0 commit comments

Comments
 (0)