Commit 55595d3

Merge the ssplit into the tokenize annotator
1 parent 5f6df2d commit 55595d3

File tree

4 files changed, +47 -32 lines changed


src/edu/stanford/nlp/pipeline/Annotator.java

+22-22
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
     put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
     put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
+    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
+    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
     put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
-    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
-    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
+    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
+    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
+    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
     put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
-    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
-    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
-    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
-    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
-    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
+    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
+    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
+    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
+    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
+    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
+    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
     // TODO: there are language specific dependencies which we may
     // want to encode somehow. For example, English KBP needs coref
     // to function. Spanish KBP doesn't need coref, and in fact,
     // Spanish coref doesn't even exist.
-    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
   }};

 }
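
To make the effect of the trimmed requirement sets concrete, here is a minimal sketch (not part of this commit; class name and sample text are invented) of a pipeline configured without an explicit ssplit annotator, using only standard CoreNLP calls:

    import java.util.Properties;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;

    public class SsplitMergedDemo {  // hypothetical demo class, not in the repo
      public static void main(String[] args) {
        Properties props = new Properties();
        // No "ssplit" entry: sentence splitting now runs inside the tokenize annotator,
        // so pos/lemma/ner no longer list ssplit among their requirements.
        props.setProperty("annotators", "tokenize,pos,lemma,ner");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation("Stanford is in California. It was founded in 1885.");
        pipeline.annotate(doc);
      }
    }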

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java

+17-8
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP

     // if cleanxml is requested and tokenize is here,
     // make it part of tokenize rather than its own annotator
-    unifyCleanXML(this.properties);
+    unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+    // ssplit is always part of tokenize now
+    unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);

     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
    * In such a case, we remove the cleanxml from the annotators and set
    * the tokenize.cleanxml option instead
    */
-  static void unifyCleanXML(Properties properties) {
+  static void unifyTokenizeProperty(Properties properties, String property, String option) {
     String annotators = properties.getProperty("annotators", "");
     int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
-    int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+    int unwanted = annotators.indexOf(property);

-    if (clean >= 0 && tokenize >= 0) {
-      properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
-      int comma = annotators.indexOf(",", clean);
+    if (unwanted >= 0 && tokenize >= 0) {
+      if (option != null) {
+        properties.setProperty(option, "true");
+      }
+      int comma = annotators.indexOf(",", unwanted);
       if (comma >= 0) {
-        annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+        annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
       } else {
         comma = annotators.lastIndexOf(",");
         if (comma < 0) {
           throw new IllegalArgumentException("Unable to process annotators " + annotators);
         }
         annotators = annotators.substring(0, comma);
       }
-      logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+      if (option != null) {
+        logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+      } else {
+        logger.debug(property + " is now included as part of the tokenize annotator by default");
+      }
+      logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
       properties.setProperty("annotators", annotators);
     }
   }
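
For comparison, here is a hypothetical sketch of how the generalized helper handles ssplit (the existing test only exercises the cleanxml case): a null option removes the annotator from the list without setting any tokenize.* flag. Since unifyTokenizeProperty is package-private, such a snippet would have to live in edu.stanford.nlp.pipeline; the class name is invented.

    package edu.stanford.nlp.pipeline;

    import java.util.Properties;

    public class UnifySsplitSketch {  // hypothetical, for illustration only
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma");

        // null option: remove "ssplit" from the annotators list, set no tokenize.* property
        StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);

        // prints "tokenize,pos,lemma"
        System.out.println(props.getProperty("annotators"));
      }
    }

With a non-null option string (as in the cleanxml call above), the same code path would also set that property to "true".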

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java

+7-1
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
   private final boolean useSegmenter;
   private final Annotator segmenterAnnotator;
   private final CleanXmlAnnotator cleanxmlAnnotator;
+  private final WordsToSentencesAnnotator ssplitAnnotator;

   /** run a custom post processor after the lexer **/
   private final List<CoreLabelProcessor> postProcessors;
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
     } else {
       this.cleanxmlAnnotator = null;
     }
+
+    this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
   }

   /**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
+    this.ssplitAnnotator.annotate(annotation);
   }

   @Override
@@ -451,7 +455,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
         CoreAnnotations.IndexAnnotation.class,
         CoreAnnotations.OriginalTextAnnotation.class,
         CoreAnnotations.ValueAnnotation.class,
-        CoreAnnotations.IsNewlineAnnotation.class
+        CoreAnnotations.IsNewlineAnnotation.class,
+        CoreAnnotations.SentencesAnnotation.class,
+        CoreAnnotations.SentenceIndexAnnotation.class
     ));
   }
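
Since annotate() now finishes by running the embedded WordsToSentencesAnnotator and requirementsSatisfied() advertises SentencesAnnotation, a pipeline containing only the tokenize annotator should already produce sentences. A small sketch under that assumption (class name and sample text are invented):

    import java.util.List;
    import java.util.Properties;
    import edu.stanford.nlp.ling.CoreAnnotations;
    import edu.stanford.nlp.pipeline.Annotation;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;
    import edu.stanford.nlp.util.CoreMap;

    public class TokenizeOnlySentences {  // hypothetical demo class
      public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize");  // no separate ssplit step

        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("One sentence here. And a second one.");
        pipeline.annotate(doc);

        // The tokenize annotator alone now leaves SentencesAnnotation on the document.
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        System.out.println("sentences: " + sentences.size());  // expected: 2
      }
    }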

test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java

+1-1
@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
     for (int i = 0; i < inputs.length; ++i) {
       Properties props = new Properties();
       props.setProperty("annotators", inputs[i]);
-      StanfordCoreNLP.unifyCleanXML(props);
+      StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
       assertEquals(expected[i], props.getProperty("annotators"));
       assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
     }
