Commit 4011cfa

Merge the ssplit into the tokenize annotator
For the TokenizerAnnotatorITest, this updates the test's expected behavior to reflect that newlines are now consumed by the inner ssplit.
1 parent 5f6df2d commit 4011cfa
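
A minimal usage sketch (not part of the commit; the demo class name and sample text are ours, and we assume the standard StanfordCoreNLP/CoreDocument API): with ssplit folded into tokenize, listing only the tokenize annotator now produces sentence annotations too, and an explicit "tokenize,ssplit" list is rewritten to plain "tokenize" by the constructor.

    import java.util.Properties;
    import edu.stanford.nlp.pipeline.CoreDocument;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;

    public class TokenizeSsplitDemo {
      public static void main(String[] args) {
        Properties props = new Properties();
        // "tokenize,ssplit" would be rewritten to "tokenize" at construction
        props.setProperty("annotators", "tokenize");
        props.setProperty("tokenize.language", "es");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        CoreDocument doc = new CoreDocument("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
        pipeline.annotate(doc);
        // sentences come from the ssplit now embedded in the tokenize annotator
        System.out.println(doc.sentences().size());
      }
    }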

File tree

5 files changed: +67 -42 lines changed

itest/src/edu/stanford/nlp/pipeline/TokenizerAnnotatorITest.java (+19 -9)
@@ -29,28 +29,38 @@ public void testNotSpanish() {
     assertEquals("Damelo", ann.get(CoreAnnotations.TokensAnnotation.class).get(0).word());
   }
 
-  private static final String spanishText = "Me voy a Madrid (ES).\n\"Me gusta\", lo dice.";
-  private static List<String> spanishTokens = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
-  private static List<String> spanishTokens2 = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", AbstractTokenizer.NEWLINE_TOKEN, "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
+  private static final String spanishText = "Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.";
+  private static final String[] spanishTokens = { "Me", "voy", "a", "Madrid", "(", "ES", ")", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." };
 
   public void testSpanishTokenizer() {
-    TokenizerAnnotator annotator = new TokenizerAnnotator(false, "es", null);
+    Properties props = new Properties();
+    props.setProperty("tokenize.language", "es");
+
+    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
     Annotation annotation = new Annotation(spanishText);
     annotator.annotate(annotation);
     List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
-    assertEquals(spanishTokens.size(), tokens.size());
+    assertEquals(spanishTokens.length, tokens.size());
     for (int i = 0; i < tokens.size(); ++i) {
-      assertEquals(spanishTokens.get(i), tokens.get(i).value());
+      assertEquals(spanishTokens[i], tokens.get(i).value());
     }
+    assertEquals(1, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
+
+    // the difference here with NEWLINE_... = two, tokenizeNLs is on
+    // and there will be two sentences
+    // the sentence splitter inside the TokenizerAnnotator will see
+    // the *NL* and split a second sentence there
+    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");
 
-    annotator = new TokenizerAnnotator(false, "es", "tokenizeNLs,");
+    annotator = new TokenizerAnnotator(false, props);
     annotation = new Annotation(spanishText);
     annotator.annotate(annotation);
     tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
-    assertEquals(spanishTokens2.size(), tokens.size());
+    assertEquals(spanishTokens.length, tokens.size());
     for (int i = 0; i < tokens.size(); ++i) {
-      assertEquals(spanishTokens2.get(i), tokens.get(i).value());
+      assertEquals(spanishTokens[i], tokens.get(i).value());
     }
+    assertEquals(2, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
   }
 
 }
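
For context on the property the updated test toggles: StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY names the ssplit.newlineIsSentenceBreak option, which (as we understand it) accepts "always", "never", and "two". A sketch of the "two" behavior on the test's input, assuming the same package and imports as the test above:

    Properties props = new Properties();
    props.setProperty("tokenize.language", "es");
    // "two": only a blank line (two consecutive newlines) ends a sentence
    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation ann = new Annotation("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
    annotator.annotate(ann);
    // the embedded ssplit consumes the *NL* tokens and opens a second sentence
    assertEquals(2, ann.get(CoreAnnotations.SentencesAnnotation.class).size());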

src/edu/stanford/nlp/pipeline/Annotator.java (+22 -22)
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
     put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
     put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
+    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
+    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
     put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
-    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
-    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
+    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
+    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
+    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
     put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
-    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
-    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
-    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
-    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
-    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
+    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
+    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
+    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
+    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
+    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
+    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
     // TODO: there are language specific dependencies which we may
     // want to encode somehow. For example, English KBP needs coref
     // to function. Spanish KBP doesn't need coref, and in fact,
     // Spanish coref doesn't even exist.
-    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
   }};
 
 }
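
The practical effect of dropping STANFORD_SSPLIT from these requirement sets (a hedged sketch; the annotator list is illustrative): requirement enforcement should now pass for pipelines that never mention ssplit, since each downstream annotator's prerequisites are satisfied by tokenize alone.

    // previously pos required tokenize,ssplit; now it only requires tokenize
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);  // no missing-requirement error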

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java (+17 -8)
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
 
     // if cleanxml is requested and tokenize is here,
     // make it part of tokenize rather than its own annotator
-    unifyCleanXML(this.properties);
+    unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+    // ssplit is always part of tokenize now
+    unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);
 
     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
    * In such a case, we remove the cleanxml from the annotators and set
    * the tokenize.cleanxml option instead
    */
-  static void unifyCleanXML(Properties properties) {
+  static void unifyTokenizeProperty(Properties properties, String property, String option) {
     String annotators = properties.getProperty("annotators", "");
     int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
-    int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+    int unwanted = annotators.indexOf(property);
 
-    if (clean >= 0 && tokenize >= 0) {
-      properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
-      int comma = annotators.indexOf(",", clean);
+    if (unwanted >= 0 && tokenize >= 0) {
+      if (option != null) {
+        properties.setProperty(option, "true");
+      }
+      int comma = annotators.indexOf(",", unwanted);
       if (comma >= 0) {
-        annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+        annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
       } else {
         comma = annotators.lastIndexOf(",");
         if (comma < 0) {
           throw new IllegalArgumentException("Unable to process annotators " + annotators);
         }
         annotators = annotators.substring(0, comma);
       }
-      logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+      if (option != null) {
+        logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+      } else {
+        logger.debug(property + " is now included as part of the tokenize annotator by default");
+      }
+      logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
       properties.setProperty("annotators", annotators);
     }
   }
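
A hypothetical walk-through of the generalized helper (unifyTokenizeProperty is package-private, so this would live in edu.stanford.nlp.pipeline; it mirrors the two calls the constructor now makes):

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos");

    StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
    // annotators -> "tokenize,ssplit,pos"; tokenize.cleanxml -> "true"
    StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
    // annotators -> "tokenize,pos"; no option is set because option == null

    // if the removed annotator comes last, e.g. "tokenize,ssplit", the else
    // branch trims from the last comma instead, leaving just "tokenize"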

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java (+8 -2)
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
   private final boolean useSegmenter;
   private final Annotator segmenterAnnotator;
   private final CleanXmlAnnotator cleanxmlAnnotator;
+  private final WordsToSentencesAnnotator ssplitAnnotator;
 
   /** run a custom post processor after the lexer **/
   private final List<CoreLabelProcessor> postProcessors;
@@ -196,7 +197,7 @@ public TokenizerAnnotator(boolean verbose, String lang, String options) {
   }
 
   public TokenizerAnnotator(boolean verbose, Properties props) {
-    this(verbose, props, null);
+    this(verbose, props, computeExtraOptions(props));
   }
 
   public TokenizerAnnotator(boolean verbose, Properties props, String options) {
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
     } else {
       this.cleanxmlAnnotator = null;
     }
+
+    this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
   }
 
   /**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
+    this.ssplitAnnotator.annotate(annotation);
   }
 
   @Override
@@ -451,7 +455,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
       CoreAnnotations.IndexAnnotation.class,
       CoreAnnotations.OriginalTextAnnotation.class,
       CoreAnnotations.ValueAnnotation.class,
-      CoreAnnotations.IsNewlineAnnotation.class
+      CoreAnnotations.IsNewlineAnnotation.class,
+      CoreAnnotations.SentencesAnnotation.class,
+      CoreAnnotations.SentenceIndexAnnotation.class
     ));
   }
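
A minimal sketch of the resulting behavior (illustrative text; default tokenizer options assumed): a bare TokenizerAnnotator now tokenizes and sentence-splits in one pass, because annotate() finishes by running the embedded WordsToSentencesAnnotator. The computeExtraOptions(props) call in the two-argument constructor presumably turns on newline tokenization whenever the ssplit settings need to see *NL* tokens.

    Properties props = new Properties();
    TokenizerAnnotator tokenizer = new TokenizerAnnotator(false, props);

    Annotation ann = new Annotation("First sentence. Second one.");
    tokenizer.annotate(ann);
    // SentencesAnnotation is now among this annotator's satisfied requirements
    System.out.println(ann.get(CoreAnnotations.SentencesAnnotation.class).size());  // expect 2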

test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java (+1 -1)
@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
     for (int i = 0; i < inputs.length; ++i) {
       Properties props = new Properties();
       props.setProperty("annotators", inputs[i]);
-      StanfordCoreNLP.unifyCleanXML(props);
+      StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
       assertEquals(expected[i], props.getProperty("annotators"));
       assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
     }
