Commit 4011cfa

Merge the ssplit into the tokenize annotator
For the TokenizerAnnotatorITest, this updates the test's expected behavior to reflect that newlines are now consumed by the inner ssplit.
1 parent 5f6df2d commit 4011cfa
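
A minimal usage sketch (not part of the commit; the demo class name and sample text are ours, and we assume the standard StanfordCoreNLP/CoreDocument API): with ssplit folded into tokenize, listing only the tokenize annotator now produces sentence annotations too, and an explicit "tokenize,ssplit" list is rewritten to plain "tokenize" by the constructor.

    import java.util.Properties;
    import edu.stanford.nlp.pipeline.CoreDocument;
    import edu.stanford.nlp.pipeline.StanfordCoreNLP;

    public class TokenizeSsplitDemo {
      public static void main(String[] args) {
        Properties props = new Properties();
        // "tokenize,ssplit" would be rewritten to "tokenize" at construction
        props.setProperty("annotators", "tokenize");
        props.setProperty("tokenize.language", "es");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        CoreDocument doc = new CoreDocument("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
        pipeline.annotate(doc);
        // sentences come from the ssplit now embedded in the tokenize annotator
        System.out.println(doc.sentences().size());
      }
    }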

File tree

5 files changed: +67 -42 lines changed

itest/src/edu/stanford/nlp/pipeline/TokenizerAnnotatorITest.java (+19 -9)
@@ -29,28 +29,38 @@ public void testNotSpanish() {
     assertEquals("Damelo", ann.get(CoreAnnotations.TokensAnnotation.class).get(0).word());
   }
 
-  private static final String spanishText = "Me voy a Madrid (ES).\n\"Me gusta\", lo dice.";
-  private static List<String> spanishTokens = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
-  private static List<String> spanishTokens2 = Arrays.asList(new String[] { "Me", "voy", "a", "Madrid", "(", "ES", ")", ".", AbstractTokenizer.NEWLINE_TOKEN, "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." });
+  private static final String spanishText = "Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.";
+  private static final String[] spanishTokens = { "Me", "voy", "a", "Madrid", "(", "ES", ")", "\"", "Me", "gusta", "\"", ",", "lo", "dice", "." };
 
   public void testSpanishTokenizer() {
-    TokenizerAnnotator annotator = new TokenizerAnnotator(false, "es", null);
+    Properties props = new Properties();
+    props.setProperty("tokenize.language", "es");
+
+    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
     Annotation annotation = new Annotation(spanishText);
     annotator.annotate(annotation);
     List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
-    assertEquals(spanishTokens.size(), tokens.size());
+    assertEquals(spanishTokens.length, tokens.size());
     for (int i = 0; i < tokens.size(); ++i) {
-      assertEquals(spanishTokens.get(i), tokens.get(i).value());
+      assertEquals(spanishTokens[i], tokens.get(i).value());
     }
+    assertEquals(1, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
+
+    // the difference here with NEWLINE_... = two, tokenizeNLs is on
+    // and there will be two sentences
+    // the sentence splitter inside the TokenizerAnnotator will see
+    // the *NL* and split a second sentence there
+    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");
 
-    annotator = new TokenizerAnnotator(false, "es", "tokenizeNLs,");
+    annotator = new TokenizerAnnotator(false, props);
     annotation = new Annotation(spanishText);
     annotator.annotate(annotation);
     tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
-    assertEquals(spanishTokens2.size(), tokens.size());
+    assertEquals(spanishTokens.length, tokens.size());
     for (int i = 0; i < tokens.size(); ++i) {
-      assertEquals(spanishTokens2.get(i), tokens.get(i).value());
+      assertEquals(spanishTokens[i], tokens.get(i).value());
     }
+    assertEquals(2, annotation.get(CoreAnnotations.SentencesAnnotation.class).size());
   }
 
 }
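
For context on the property the updated test toggles: StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY names the ssplit.newlineIsSentenceBreak option, which (as we understand it) accepts "always", "never", and "two". A sketch of the "two" behavior on the test's input, assuming the same package and imports as the test above:

    Properties props = new Properties();
    props.setProperty("tokenize.language", "es");
    // "two": only a blank line (two consecutive newlines) ends a sentence
    props.setProperty(StanfordCoreNLP.NEWLINE_IS_SENTENCE_BREAK_PROPERTY, "two");

    TokenizerAnnotator annotator = new TokenizerAnnotator(false, props);
    Annotation ann = new Annotation("Me voy a Madrid (ES)\n\n\"Me gusta\", lo dice.");
    annotator.annotate(ann);
    // the embedded ssplit consumes the *NL* tokens and opens a second sentence
    assertEquals(2, ann.get(CoreAnnotations.SentencesAnnotation.class).size());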

src/edu/stanford/nlp/pipeline/Annotator.java (+22 -22)
@@ -130,35 +130,35 @@ default Collection<String> exactRequirements() {
     put(STANFORD_CDC_TOKENIZE, new LinkedHashSet<>(Arrays.asList()));
     put(STANFORD_CLEAN_XML, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_SSPLIT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
+    put(STANFORD_MWT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
     put(STANFORD_DOCDATE, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA)));
+    put(STANFORD_POS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_LEMMA, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA)));
     put(STANFORD_TOKENSREGEX, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
-    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
-    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT)));
-    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
-    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
-    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
-    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_PARSE)));
+    put(STANFORD_REGEXNER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_ENTITY_MENTIONS, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_GENDER, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_TRUECASE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE)));
+    put(STANFORD_PARSE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_DETERMINISTIC_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE)));
+    put(STANFORD_COREF, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_COREF_MENTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_DEPENDENCIES)));
+    put(STANFORD_RELATION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_PARSE, STANFORD_DEPENDENCIES)));
+    put(STANFORD_SENTIMENT, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_PARSE)));
     put(STANFORD_COLUMN_DATA_CLASSIFIER, new LinkedHashSet<>(Arrays.asList()));
-    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS)));
-    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
-    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
-    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
-    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
-    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES)));
-    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
+    put(STANFORD_DEPENDENCIES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS)));
+    put(STANFORD_NATLOG, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES)));
+    put(STANFORD_OPENIE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_DEPENDENCIES, STANFORD_NATLOG)));
+    put(STANFORD_QUOTE, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF)));
+    put(STANFORD_QUOTE_ATTRIBUTION, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_LEMMA, STANFORD_NER, STANFORD_COREF_MENTION, STANFORD_DEPENDENCIES, STANFORD_QUOTE)));
+    put(STANFORD_UD_FEATURES, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES)));
+    put(STANFORD_LINK, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER, STANFORD_ENTITY_MENTIONS)));
     // TODO: there are language specific dependencies which we may
     // want to encode somehow. For example, English KBP needs coref
     // to function. Spanish KBP doesn't need coref, and in fact,
     // Spanish coref doesn't even exist.
-    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_SSPLIT, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
+    put(STANFORD_KBP, new LinkedHashSet<>(Arrays.asList(STANFORD_TOKENIZE, STANFORD_POS, STANFORD_DEPENDENCIES, STANFORD_LEMMA, STANFORD_NER)));
   }};
 
 }
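
The practical effect of dropping STANFORD_SSPLIT from these requirement sets (a hedged sketch; the annotator list is illustrative): requirement enforcement should now pass for pipelines that never mention ssplit, since each downstream annotator's prerequisites are satisfied by tokenize alone.

    // previously pos required tokenize,ssplit; now it only requires tokenize
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);  // no missing-requirement error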

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java (+17 -8)
@@ -257,7 +257,9 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
 
     // if cleanxml is requested and tokenize is here,
     // make it part of tokenize rather than its own annotator
-    unifyCleanXML(this.properties);
+    unifyTokenizeProperty(this.properties, STANFORD_CLEAN_XML, STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML);
+    // ssplit is always part of tokenize now
+    unifyTokenizeProperty(this.properties, STANFORD_SSPLIT, null);
 
     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());
@@ -315,24 +317,31 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
    * In such a case, we remove the cleanxml from the annotators and set
    * the tokenize.cleanxml option instead
    */
-  static void unifyCleanXML(Properties properties) {
+  static void unifyTokenizeProperty(Properties properties, String property, String option) {
     String annotators = properties.getProperty("annotators", "");
     int tokenize = annotators.indexOf(STANFORD_TOKENIZE);
-    int clean = annotators.indexOf(STANFORD_CLEAN_XML);
+    int unwanted = annotators.indexOf(property);
 
-    if (clean >= 0 && tokenize >= 0) {
-      properties.setProperty(STANFORD_TOKENIZE + "." + STANFORD_CLEAN_XML, "true");
-      int comma = annotators.indexOf(",", clean);
+    if (unwanted >= 0 && tokenize >= 0) {
+      if (option != null) {
+        properties.setProperty(option, "true");
+      }
+      int comma = annotators.indexOf(",", unwanted);
       if (comma >= 0) {
-        annotators = annotators.substring(0, clean) + annotators.substring(comma+1);
+        annotators = annotators.substring(0, unwanted) + annotators.substring(comma+1);
       } else {
         comma = annotators.lastIndexOf(",");
         if (comma < 0) {
           throw new IllegalArgumentException("Unable to process annotators " + annotators);
         }
         annotators = annotators.substring(0, comma);
       }
-      logger.debug("cleanxml can now be triggered as an option to tokenize rather than a separate annotator via tokenize.cleanxml=true Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
+      if (option != null) {
+        logger.debug(property + " can now be triggered as an option to tokenize rather than a separate annotator via " + option + "=true");
+      } else {
+        logger.debug(property + " is now included as part of the tokenize annotator by default");
+      }
+      logger.debug("Updating annotators from " + properties.getProperty("annotators") + " to " + annotators);
       properties.setProperty("annotators", annotators);
     }
   }
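
A hypothetical walk-through of the generalized helper (unifyTokenizeProperty is package-private, so this would live in edu.stanford.nlp.pipeline; it mirrors the two calls the constructor now makes):

    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,cleanxml,ssplit,pos");

    StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
    // annotators -> "tokenize,ssplit,pos"; tokenize.cleanxml -> "true"
    StanfordCoreNLP.unifyTokenizeProperty(props, "ssplit", null);
    // annotators -> "tokenize,pos"; no option is set because option == null

    // if the removed annotator comes last, e.g. "tokenize,ssplit", the else
    // branch trims from the last comma instead, leaving just "tokenize"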

src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java (+8 -2)
@@ -134,6 +134,7 @@ public static TokenizerType getTokenizerType(Properties props) {
   private final boolean useSegmenter;
   private final Annotator segmenterAnnotator;
   private final CleanXmlAnnotator cleanxmlAnnotator;
+  private final WordsToSentencesAnnotator ssplitAnnotator;
 
   /** run a custom post processor after the lexer **/
   private final List<CoreLabelProcessor> postProcessors;
@@ -196,7 +197,7 @@ public TokenizerAnnotator(boolean verbose, String lang, String options) {
   }
 
   public TokenizerAnnotator(boolean verbose, Properties props) {
-    this(verbose, props, null);
+    this(verbose, props, computeExtraOptions(props));
   }
 
   public TokenizerAnnotator(boolean verbose, Properties props, String options) {
@@ -250,6 +251,8 @@ public TokenizerAnnotator(boolean verbose, Properties props, String options) {
     } else {
       this.cleanxmlAnnotator = null;
     }
+
+    this.ssplitAnnotator = new WordsToSentencesAnnotator(props);
   }
 
   /**
@@ -429,6 +432,7 @@ public void annotate(Annotation annotation) {
     if (this.cleanxmlAnnotator != null) {
       this.cleanxmlAnnotator.annotate(annotation);
     }
+    this.ssplitAnnotator.annotate(annotation);
   }
 
   @Override
@@ -451,7 +455,9 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
       CoreAnnotations.IndexAnnotation.class,
       CoreAnnotations.OriginalTextAnnotation.class,
       CoreAnnotations.ValueAnnotation.class,
-      CoreAnnotations.IsNewlineAnnotation.class
+      CoreAnnotations.IsNewlineAnnotation.class,
+      CoreAnnotations.SentencesAnnotation.class,
+      CoreAnnotations.SentenceIndexAnnotation.class
     ));
   }
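
A minimal sketch of the resulting behavior (illustrative text; default tokenizer options assumed): a bare TokenizerAnnotator now tokenizes and sentence-splits in one pass, because annotate() finishes by running the embedded WordsToSentencesAnnotator. The computeExtraOptions(props) call in the two-argument constructor presumably turns on newline tokenization whenever the ssplit settings need to see *NL* tokens.

    Properties props = new Properties();
    TokenizerAnnotator tokenizer = new TokenizerAnnotator(false, props);

    Annotation ann = new Annotation("First sentence. Second one.");
    tokenizer.annotate(ann);
    // SentencesAnnotation is now among this annotator's satisfied requirements
    System.out.println(ann.get(CoreAnnotations.SentencesAnnotation.class).size());  // expect 2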

test/src/edu/stanford/nlp/pipeline/StanfordCoreNLPTest.java (+1 -1)
@@ -105,7 +105,7 @@ public void testUnifyTokenizer() {
     for (int i = 0; i < inputs.length; ++i) {
       Properties props = new Properties();
       props.setProperty("annotators", inputs[i]);
-      StanfordCoreNLP.unifyCleanXML(props);
+      StanfordCoreNLP.unifyTokenizeProperty(props, "cleanxml", "tokenize.cleanxml");
       assertEquals(expected[i], props.getProperty("annotators"));
       assertEquals(option[i], PropertiesUtils.getBool(props, "tokenize.cleanxml", false));
     }
