Skip to content

Commit 8b97d64

Browse files
committed
Make JFlex-based tokenizers share more and be more consistent.
- Everything uses AbstractTokenizer.NEWLINE_TOKEN - French and Spanish add PTBLexer enum for dashes option/treatments, and delete ptb3Dashes options - ellipsis and dashes style "ptb3" renamed to "ascii" - extract out and unify more token regex specifications in LexCommon.tokens (e.g., PHONE, EMOJI) - add FILENAME rule to Spanish lexer
1 parent 0d9e9c8 commit 8b97d64

File tree

12 files changed

+92240
-116309
lines changed

12 files changed

+92240
-116309
lines changed

src/edu/stanford/nlp/international/french/process/FrenchLexer.flex

+275-293
Large diffs are not rendered by default.

src/edu/stanford/nlp/international/french/process/FrenchLexer.java

+12,250-10,530
Large diffs are not rendered by default.

src/edu/stanford/nlp/international/french/process/FrenchTokenizer.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ public class FrenchTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
6363
private List<CoreLabel> compoundBuffer;
6464

6565
// Produces the tokenization for parsing used by Green, de Marneffe, and Manning (2011)
66-
public static final String FTB_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,ptb3Dashes=false," +
66+
public static final String FTB_OPTIONS = "ellipses=ascii,normalizeParentheses=true,dashes=not_cp1252," +
6767
SPLIT_CONTRACTIONS_OPTION + "=true," + SPLIT_COMPOUNDS_OPTION + "=true";
6868

6969
// Official pipeline default settings for French
@@ -381,7 +381,7 @@ public static void main(String[] args) {
381381
while (tokenizer.hasNext()) {
382382
++nTokens;
383383
String word = tokenizer.next().word();
384-
if (word.equals(FrenchLexer.NEWLINE_TOKEN)) {
384+
if (word.equals(AbstractTokenizer.NEWLINE_TOKEN)) {
385385
++nLines;
386386
printSpace = false;
387387
System.out.println();

src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex

+267-297
Large diffs are not rendered by default.

src/edu/stanford/nlp/international/spanish/process/SpanishLexer.java

+13,829-7,474
Large diffs are not rendered by default.

src/edu/stanford/nlp/international/spanish/process/SpanishTokenizer.java

+2-2
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ public class SpanishTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
6464
private SpanishVerbStripper verbStripper;
6565

6666
// Produces the tokenization for parsing used by AnCora (fixed)
67-
public static final String ANCORA_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,splitAll=true";
67+
public static final String ANCORA_OPTIONS = "ellipses=ascii,normalizeParentheses=true,splitAll=true";
6868

69-
public static final String DEFAULT_OPTIONS = "invertible,ellipses=ptb3,splitAll=false";
69+
public static final String DEFAULT_OPTIONS = "invertible,ellipses=ascii,splitAll=false";
7070

7171
/**
7272
* Constructor.

src/edu/stanford/nlp/process/LexCommon.tokens

+53
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,26 @@
1+
/* Defines common token types for our JFlex-based tokenizers */
2+
3+
/* Todo: Really SGML shouldn't be here at all, it's kind of legacy. But we continue to tokenize
4+
some simple standard forms of concrete SGML syntax, since it tends to give robustness. */
5+
/* ---
6+
( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
7+
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
8+
( +[A-Za-z][A-Za-z0-9:.-]*)*
9+
FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
10+
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
11+
SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
12+
--- */
13+
14+
/* <STORYID cat=w pri=u> */
15+
/* SGML1 allows attribute value match over newline; SGML2 does not. */
16+
SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ \r\n]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ \r\n]*=[ \r\n]*('[^']*'|\"[^\"]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ \r\n]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ \r\n]*\>
17+
SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
18+
SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
19+
SPAMP = &amp;
20+
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
21+
SPLET = &[aeiouAEIOU](acute|grave|uml);
22+
23+
124
/* \u3000 is ideographic space; \u205F is medium math space */
225
/* \u2063 is an invisible separator */
326
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
@@ -12,3 +35,33 @@ NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u202
1235

1336
FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
1437
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
38+
39+
40+
/* Slightly generous but generally reasonably good emoji parsing. These patterns handle correctly 100% of emoji through Unicode 14.0 (Sept 2021). */
41+
/* These are emoji that can be followed by a zwj (U+200D) and then gender or similar things (as well as skin color). Mainly humans but certain others like bears, hearts */
42+
EMOJI_GENDERED = [\u26F9\u2764\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F408}\u{01F415}\u{01F43B}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F62E}\u{1F635}\u{01F636}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F934}-\u{01F93E}\u{01F9B8}-\u{01F9B9}\u{01F9CD}-\u{01F9DF}\u{01FAF1}-\u{01FAF2}]
43+
/* Emoji follow is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
44+
EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
45+
/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
46+
EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
47+
/* Flags (changed to use \U to avoid bug in IntelliJ JFlex plugin).
48+
* 1st disjunct: Two geographic characters as a flag
49+
* 2nd disjunct: Tag digits and small letters, currently used only for GB regions flags (Scotland, Wales, England)
50+
* Note: the ZWJ sequences for certain additional flags (rainbow, transgender, pirate) are not a disjunct here; they are matched by EMOJI_MISC
51+
*/
52+
EMOJI_FLAG = [\U01F1E6-\U01F1FF]{2,2}|\U01F3F4[\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}]+\U0E007F
53+
/* Rainbow flag, transgender flag, etc. */
54+
EMOJI_MISC = [\u{01F3F3}\u{01F3F4}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u2620\u26A7\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
55+
/* Things that have an emoji presentation form. This is where the general single character emoji appear */
56+
EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01FAFF}]
57+
/* Emoji modifier is something that appears after a zero-width joiner (zwj) U+200D */
58+
EMOJI_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2744\u2764\u2B1B\u{01F32B}\u{01F33E}\u{01F373}\u{01F37C}\u{01F384}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F466}-\u{01F469}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4A8}\u{01F4AB}\u{01F4BB}-\u{01F4BC}\u{01F525}\u{01F527}\u{01F52C}\u{01F5E8}\u{01F680}\u{01F692}\u{01F91D}\u{01F9AF}\u{01F9B0}-\u{01F9B3}\u{01F9BA}-\u{01F9BD}\u{01F9D1}\u{01FA79}\u{01FAF2}]
59+
/* flag | emoji optionally with follower | precomposed gendered/family consisting of human followed by one or more of zero width joiner then another human/profession | Misc */
60+
EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D{EMOJI_MODIFIER}{EMOJI_FOLLOW}?){1,3}|{EMOJI_MISC}
61+
62+
/* Allow N'Ko numerals */
63+
DIGIT = [:digit:]|[\u07C0-\u07C9]
64+
65+
/* phone numbers. keep multi dots pattern separate, so not confused with decimal numbers. And for new treebank tokenization 346-8792. 1st digit can't be 0 or 1 in NANP. */
66+
/* 2022: Also allow hyphen between area code and number; allow French number like 47-42-17-11 */
67+
PHONE = (\([0-9]{2,3}\)[- \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[- \u00A0\u2007\u2012])?[0-9]{2,4}[- \u00A0\u2007\u2012/])[0-9]{3,4}[- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{2,4}\.[0-9]{2,5}|((\+\+?)?[0-9]{1,4}-)?[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}

src/edu/stanford/nlp/process/LexerUtils.java

+13-9
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ private LexerUtils() {} // static methods
3232

3333
public enum QuotesEnum { UNICODE, LATEX, ASCII, NOT_CP1252, ORIGINAL }
3434

35-
public enum EllipsesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
35+
public enum EllipsesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }
3636

37-
public enum DashesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
37+
public enum DashesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }
3838

3939

4040
/** Change precomposed fraction characters to spelled out letter forms.
@@ -241,7 +241,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
241241
switch (ellipsesStyle) {
242242
case UNICODE:
243243
return unicodeEllipsisStr;
244-
case PTB3:
244+
case ASCII:
245245
return ptb3EllipsisStr;
246246
case NOT_CP1252:
247247
if (tok.equals("\u0085")) {
@@ -257,7 +257,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
257257
}
258258
}
259259

260-
// Other things to consider handling: [_\u058A\u2010\u2011\u2012]
260+
261261
public static String handleDashes(final String tok, DashesEnum dashesStyle) {
262262
switch (dashesStyle) {
263263
case UNICODE:
@@ -266,12 +266,16 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {
266266
} else {
267267
return "—"; // em dash
268268
}
269-
case PTB3:
270-
if ("-".equals(tok)) {
271-
return "-"; // keep an ASCII hyphen-minus as hyphen-minus
272-
} else {
273-
return "--"; // two hyphen-minus ascii dashes
269+
case ASCII:
270+
// Map similar things to one or two ASCII hyphen-minus characters
271+
// hyphen-minus, underscore, Armenian hyphen, hyphen, non-breaking hyphen, figure dash
272+
String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");
273+
// cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar
274+
mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");
275+
if ("---".equals(mid)) {
276+
mid = "--";
274277
}
278+
return mid;
275279
case NOT_CP1252:
276280
if (tok.equals("\u0096")) {
277281
return "–"; // en dash

0 commit comments

Comments
 (0)