Skip to content

Commit 9476a8e

Browse files
committed
Make PTBLexer recognize most things with apostrophes in them as single words
- Only break on known prefixes (e.g., th' for the) and suffixes (e.g., 's and 'll) - Add suffix 'em for them - Split up 'tain't into 3 tokens - Allow as tokens things like covid-19 variants: BA.5 and BA.2.12.1 - Add 16 test cases for new behavior
1 parent cd4e49f commit 9476a8e

File tree

3 files changed

+60944
-64805
lines changed

3 files changed

+60944
-64805
lines changed

src/edu/stanford/nlp/process/PTBLexer.flex

+95-39
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ import edu.stanford.nlp.util.logging.Redwood;
266266

267267

268268
/** Turn on to find out how things were tokenized. */
269-
private static final boolean DEBUG = false;
269+
// Keep this off in committed code: when true, the lexer actions guarded by DEBUG call logger.info for each token they match.
private static final boolean DEBUG = false;
270270

271271
/** A logger for this class */
272272
private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);
@@ -571,13 +571,13 @@ import edu.stanford.nlp.util.logging.Redwood;
571571
SENTEND1 = {SPACENL}({SPACENL}|[:uppercase:]|{SGML1})
572572
SENTEND2 = {SPACE}({SPACE}|[:uppercase:]|{SGML2})
573573

574+
/* Note that JFlex doesn't support {2,} pattern form. Only {j,k}. */
574575
DATE = {DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{2,4}|{DIGIT}{4}[\-\u2012\/]{DIGIT}{1,2}[\-\u2012\/]{DIGIT}{1,2}
575576
/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
576577
NUM = {DIGIT}*([.,\u066B\u066C]{DIGIT}+)+|{DIGIT}+([.:,\u00AD\u066B\u066C\u2009\u202F]{DIGIT}+)*
577578
LEADING_NUM = {DIGIT}+([.,\u066B\u066C]{DIGIT}+)+
578579
/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
579-
years or times in parentheses), and having them in tokens messes up
580-
treebank parsing.
580+
years or times in parentheses), and having them in tokens messes up treebank parsing.
581581
NUMBER = [\-+]?{NUM}|\({NUM}\) */
582582
NUMBER = [\-\u2212+]?{NUM}
583583
SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+|[\u2080-\u2089]+)
@@ -614,7 +614,7 @@ THING_LETTER = ([dDoOlL]{APOSETCETERA}[\p{Alpha}\p{Digit}])?([\p{Alpha}\p{Digit}
614614
THINGA = [A-Z]+(([+&]|{SPAMP})[A-Z]+)+
615615
THING3 = [\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}(\\?\/[\p{Alpha}\p{Digit}]+(-[\p{Alpha}]+){0,2}){1,2}
616616
APOS = ['\u0092\u2019´]|&apos; /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
617-
/* Includes extra ones that may appear inside a word, rightly or wrongly */
617+
/* Includes extra ones that may appear inside a word, rightly or wrongly: ASCII backquote, CP1252 left curly quote, left curly quote, high upside down left curly quote */
618618
APOSETCETERA = {APOS}|[`\u0091\u2018\u201B]
619619
/* HTHING recognizes hyphenated words, including ones with various kinds of numbers in them. And with underscores. */
620620
HTHING = [\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD\u200C\u200D\u2060]*([-_]([\p{Alpha}\p{Digit}\u00AD\u200C\u200D\u2060]+(\.{DIGIT}+)?|{ACRO2}\.))+
@@ -639,21 +639,24 @@ HTHINGEXCEPTIONPREFIXED = (e|a|u|x|agro|ante|anti|arch|be|bi|bio|co|counter|cros
639639
HTHINGEXCEPTIONSUFFIXED = ([\p{Alpha}\p{Digit}][\p{Alpha}\p{Digit}.,\u00AD]*)(-)(esque|ette|fest|fold|gate|itis|less|most|o-torium|rama|wise)(s|es|d|ed)?
640640
HTHINGEXCEPTIONWHOLE = (mm-hm|mm-mm|o-kay|uh-huh|uh-oh)(s|es|d|ed)?
641641

642-
/* things like 'll and 'm */
643-
REDAUX = {APOSETCETERA}([msdMSD]|re|ve|ll)
642+
/* things like 'll and 'm and 'em for them */
643+
REDAUX = {APOSETCETERA}(m|s|d|re|ve|ll|em)
644644
/* For things that will have n't on the end. They can't end in 'n' */
645645
/* \u00AD is soft hyphen. \u2060 is word joiner */
646-
SWORD = [\p{Alpha}\u00AD\u200C\u200D\u2060]*[A-MO-Za-mo-z][\u00AD\u200C\u200D\u2060]*
647-
SREDAUX = n{APOSETCETERA}t
648-
/* Tokens you want but already okay: C'mon 'n' '[2-9]0s '[eE]m 'till?
649-
[Yy]'all 'Cause Shi'ite B'Gosh o'clock. Here now only need apostrophe
650-
final words. */
651-
/* Note that Jflex doesn't support {2,} form. Only {2,k}. */
652-
/* [yY]' is for Y'know, y'all and I for I. So exclude from one letter first */
653-
/* Rest are for French borrowings. n allows n'ts in "don'ts" */
654-
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. 'Twixt for betwixt */
655-
APOWORD = {APOS}n{APOS}?|[lLdDjJ]{APOS}|(Dunkin|somethin|ol){APOS}|{APOS}em|diff{APOSETCETERA}rent|[A-HJ-XZn]{APOSETCETERA}[:letter:]{2}[:letter:]*|{APOS}[1-9]0s|[1-9]0{APOS}s|{APOS}till?|[:letter:][:letter:]*[aáeiouhlpyAEIOUY]{APOSETCETERA}[aeiíoulA-Z][:letter:]*|{APOS}cause|cont{APOSETCETERA}d\.?|nor{APOSETCETERA}easter|c{APOSETCETERA}mon|e{APOSETCETERA}er|s{APOSETCETERA}mores|ev{APOSETCETERA}ry|li{APOSETCETERA}l|nat{APOSETCETERA}l|ass{APOSETCETERA}t|'twixt|O{APOSETCETERA}o
656-
APOWORD2 = y{APOS}
646+
WORD_NOT = [\p{Alpha}\u00AD\u200C\u200D\u2060]*[A-MO-Za-mo-z][\u00AD\u200C\u200D\u2060]*
647+
REDAUX_NOT = n{APOSETCETERA}ts?
648+
649+
/* 2022 tokenizer change. We generally allow apostrophes (including curly ones) into words. This is much better for
650+
* Hebrew, Arabic, Star Trek and some Black American names, etc. We only separate off word forms with apostrophes
651+
* that are known common word shortenings or clitics.
652+
*/
653+
/* Tokens you want: 'n' '[2-9]0s '[eE]m 'till? 'Cause Shi'ite B'Gosh o'clock 'Twixt
654+
Here now only need apostrophe initial or final words listed. */
655+
/* Single letters are for French borrowings. */
656+
/* Arguably, c'mon should be split to "c'm" + "on", but not yet. */
657+
APOWORD = {WORD}({APOSETCETERA}{WORD})+|\p{Script=Latin}{APOSETCETERA}[A-Z]\.([A-Z]\.)+|{APOS}n{APOS}?|([lLdDjJ]|Dunkin|somethin|ol){APOS}|{APOS}(em|till?|cause|twixt|[1-9]0s)|[1-9]0{APOS}s
658+
/* APOWORD2 is things we will strip at beginning of word: th' shortening "the" (Th'enchanting) and y' shortening "you" (y'know, y'all) */
659+
APOWORD2 = (th|y){APOS}
657660
/* Some Wired URLs end in + or = so omit that too. Some quoting with '[' and ']' so disallow. */
658661
FULLURL = (ftp|svn|svn\+ssh|http|https|mailto):\/\/[^ \t\n\f\r<>|`\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+[^ \t\n\f\r<>|.!?¡¿,·;:&`\"\'\*\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-]
659662
LIKELYURL = ((www\.([^ \t\n\f\r`<>|.!?,\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+[a-zA-Z]{2,4})|(([^ \t\n\f\r`<>|.!?,:\/$\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}]+\.)+(com|net|org|edu)))(\/[^ \t\n\f\r`<>|]+[^ \t\n\f\r`<>|.!?,;:&\p{OpenPunctuation}\p{InitialPunctuation}\p{ClosePunctuation}\p{FinalPunctuation}-])?
@@ -769,26 +772,34 @@ INSENTP = [,;:\u3001\u0F0D]
769772
QUOTES = {APOS}|[`\u2018-\u201F\u0082\u0084\u0091-\u0094\u2039\u203A\u00AB\u00BB]{1,2}
770773
DBLQUOT = \"|&quot;|[`'\u0091\u0092\u2018\u2019]'
771774
/* Cap'n for captain, c'est for french */
772-
TBSPEC = -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\.D\.s|pro-|anti-|S(&|&amp;)P-500|S(&|&amp;)Ls|Cap{APOS}n|c{APOS}est
775+
TBSPEC = -(RRB|LRB|RCB|LCB|RSB|LSB)-|C\.D\.s|pro-|anti-|S(&|&amp;)P-500|S(&|&amp;)Ls
773776
SWEARING = f[-*][-c*]k(in[g']?|e[dr])?|f[-*](in[g']?|e[dr])|(bull|dip)?s[h@][-\*#]t(ty|e|box|s)?|c[-*]nts?|p[-*]ss(e[sd]|ing)?|c[-*]ck|b[-*]tch|t[-*]ts|tw[-*]ts?|cr[-*]p|d[-*]cks?|b[-*][-*s]t[-*]rds?|pr[-*]ck|d[-*]mn|bl[-*]{2,2}dy
774777
TBSPEC2 = {APOS}[0-9][0-9]
775778
BANGWORDS = (E|Yahoo|Jeopardy)\!
776779
BANGMAGAZINES = OK\!
777780

781+
/* Allows covid-19 variant names and similar things. Abbreviation-plus-number forms such as p.500 and No.17 must be filtered out first. */
782+
CAP_NUM_REST = [0-9]+(\.[0-9]+)*[A-Za-z]*
783+
/* One or two groups of capital letters, each followed by a period, then {CAP_NUM_REST}: e.g., BA.5 or BA.2.12.1 */
CAP_NUM = [A-Z]+\.([A-Z]+\.)?{CAP_NUM_REST}
784+
778785
/* Smileys (based on Chris Potts' sentiment tutorial, but much more restricted set - e.g., no "8)", "do:" or "):", too ambiguous) and simple Asian smileys */
779786
SMILEY = [<>]?[:;=][\-o\*']?[\(\)DPdpO\\{@\|\[\]]
780787
ASIANSMILEY = [\^x=~<>]\.\[\^x=~<>]|[\-\^x=~<>']_[\-\^x=~<>']|\([\-\^x=~<>'][_.]?[\-\^x=~<>']\)|\([\^x=~<>']-[\^x=~<>'`]\)|¯\\_\(\)_\/¯
781788

782-
783789
/* U+2200-U+2BFF has a lot of the various mathematical, etc. symbol ranges */
784790
/* \uFF65 is Halfwidth katakana middle dot; \u30FB is Katakana middle dot */
785791
/* Math and other symbols that stand alone: °²× ∀; \u33A1 is m^2 in one char! */
786792
/* Tibetan tsheg or tsek (U+0F0B) goes between syllables; words aren't space separated, so it may be a word or syllable marker; it indicates a possible line-break point. Treat as separate symbol. */
787793
MISCSYMBOL = [+%&~\^|\\¦\u00A7¨\u00A9\u00AC\u00AE¯\u00B0-\u00B3\u00B4-\u00BA\u00D7\u00F7\u0387\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0600-\u0603\u0606-\u060A\u060C\u0614\u061B\u061E\u066A\u066D\u0703-\u070D\u07F6\u07F7\u07F8\u0964\u0965\u0E4F\u0F0B\u1FBD\u2016\u2017\u2020-\u2025\u2030-\u2038\u203B\u203C\u2043\u203E-\u2042\u2044\u2053\u207A-\u207F\u208A-\u208E\u2100-\u214F\u2190-\u21FF\u2200-\u2BFF\u3001-\u3006\u3008-\u3020\u30FB\u33A1\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\uFF65]
788794

789795
PROG_LANGS = c[+][+]|(c|f)#
796+
797+
ONECHAR_APOS = ['\u0092\u2019´`\u0091\u2018\u201B]
798+
/* Assimilations5 leave 5 chars behind after division */
799+
ASSIMILATIONS5 = {ONECHAR_APOS}tain{ONECHAR_APOS}t|t{ONECHAR_APOS}ain{ONECHAR_APOS}t
790800
/* Assimilations3 leave 3 chars behind after division */
791801
ASSIMILATIONS3 = cannot|'twas|dunno|['’]d['’]ve
802+
/* Assimilations2 leave 2 chars behind after division */
792803
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
793804
/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
794805
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre
@@ -806,6 +817,14 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
806817
if (DEBUG) { logger.info("Used {PROG_LANGS} to recognize " + tok); }
807818
return getNext(tok, tok);
808819
}
820+
{ASSIMILATIONS5} { if (splitAssimilations) {
821+
yypushback(5);
822+
}
823+
String tok = yytext();
824+
if (DEBUG) { logger.info("Used {ASSIMILATIONS5} to recognize " + tok +
825+
"; splitAssimilations=" + splitAssimilations); }
826+
return getNext(tok, tok);
827+
}
809828
{ASSIMILATIONS3} { if (splitAssimilations) {
810829
yypushback(3);
811830
}
@@ -860,6 +879,29 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
860879
if (DEBUG) { logger.info("Used {SPPUNC} to recognize " + tok); }
861880
return getNext(tok, tok);
862881
}
882+
883+
/* Allow for two {REDAUX} like I'd've or they'd've */
884+
{WORD}/{REDAUX}{REDAUX} { final String origTxt = yytext();
885+
String tok = LexerUtils.removeSoftHyphens(origTxt);
886+
if (americanize) {
887+
tok = Americanize.americanize(tok);
888+
}
889+
if (DEBUG) { logger.info("Used {WORD} (4) to recognize " + origTxt + " as " + tok); }
890+
return getNext(tok, origTxt);
891+
}
892+
{APOWORD}/{REDAUX}{REDAUX} { String tok = yytext();
893+
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
894+
if (DEBUG) { logger.info("Used {APOWORD} (2) to recognize " + tok + " as " + norm +
895+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
896+
return getNext(norm, tok);
897+
}
898+
{WORD_NOT}/{REDAUX_NOT}{REDAUX} { final String origTxt = yytext();
899+
String tok = LexerUtils.removeSoftHyphens(origTxt);
900+
if (DEBUG) { logger.info("Used {WORD_NOT} (2) to recognize " + origTxt + " as " + tok); }
901+
return getNext(tok, origTxt);
902+
}
903+
904+
863905
{WORD}/{REDAUX} { final String origTxt = yytext();
864906
String tok = LexerUtils.removeSoftHyphens(origTxt);
865907
if (americanize) {
@@ -868,11 +910,21 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
868910
if (DEBUG) { logger.info("Used {WORD} to recognize " + origTxt + " as " + tok); }
869911
return getNext(tok, origTxt);
870912
}
871-
{SWORD}/{SREDAUX} { final String origTxt = yytext();
913+
{WORD_NOT}/{REDAUX_NOT} { final String origTxt = yytext();
872914
String tok = LexerUtils.removeSoftHyphens(origTxt);
873-
if (DEBUG) { logger.info("Used {SWORD} to recognize " + origTxt + " as " + tok); }
915+
if (DEBUG) { logger.info("Used {WORD_NOT} to recognize " + origTxt + " as " + tok); }
874916
return getNext(tok, origTxt);
875917
}
918+
{APOWORD}/{REDAUX} { String tok = yytext();
919+
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
920+
if (DEBUG) { logger.info("Used {APOWORD} (2) to recognize " + tok + " as " + norm +
921+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
922+
return getNext(norm, tok);
923+
}
924+
{APOWORD2}/{WORD} { String txt = yytext();
925+
if (DEBUG) { logger.info("Used {APOWORD2} to recognize " + txt); }
926+
return getNext(txt, txt);
927+
}
876928
{DIGIT}+/{SEP_SUFFIX} { String txt = yytext();
877929
if (DEBUG) { logger.info("Used {DIGIT}/{SEP_SUFFIX} to recognize " + txt); }
878930
return getNext(txt, txt);
@@ -897,14 +949,11 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
897949
}
898950
{APOWORD} { String tok = yytext();
899951
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
952+
norm = LexerUtils.removeSoftHyphens(norm);
900953
if (DEBUG) { logger.info("Used {APOWORD} to recognize " + tok + " as " + norm +
901-
"; probablyLeft=" + false); }
954+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
902955
return getNext(norm, tok);
903956
}
904-
{APOWORD2}/[:letter:] { String txt = yytext();
905-
if (DEBUG) { logger.info("Used {APOWORD2} to recognize " + txt); }
906-
return getNext(txt, txt);
907-
}
908957
{FULLURL} { String txt = yytext();
909958
String norm = txt;
910959
if (escapeForwardSlashAsterisk) {
@@ -934,13 +983,13 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
934983
{REDAUX}/[^\p{Latin}'’] { String tok = yytext();
935984
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
936985
if (DEBUG) { logger.info("Used {REDAUX} to recognize " + tok + " as " + norm +
937-
"; probablyLeft=" + false); }
986+
"; quoteStyle=" + quoteStyle + "; probablyLeft=" + false); }
938987
return getNext(norm, tok);
939988
}
940-
{SREDAUX}/[^\p{Latin}'’] { String tok = yytext();
989+
{REDAUX_NOT}/[^\p{Latin}'’] { String tok = yytext();
941990
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
942-
if (DEBUG) { logger.info("Used {SREDAUX} to recognize " + tok + " as " + norm +
943-
"; probablyLeft=" + false); }
991+
if (DEBUG) { logger.info("Used {REDAUX_NOT} to recognize " + tok + " as " + norm +
992+
"; quoteStyle=" + quoteStyle); }
944993
return getNext(norm, tok);
945994
}
946995
{DATE} { String origTxt = yytext();
@@ -1175,7 +1224,7 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
11751224
}
11761225
{DBLQUOT} { String tok = yytext();
11771226
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
1178-
if (DEBUG) { logger.info("Used {SREDAUX} to recognize " + tok + " as " + norm +
1227+
if (DEBUG) { logger.info("Used {DBLQUOT} to recognize " + tok + " as " + norm +
11791228
"; probablyLeft=" + false); }
11801229
return getNext(norm, tok);
11811230
}
@@ -1185,6 +1234,18 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
11851234
if (DEBUG) { logger.info("Used {SMILEY} to recognize " + origText + " as " + txt); }
11861235
return getNext(txt, origText);
11871236
}
1237+
1238+
/* This rule doesn't seem to fire to block {CAP_NUM} when it could. I have no idea why. Ignoring for now as a rare case. */
1239+
{ABBREV3}/{CAP_NUM_REST} {
1240+
String txt = yytext();
1241+
if (DEBUG) { logger.info("Used {ABBREV3} (2) to recognize " + txt); }
1242+
return getNext(txt, txt);
1243+
}
1244+
{CAP_NUM} {
1245+
String txt = yytext();
1246+
if (DEBUG) { logger.info("Used {CAP_NUM} to recognize " + txt); }
1247+
return getNext(txt, txt);
1248+
}
11881249
{ASIANSMILEY} { String txt = yytext();
11891250
String origText = txt;
11901251
txt = LexerUtils.pennNormalizeParens(txt, normalizeParentheses);
@@ -1457,7 +1518,7 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
14571518
"; probablyLeft=" + false); }
14581519
return getNext(norm, tok);
14591520
}
1460-
/* This QUOTES must proceed (S)REDAUX (2) so it by preference matches straight quote before word.
1521+
/* This QUOTES must precede REDAUX (2) so that it preferentially matches a straight quote before a word.
14611522
Trying to collapse the first two cases seemed to break things (?!?). */
14621523
{QUOTES}/[:letter:]{NOT_SPACENL_ONE_CHAR}
14631524
{ // Extra context is to not match on ones like 'd but you do want words like "a"
@@ -1485,17 +1546,12 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
14851546
"; probablyLeft=" + false); }
14861547
return getNext(norm, tok);
14871548
}
1488-
/* These (S)REDAUX (2) cases are needed in case string ends on "it's". See: testJacobEisensteinApostropheCase */
1549+
/* This REDAUX (2) case is needed in case string ends on "it's". See: testJacobEisensteinApostropheCase */
14891550
{REDAUX} { String tok = yytext();
14901551
if (DEBUG) { logger.info("Used {REDAUX} (2) to recognize " + tok); }
14911552
return getNext(tok, tok);
14921553
}
1493-
{SREDAUX} { String tok = yytext();
1494-
String norm = LexerUtils.handleQuotes(tok, false, quoteStyle);
1495-
if (DEBUG) { logger.info("Used {SREDAUX} (2) to recognize " + tok + " as " + norm +
1496-
"; probablyLeft=" + false); }
1497-
return getNext(norm, tok);
1498-
}
1554+
/* Plain {REDAUX_NOT} is captured by {APOWORD} */
14991555

15001556
{FAKEDUCKFEET} {
15011557
String tok = yytext();

0 commit comments

Comments
 (0)