1
+ /* Defines common token types for our JFlex-based tokenizers */
2
+
3
+ /* TODO: SGML really shouldn't be handled here at all; it is a legacy feature. We nevertheless continue to tokenize
4
+ some simple standard forms of concrete SGML syntax, since doing so tends to make tokenization more robust. */
5
+ /* ---
6
+ ( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
7
+ SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
8
+ ( +[A-Za-z][A-Za-z0-9:.-]*)*
9
+ FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
10
+ SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
11
+ SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
12
+ --- */
13
+
14
+ /* <STORYID cat=w pri=u> */
15
+ /* SGML1 and SGML2 each match one SGML/XML-style tag, in one of three forms:
     (a) a declaration or processing instruction starting with "<!" or "<?" (never spans a line break),
     (b) an open tag: a name, then optional attributes (a bare name, or name=value where the value is
         single-quoted, double-quoted, or an unquoted name-like token), then an optional trailing slash,
     (c) a close tag: a slash followed by a name.
     SGML1 allows attribute value match over newline; SGML2 does not. Concretely, SGML1 accepts \r and \n
     wherever whitespace may separate parts of the tag and inside quoted attribute values, whereas SGML2
     accepts only a plain space as separator and excludes \r and \n from quoted values. */
16
+ SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ \r\n]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ \r\n]*=[ \r\n]*('[^']*'|\"[^\"]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ \r\n]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ \r\n]*\>
17
+ SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
18
/* Entity-style special characters.
   SPMDASH: one of the entities &MD; &mdash; &ndash; or one of the literal characters
            U+0096, U+0097, U+2013, U+2014, U+2015.
   SPAMP:   as written here, a single bare ampersand.
            NOTE(review): the sibling macros all match "&...;" entities, so this may originally have been
            the ampersand entity with its escaping lost somewhere - confirm against the real file.
   SPPUNC:  an entity named HT, TL, UR, LR, QC, QL, QR, odq or cdq, or a decimal numeric reference
            (ampersand, hash sign, one or more digits, semicolon).
   SPLET:   an entity made of a vowel (either case) followed by acute, grave or uml, e.g. &eacute; */
+ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
19
+ SPAMP = &
20
+ SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
21
+ SPLET = &[aeiouAEIOU](acute|grave|uml);
22
+
23
+
1
24
/* \u3000 is ideographic space; \u205F is medium math space */
2
25
/* \u2063 is an invisible separator */
3
26
/* SPACE: exactly one space-like character - an ASCII space, a tab, or one of the listed Unicode code points. The class contains no \r or \n. */
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
@@ -12,3 +35,33 @@ NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u202
12
35
13
36
/* FILENAME_EXT: the file extensions (written without the leading dot) that may end a FILENAME. */
FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
14
37
/* FILENAME: one or more letters/digits, then any number of groups consisting of a single separator
   character (hyphen, tilde, dot, exclamation mark, underscore, slash or hash sign) followed by more
   letters/digits, and finally a dot plus one of the FILENAME_EXT extensions. */
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
38
+
39
+
40
+ /* Slightly generous but generally reasonably good emoji parsing. These patterns correctly handle 100% of emoji through Unicode 14.0 (Sept 2021). */
41
+ /* EMOJI_GENDERED: a single base emoji that can be followed by a zero-width joiner (U+200D) and then a
     gender sign or similar modifier (as well as a skin tone). Mainly humans, but also certain others
     such as bears and hearts. In the EMOJI macro this class starts the ZWJ-sequence alternative.
     NOTE(review): \u{1F635} is written without the leading zero used by every other escape in this
     class; it denotes the same code point, so the difference is cosmetic only. */
42
+ EMOJI_GENDERED = [\u26F9\u2764\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F408}\u{01F415}\u{01F43B}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F62E}\u{1F635}\u{01F636}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F934}-\u{01F93E}\u{01F9B8}-\u{01F9B9}\u{01F9CD}-\u{01F9DF}\u{01FAF1}-\u{01FAF2}]
43
+ /* EMOJI_FOLLOW: a single character that may trail an emoji - either a variation selector
     (U+FE0E / U+FE0F, choosing non-emoji vs. emoji rendering) or a Fitzpatrick skin tone (U+1F3FB-U+1F3FF). */
44
+ EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
45
+ /* EMOJI_KEYCAPS: a hash sign, an asterisk or an ASCII digit 0-9, optionally followed by U+FE0F, followed by
     the keycap surrounding character U+20E3. Note that if the sequence is not separated by a space
     beforehand, it may be mistokenized. */
46
+ EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
47
+ /* Flags (changed to use \U to avoid bug in IntelliJ JFlex plugin).
48
+ * 1st disjunct: exactly two characters from U+1F1E6-U+1F1FF, i.e. two geographic characters as a flag
49
+ * 2nd disjunct: U+1F3F4 followed by one or more tag digits / tag small letters and the closing U+E007F (a tag sequence), currently used only for GB regions flags (Scotland, Wales, England)
50
+ * Note: this macro has no 3rd disjunct. The rainbow (gay), transgender and pirate flags are zero-width-joiner sequences and are matched by EMOJI_MISC, not here.
51
+ */
52
+ EMOJI_FLAG = [\U01F1E6-\U01F1FF]{2,2}|\U01F3F4[\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}]+\U0E007F
53
+ /* EMOJI_MISC: rainbow flag, transgender flag, etc. The first alternative is a base character
     (U+1F3F3, U+1F3F4 or U+1F441) with an optional variation selector, then a zero-width joiner (U+200D),
     then one of U+2620, U+26A7, U+1F308 or U+1F5E8, again with an optional variation selector.
     The second alternative is any EMOJI_KEYCAPS sequence. */
54
+ EMOJI_MISC = [\u{01F3F3}\u{01F3F4}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u2620\u26A7\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
55
+ /* EMOJI_PRESENTATION: a single character that has an emoji presentation form. This is where the general
     single-character emoji appear. The class is deliberately generous: it lists individual BMP symbols
     and small ranges, but also takes the whole blocks U+25FB-U+27BF and U+1F000-U+1FAFF wholesale. */
56
+ EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01FAFF}]
57
+ /* EMOJI_MODIFIER: a single character that may appear after a zero-width joiner (zwj) U+200D inside a
     composed emoji; the EMOJI macro uses it for each joined element of a ZWJ sequence.
     NOTE(review): the range \u{01F468}-\u{01F469} is already covered by the preceding
     \u{01F466}-\u{01F469}; the duplicate does not change what the class matches and could be dropped. */
58
+ EMOJI_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2744\u2764\u2B1B\u{01F32B}\u{01F33E}\u{01F373}\u{01F37C}\u{01F384}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F466}-\u{01F469}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4A8}\u{01F4AB}\u{01F4BB}-\u{01F4BC}\u{01F525}\u{01F527}\u{01F52C}\u{01F5E8}\u{01F680}\u{01F692}\u{01F91D}\u{01F9AF}\u{01F9B0}-\u{01F9B3}\u{01F9BA}-\u{01F9BD}\u{01F9D1}\u{01FAF2}]
59
+ /* EMOJI: one complete emoji, as one of four alternatives:
     1. a flag (EMOJI_FLAG);
     2. a single emoji character (EMOJI_PRESENTATION), optionally with a follower (EMOJI_FOLLOW);
     3. a precomposed gendered/family sequence: an EMOJI_GENDERED base with an optional follower, then
        one to three repetitions of zero-width joiner (U+200D) + EMOJI_MODIFIER + optional follower;
     4. a miscellaneous sequence (EMOJI_MISC). */
60
+ EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D{EMOJI_MODIFIER}{EMOJI_FOLLOW}?){1,3}|{EMOJI_MISC}
61
+
62
+ /* DIGIT: any character in the [:digit:] class, plus - explicitly - the N'Ko numerals U+07C0-U+07C9. */
63
+ DIGIT = [:digit:]|[\u07C0-\u07C9]
64
+
65
+ /* Phone numbers, as four alternatives:
     1. a parenthesized 2-3 digit area code, or an optional + / ++ with an optional 1-4 digit group and a
        2-4 digit group, followed by 3-4 digits, an optional separator, and 3-5 digits;
     2. dot-separated groups - kept as a separate pattern so that it is not confused with decimal numbers;
     3. hyphen-separated groups;
     4. a bare 7-digit local number such as 346-8792, for the new treebank tokenization; its 1st digit
        can't be 0 or 1 in NANP. */
66
+ /* 2022: Also allow hyphen between area code and number; allow French number like 47-42-17-11 (handled by the hyphen-separated alternative). */
67
+ PHONE = (\([0-9]{2,3}\)[- \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[- \u00A0\u2007\u2012])?[0-9]{2,4}[- \u00A0\u2007\u2012/])[0-9]{3,4}[- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{2,4}\.[0-9]{2,5}|((\+\+?)?[0-9]{1,4}-)?[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
0 commit comments