Commit eea4909

auto merge of #13700 : BurntSushi/rust/regexp, r=alexcrichton
Implements [RFC 7](https://github.com/rust-lang/rfcs/blob/master/active/0007-regexps.md) and will hopefully resolve #3591. The crate is marked as experimental. It includes a syntax extension for compiling regexps to native Rust code.

Embeds and passes the `basic`, `nullsubexpr` and `repetition` tests from [Glenn Fowler's testregex test suite](http://www2.research.att.com/~astopen/testregex/testregex.html) (slightly modified by Russ Cox for leftmost-first semantics). I've also hand-written a plethora of other tests that exercise Unicode support, the parser, the public API, etc. Also includes a `regex-dna` benchmark for the shootout.

I know the addition looks huge at first, but consider these things:

1. More than half of the lines are dedicated to Unicode character classes.
2. Of the ~4,500 lines remaining, 1,225 of them are comments.
3. Another ~800 are tests.
4. That leaves 2,500 lines for the meat. The parser is ~850 of them. The public API, compiler, dynamic VM and code generator (for `regexp!`) make up the rest.
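For orientation, here is a minimal usage sketch of the dynamic (runtime-compiled) side of the public API, assuming `Regex::new` and `is_match` as the core entry points; the pattern and inputs below are invented for illustration. The `regexp!` syntax extension mentioned above performs the same compilation at build time via the new `regex_macros` host crate added in `mk/crates.mk` below.

```rust
// Minimal sketch (assumptions: `Regex::new` and `is_match` as the core entry
// points; the date pattern and inputs are invented for illustration).
extern crate regex;

use regex::Regex;

fn main() {
    // Compile the pattern at runtime; an invalid pattern yields an Err.
    let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();

    assert!(re.is_match("2014-04-21"));
    assert!(!re.is_match("not a date"));
}
```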
2 parents (2bb2341 + 7269bc7) · commit eea4909

24 files changed: +11108 −6 lines changed

mk/crates.mk

+4 −2

@@ -51,8 +51,8 @@

 TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
                  uuid serialize sync getopts collections num test time rand \
-                 workcache url log
-HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
+                 workcache url log regex
+HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
 CRATES := $(TARGET_CRATES) $(HOST_CRATES)
 TOOLS := compiletest rustdoc rustc

@@ -84,6 +84,8 @@ DEPS_rand := std
 DEPS_url := std collections
 DEPS_workcache := std serialize collections log
 DEPS_log := std sync
+DEPS_regex := std collections
+DEPS_regex_macros = syntax std regex

 TOOL_DEPS_compiletest := test green rustuv getopts
 TOOL_DEPS_rustdoc := rustdoc native

mk/main.mk

+1 −4

@@ -311,8 +311,6 @@ HSREQ$(1)_H_$(3) = $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3))
 else
 HSREQ$(1)_H_$(3) = \
     $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) \
-    $$(HLIB$(1)_H_$(3))/stamp.rustc \
-    $$(foreach dep,$$(RUST_DEPS_rustc),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) \
     $$(MKFILE_DEPS)
 endif

@@ -334,8 +332,7 @@ SREQ$(1)_T_$(2)_H_$(3) = \
 CSREQ$(1)_T_$(2)_H_$(3) = \
     $$(TSREQ$(1)_T_$(2)_H_$(3)) \
     $$(HBIN$(1)_H_$(3))/rustdoc$$(X_$(3)) \
-    $$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) \
-    $$(foreach dep,$$(HOST_CRATES),$$(HLIB$(1)_H_$(3))/stamp.$$(dep))
+    $$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep))

 ifeq ($(1),0)
 # Don't run the stage0 compiler under valgrind - that ship has sailed

src/README.md

+1

@@ -19,6 +19,7 @@ Source layout:
 | `libfourcc/` | Data format identifier library |
 | `libgetopts/` | Get command-line-options library |
 | `libglob/` | Unix glob patterns library |
+| `libregex/` | Regular expressions |
 | `libsemver/` | Rust's semantic versioning library |
 | `libserialize/` | Encode-Decode types library |
 | `libsync/` | Concurrency mechanisms and primitives |

src/doc/index.md

+1

@@ -41,6 +41,7 @@ li {list-style-type: none; }
 * [The `native` 1:1 threading runtime](native/index.html)
 * [The `num` arbitrary precision numerics library](num/index.html)
 * [The `rand` library for random numbers and distributions](rand/index.html)
+* [The `regex` library for regular expressions](regex/index.html)
 * [The `rustc` compiler](rustc/index.html)
 * [The `rustuv` M:N I/O library](rustuv/index.html)
 * [The `semver` version collation library](semver/index.html)

src/etc/regex-match-tests.py

+109 (new file)

@@ -0,0 +1,109 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
import datetime
import os.path as path


def print_tests(tests):
    print('\n'.join([test_tostr(t) for t in tests]))


def read_tests(f):
    basename, _ = path.splitext(path.basename(f))
    tests = []
    for lineno, line in enumerate(open(f), 1):
        fields = filter(None, map(str.strip, line.split('\t')))
        if not (4 <= len(fields) <= 5) \
                or 'E' not in fields[0] or fields[0][0] == '#':
            continue

        opts, pat, text, sgroups = fields[0:4]
        groups = []  # groups as integer ranges
        if sgroups == 'NOMATCH':
            groups = [None]
        elif ',' in sgroups:
            noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
            for g in noparen:
                s, e = map(str.strip, g.split(','))
                if s == '?' and e == '?':
                    groups.append(None)
                else:
                    groups.append((int(s), int(e)))
        else:
            # This skips tests that should result in an error.
            # There aren't many, so I think we can just capture those
            # manually. Possibly fix this in future.
            continue

        if pat == 'SAME':
            pat = tests[-1][1]
        if '$' in opts:
            pat = pat.decode('string_escape')
            text = text.decode('string_escape')
        if 'i' in opts:
            pat = '(?i)%s' % pat

        name = '%s_%d' % (basename, lineno)
        tests.append((name, pat, text, groups))
    return tests


def test_tostr(t):
    lineno, pat, text, groups = t
    options = map(group_tostr, groups)
    return 'mat!(match_%s, r"%s", r"%s", %s)' \
           % (lineno, pat, '' if text == "NULL" else text, ', '.join(options))


def group_tostr(g):
    if g is None:
        return 'None'
    else:
        return 'Some((%d, %d))' % (g[0], g[1])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    aa('files', nargs='+',
       help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
    args = parser.parse_args()

    tests = []
    for f in args.files:
        tests += read_tests(f)

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-tidy-linelength

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
// on {date}.
'''
    print(tpl.format(date=str(datetime.datetime.now())))

    for f in args.files:
        print('// Tests from %s' % path.basename(f))
        print_tests(read_tests(f))
        print('')
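Each emitted line instantiates a `mat!` test macro that is assumed to live in libregex's test harness (it is not part of this diff). As a purely illustrative sample of the format produced by `test_tostr` above, an entry for a hypothetical `basic.dat` line would look like:

```rust
// Hypothetical sample of one generated test line (the data is invented, and
// `mat!` is assumed to be a macro defined by libregex's test harness):
mat!(match_basic_3, r"a+", r"aaa", Some((0, 3)))
```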

src/etc/regex-unicode-tables.py

+183 (new file)

@@ -0,0 +1,183 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2

BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}


def as_4byte_uni(n):
    s = hex(n)[2:]
    return '\\U%s%s' % ('0' * (8 - len(s)), s)


def expand_cat(c):
    return expanded_categories.get(c, []) + [c]


def is_valid_unicode(n):
    return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF


def read_cats(f):
    assigned = defaultdict(list)
    for row in csv.reader(f, delimiter=';'):
        (hex, cats) = (int(row[0], 16), expand_cat(row[2]))
        if not is_valid_unicode(hex):
            continue
        for cat in cats:
            assigned[cat].append(hex)
    return assigned


def read_scripts(f):
    assigned = defaultdict(list)
    for line in f:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        hexes, name = map(str.strip, line.split(';'))[:2]
        name = name[:name.index('#')].strip()
        if '..' not in hexes:
            hex = int(hexes, 16)
            if is_valid_unicode(hex):
                assigned[name].append(hex)
        else:
            hex1, hex2 = map(lambda s: int(s, 16), hexes.split('..'))
            for hex in xrange(hex1, hex2 + 1):
                if is_valid_unicode(hex):
                    assigned[name].append(hex)
    return assigned


def group(letters):
    letters = sorted(set(letters))
    grouped = []
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            'cur_end: %s, letter: %s' % (hex(cur_end), hex(letter))

        if letter == cur_end + 1:
            cur_end = letter
        else:
            grouped.append((cur_start, cur_end))
            cur_start, cur_end = letter, letter
    grouped.append((cur_start, cur_end))
    return grouped


def ranges_to_rust(rs):
    rs = ("('%s', '%s')" % (as_4byte_uni(s), as_4byte_uni(e)) for s, e in rs)
    return ',\n '.join(rs)


def groups_to_rust(groups):
    rust_groups = []
    for group_name in sorted(groups):
        rust_groups.append('("%s", &[\n %s\n ]),'
                           % (group_name, ranges_to_rust(groups[group_name])))
    return '\n'.join(rust_groups)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    if args.local:
        cats = read_cats(open(DATA))
        scripts = read_scripts(open(SCRIPTS))
    else:
        cats = read_cats(urllib2.urlopen(args.base_url + '/' + DATA))
        scripts = read_scripts(urllib2.urlopen(args.base_url + '/' + SCRIPTS))

    # Get Rust code for all Unicode general categories and scripts.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    perld = range(ord('0'), ord('9') + 1)
    dgroups = ranges_to_rust(group(perld + cats['Nd'][:]))

    perls = map(ord, ['\t', '\n', '\x0C', '\r', ' '])
    sgroups = ranges_to_rust(group(perls + cats['Z'][:]))

    low, up = (range(ord('a'), ord('z') + 1), range(ord('A'), ord('Z') + 1))
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L'][:]))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regexp-unicode-tables'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
{dgroups}
];

pub static PERLS: Class = &[
{sgroups}
];

pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
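To make the shape of the generated tables concrete, here is a hedged sketch of the output for the `\d` class. `Class` is written as a stand-in alias for the crate's `parse::Class` (which this diff does not show), and the escape uses the modern `\u{..}` form rather than the `\UXXXXXXXX` strings the script actually prints:

```rust
// Sketch only: `Class` approximates the assumed shape of `parse::Class`; real
// generated files use `\U00000030`-style escapes and contain many more ranges.
pub type Class = &'static [(char, char)];

pub static PERLD: Class = &[
    ('\u{30}', '\u{39}'), // ASCII '0'..'9'; the real table continues with Unicode Nd ranges
];
```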
