Skip to content

Commit 070c8e1

Browse files
committed
Model lexer: Fix remaining issues
1 parent e5e343a commit 070c8e1

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+96
-174
lines changed

src/grammar/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ javac *.java
1212
rustc -O verify.rs
1313
for file in ../*/**.rs; do
1414
echo $file;
15-
grun RustLexer tokens -tokens < $file | ./verify $file RustLexer.tokens || break
15+
grun RustLexer tokens -tokens < "$file" | ./verify "$file" RustLexer.tokens || break
1616
done
1717
```
1818

src/grammar/RustLexer.g4

+47-65
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
lexer grammar RustLexer;
22

3+
@lexer::members {
4+
public boolean is_at(int pos) {
5+
return _input.index() == pos;
6+
}
7+
}
8+
9+
310
tokens {
411
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
512
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,7 +15,7 @@ tokens {
815
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
916
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
1017
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
11-
COMMENT
18+
COMMENT, SHEBANG
1219
}
1320

1421
import xidstart , xidcontinue;
@@ -86,94 +93,63 @@ fragment CHAR_ESCAPE
8693
| [xX] HEXIT HEXIT
8794
| 'u' HEXIT HEXIT HEXIT HEXIT
8895
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
96+
| 'u{' HEXIT '}'
97+
| 'u{' HEXIT HEXIT '}'
98+
| 'u{' HEXIT HEXIT HEXIT '}'
99+
| 'u{' HEXIT HEXIT HEXIT HEXIT '}'
100+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT '}'
101+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT '}'
89102
;
90103
91104
fragment SUFFIX
92105
: IDENT
93106
;
94107
108+
fragment INTEGER_SUFFIX
109+
: { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
110+
;
111+
95112
LIT_CHAR
96-
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
113+
: '\'' ( '\\' CHAR_ESCAPE
114+
| ~[\\'\n\t\r]
115+
| '\ud800' .. '\udbff' '\udc00' .. '\udfff'
116+
)
117+
'\'' SUFFIX?
97118
;
98119

99120
LIT_BYTE
100-
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
121+
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT
122+
| [nrt\\'"0] )
123+
| ~[\\'\n\t\r] '\udc00'..'\udfff'?
124+
)
125+
'\'' SUFFIX?
101126
;
102127

103128
LIT_INTEGER
104-
: [0-9][0-9_]* SUFFIX?
105-
| '0b' [01][01_]* SUFFIX?
106-
| '0o' [0-7][0-7_]* SUFFIX?
107-
| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?
129+
130+
: [0-9][0-9_]* INTEGER_SUFFIX?
131+
| '0b' [01_]+ INTEGER_SUFFIX?
132+
| '0o' [0-7_]+ INTEGER_SUFFIX?
133+
| '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
108134
;
109135

110136
LIT_FLOAT
111137
: [0-9][0-9_]* ('.' {
112-
/* dot followed by another dot is a range, no float */
138+
/* dot followed by another dot is a range, not a float */
113139
_input.LA(1) != '.' &&
114-
/* dot followed by an identifier is an integer with a function call, no float */
140+
/* dot followed by an identifier is an integer with a function call, not a float */
115141
_input.LA(1) != '_' &&
116-
_input.LA(1) != 'a' &&
117-
_input.LA(1) != 'b' &&
118-
_input.LA(1) != 'c' &&
119-
_input.LA(1) != 'd' &&
120-
_input.LA(1) != 'e' &&
121-
_input.LA(1) != 'f' &&
122-
_input.LA(1) != 'g' &&
123-
_input.LA(1) != 'h' &&
124-
_input.LA(1) != 'i' &&
125-
_input.LA(1) != 'j' &&
126-
_input.LA(1) != 'k' &&
127-
_input.LA(1) != 'l' &&
128-
_input.LA(1) != 'm' &&
129-
_input.LA(1) != 'n' &&
130-
_input.LA(1) != 'o' &&
131-
_input.LA(1) != 'p' &&
132-
_input.LA(1) != 'q' &&
133-
_input.LA(1) != 'r' &&
134-
_input.LA(1) != 's' &&
135-
_input.LA(1) != 't' &&
136-
_input.LA(1) != 'u' &&
137-
_input.LA(1) != 'v' &&
138-
_input.LA(1) != 'w' &&
139-
_input.LA(1) != 'x' &&
140-
_input.LA(1) != 'y' &&
141-
_input.LA(1) != 'z' &&
142-
_input.LA(1) != 'A' &&
143-
_input.LA(1) != 'B' &&
144-
_input.LA(1) != 'C' &&
145-
_input.LA(1) != 'D' &&
146-
_input.LA(1) != 'E' &&
147-
_input.LA(1) != 'F' &&
148-
_input.LA(1) != 'G' &&
149-
_input.LA(1) != 'H' &&
150-
_input.LA(1) != 'I' &&
151-
_input.LA(1) != 'J' &&
152-
_input.LA(1) != 'K' &&
153-
_input.LA(1) != 'L' &&
154-
_input.LA(1) != 'M' &&
155-
_input.LA(1) != 'N' &&
156-
_input.LA(1) != 'O' &&
157-
_input.LA(1) != 'P' &&
158-
_input.LA(1) != 'Q' &&
159-
_input.LA(1) != 'R' &&
160-
_input.LA(1) != 'S' &&
161-
_input.LA(1) != 'T' &&
162-
_input.LA(1) != 'U' &&
163-
_input.LA(1) != 'V' &&
164-
_input.LA(1) != 'W' &&
165-
_input.LA(1) != 'X' &&
166-
_input.LA(1) != 'Y' &&
167-
_input.LA(1) != 'Z'
142+
!(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
143+
!(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
168144
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
169145
;
170146

171147
LIT_STR
172148
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
173149
;
174150

175-
LIT_BINARY : 'b' LIT_STR SUFFIX?;
176-
LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
151+
LIT_BINARY : 'b' LIT_STR ;
152+
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
177153

178154
/* this is a bit messy */
179155

@@ -201,13 +177,19 @@ LIFETIME : '\'' IDENT ;
201177

202178
WHITESPACE : [ \r\n\t]+ ;
203179

204-
UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
180+
UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
205181
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
206182
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
207-
LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
183+
LINE_COMMENT : '//' ( ~[/\n] ~[\n]* )? -> type(COMMENT) ;
208184

209185
DOC_BLOCK_COMMENT
210186
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
211187
;
212188

213189
BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;
190+
191+
/* these appear at the beginning of a file */
192+
193+
SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;
194+
195+
UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;

src/grammar/check.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ failed=0
1818
skipped=0
1919

2020
check() {
21-
grep --silent "// ignore-lexer-test" $1;
21+
grep --silent "// ignore-lexer-test" "$1";
2222

2323
# if it's *not* found...
2424
if [ $? -eq 1 ]; then
2525
cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
26-
# figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
27-
# seem to have anny effect.
26+
# figure out how to wrangle the CLASSPATH, just adding build/grammar
27+
# didn't seem to have any effect.
2828
if $3 RustLexer tokens -tokens < $1 | $4 $1 $5; then
2929
echo "pass: $1"
3030
passed=`expr $passed + 1`
@@ -39,7 +39,7 @@ check() {
3939
}
4040

4141
for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
42-
check $file $2 $3 $4 $5
42+
check "$file" $2 $3 $4 $5
4343
done
4444

4545
printf "\ntest result: "

src/grammar/verify.rs

+44-29
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
#![feature(plugin)]
12-
13-
#![allow(unstable)]
11+
#![feature(plugin, rustc_private, str_char, collections)]
1412

1513
extern crate syntax;
1614
extern crate rustc;
@@ -19,7 +17,10 @@ extern crate rustc;
1917
extern crate log;
2018

2119
use std::collections::HashMap;
22-
use std::io::File;
20+
use std::env;
21+
use std::fs::File;
22+
use std::io::{BufRead, Read};
23+
use std::path::Path;
2324

2425
use syntax::parse;
2526
use syntax::parse::lexer;
@@ -108,6 +109,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
108109
"LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
109110
"LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
110111
"QUESTION" => token::Question,
112+
"SHEBANG" => token::Shebang(Name(0)),
111113
_ => continue,
112114
};
113115

@@ -166,24 +168,26 @@ fn count(lit: &str) -> usize {
166168
lit.chars().take_while(|c| *c == '#').count()
167169
}
168170

169-
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
171+
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
172+
has_bom: bool)
170173
-> TokenAndSpan {
171174
// old regex:
172175
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
173-
let start = s.find_str("[@").unwrap();
174-
let comma = start + s[start..].find_str(",").unwrap();
175-
let colon = comma + s[comma..].find_str(":").unwrap();
176-
let content_start = colon + s[colon..].find_str("='").unwrap();
177-
let content_end = content_start + s[content_start..].find_str("',<").unwrap();
178-
let toknum_end = content_end + s[content_end..].find_str(">,").unwrap();
176+
let start = s.find("[@").unwrap();
177+
let comma = start + s[start..].find(",").unwrap();
178+
let colon = comma + s[comma..].find(":").unwrap();
179+
let content_start = colon + s[colon..].find("='").unwrap();
180+
// Use rfind instead of find, because we don't want to stop at the content
181+
let content_end = content_start + s[content_start..].rfind("',<").unwrap();
182+
let toknum_end = content_end + s[content_end..].find(">,").unwrap();
179183

180184
let start = &s[comma + 1 .. colon];
181185
let end = &s[colon + 1 .. content_start];
182186
let content = &s[content_start + 2 .. content_end];
183187
let toknum = &s[content_end + 3 .. toknum_end];
184188

185-
let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
186-
toknum));
189+
let not_found = format!("didn't find token {:?} in the map", toknum);
190+
let proto_tok = tokens.get(toknum).expect(&not_found[..]);
187191

188192
let nm = parse::token::intern(content);
189193

@@ -209,15 +213,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
209213
ref t => t.clone()
210214
};
211215

212-
let offset = if real_tok == token::Eof
213-
{
216+
let start_offset = if real_tok == token::Eof {
214217
1
215218
} else {
216219
0
217220
};
218221

219-
let mut lo = start.parse::<u32>().unwrap() - offset;
220-
let mut hi = end.parse::<u32>().unwrap() + 1;
222+
let offset = if has_bom { 1 } else { 0 };
223+
224+
let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
225+
let mut hi = end.parse::<u32>().unwrap() + 1 - offset;
221226

222227
// Adjust the span: For each surrogate pair already encountered, subtract one position.
223228
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
@@ -247,8 +252,8 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
247252

248253
fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
249254
antlr_sp.expn_id == rust_sp.expn_id &&
250-
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
251-
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
255+
antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
256+
antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
252257
}
253258

254259
fn main() {
@@ -257,10 +262,15 @@ fn main() {
257262
r.next_token()
258263
}
259264

260-
let args = std::os::args();
265+
let mut args = env::args().skip(1);
266+
let filename = args.next().unwrap();
267+
if filename.find("parse-fail").is_some() {
268+
return;
269+
}
261270

262271
// Rust's lexer
263-
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
272+
let mut code = String::new();
273+
File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();
264274

265275
let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
266276
.filter(|&(_, c)| c as usize > 0xFFFF)
@@ -269,6 +279,8 @@ fn main() {
269279
.map(|(x, n)| x + n)
270280
.collect();
271281

282+
let has_bom = code.starts_with("\u{feff}");
283+
272284
debug!("Pairs: {:?}", surrogate_pairs_pos);
273285

274286
let options = config::basic_options();
@@ -281,15 +293,18 @@ fn main() {
281293
let ref cm = lexer.span_diagnostic.cm;
282294

283295
// ANTLR
284-
let mut token_file = File::open(&Path::new(args[2]));
285-
let token_map = parse_token_list(token_file.read_to_string().unwrap());
296+
let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
297+
let mut token_list = String::new();
298+
token_file.read_to_string(&mut token_list).unwrap();
299+
let token_map = parse_token_list(&token_list[..]);
286300

287-
let mut stdin = std::io::stdin();
288-
let mut lock = stdin.lock();
301+
let stdin = std::io::stdin();
302+
let lock = stdin.lock();
289303
let lines = lock.lines();
290-
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
291-
&token_map,
292-
&surrogate_pairs_pos[]));
304+
let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
305+
&token_map,
306+
&surrogate_pairs_pos[..],
307+
has_bom));
293308

294309
for antlr_tok in antlr_tokens {
295310
let rustc_tok = next(&mut lexer);
@@ -314,7 +329,7 @@ fn main() {
314329
}
315330
_ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
316331
},)*
317-
ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
332+
ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
318333
}
319334
)
320335
}

src/libcollections/fmt.rs

-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
//
11-
// ignore-lexer-test FIXME #15679
1210

1311
//! Utilities for formatting and printing strings
1412
//!

src/libcollections/str.rs

-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
//
11-
// ignore-lexer-test FIXME #15679
1210

1311
//! Unicode string manipulation (the `str` type).
1412
//!

src/libcollections/string.rs

-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
//
11-
// ignore-lexer-test FIXME #15679
1210

1311
//! An owned, growable string that enforces that its contents are valid UTF-8.
1412

0 commit comments

Comments
 (0)