Skip to content

Commit 13bc8af

Browse files
committed
Model lexer: Fix remaining issues
1 parent e5e343a commit 13bc8af

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+102
-179
lines changed

src/grammar/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ javac *.java
1212
rustc -O verify.rs
1313
for file in ../*/**.rs; do
1414
echo $file;
15-
grun RustLexer tokens -tokens < $file | ./verify $file RustLexer.tokens || break
15+
grun RustLexer tokens -tokens < "$file" | ./verify "$file" RustLexer.tokens || break
1616
done
1717
```
1818

src/grammar/RustLexer.g4

+47-65
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
lexer grammar RustLexer;
22

3+
@lexer::members {
4+
public boolean is_at(int pos) {
5+
return _input.index() == pos;
6+
}
7+
}
8+
9+
310
tokens {
411
EQ, LT, LE, EQEQ, NE, GE, GT, ANDAND, OROR, NOT, TILDE, PLUT,
512
MINUS, STAR, SLASH, PERCENT, CARET, AND, OR, SHL, SHR, BINOP,
@@ -8,7 +15,7 @@ tokens {
815
LBRACE, RBRACE, POUND, DOLLAR, UNDERSCORE, LIT_CHAR,
916
LIT_INTEGER, LIT_FLOAT, LIT_STR, LIT_STR_RAW, LIT_BINARY,
1017
LIT_BINARY_RAW, IDENT, LIFETIME, WHITESPACE, DOC_COMMENT,
11-
COMMENT
18+
COMMENT, SHEBANG
1219
}
1320

1421
import xidstart , xidcontinue;
@@ -86,94 +93,63 @@ fragment CHAR_ESCAPE
8693
| [xX] HEXIT HEXIT
8794
| 'u' HEXIT HEXIT HEXIT HEXIT
8895
| 'U' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT
96+
| 'u{' HEXIT '}'
97+
| 'u{' HEXIT HEXIT '}'
98+
| 'u{' HEXIT HEXIT HEXIT '}'
99+
| 'u{' HEXIT HEXIT HEXIT HEXIT '}'
100+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT '}'
101+
| 'u{' HEXIT HEXIT HEXIT HEXIT HEXIT HEXIT '}'
89102
;
90103
91104
fragment SUFFIX
92105
: IDENT
93106
;
94107
108+
fragment INTEGER_SUFFIX
109+
: { _input.LA(1) != 'e' && _input.LA(1) != 'E' }? SUFFIX
110+
;
111+
95112
LIT_CHAR
96-
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
113+
: '\'' ( '\\' CHAR_ESCAPE
114+
| ~[\\'\n\t\r]
115+
| '\ud800' .. '\udbff' '\udc00' .. '\udfff'
116+
)
117+
'\'' SUFFIX?
97118
;
98119

99120
LIT_BYTE
100-
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT | [nrt\\'"0] ) | ~[\\'\n\t\r] ) '\'' SUFFIX?
121+
: 'b\'' ( '\\' ( [xX] HEXIT HEXIT
122+
| [nrt\\'"0] )
123+
| ~[\\'\n\t\r] '\udc00'..'\udfff'?
124+
)
125+
'\'' SUFFIX?
101126
;
102127

103128
LIT_INTEGER
104-
: [0-9][0-9_]* SUFFIX?
105-
| '0b' [01][01_]* SUFFIX?
106-
| '0o' [0-7][0-7_]* SUFFIX?
107-
| '0x' [0-9a-fA-F][0-9a-fA-F_]* SUFFIX?
129+
130+
: [0-9][0-9_]* INTEGER_SUFFIX?
131+
| '0b' [01_]+ INTEGER_SUFFIX?
132+
| '0o' [0-7_]+ INTEGER_SUFFIX?
133+
| '0x' [0-9a-fA-F_]+ INTEGER_SUFFIX?
108134
;
109135

110136
LIT_FLOAT
111137
: [0-9][0-9_]* ('.' {
112-
/* dot followed by another dot is a range, no float */
138+
/* dot followed by another dot is a range, not a float */
113139
_input.LA(1) != '.' &&
114-
/* dot followed by an identifier is an integer with a function call, no float */
140+
/* dot followed by an identifier is an integer with a function call, not a float */
115141
_input.LA(1) != '_' &&
116-
_input.LA(1) != 'a' &&
117-
_input.LA(1) != 'b' &&
118-
_input.LA(1) != 'c' &&
119-
_input.LA(1) != 'd' &&
120-
_input.LA(1) != 'e' &&
121-
_input.LA(1) != 'f' &&
122-
_input.LA(1) != 'g' &&
123-
_input.LA(1) != 'h' &&
124-
_input.LA(1) != 'i' &&
125-
_input.LA(1) != 'j' &&
126-
_input.LA(1) != 'k' &&
127-
_input.LA(1) != 'l' &&
128-
_input.LA(1) != 'm' &&
129-
_input.LA(1) != 'n' &&
130-
_input.LA(1) != 'o' &&
131-
_input.LA(1) != 'p' &&
132-
_input.LA(1) != 'q' &&
133-
_input.LA(1) != 'r' &&
134-
_input.LA(1) != 's' &&
135-
_input.LA(1) != 't' &&
136-
_input.LA(1) != 'u' &&
137-
_input.LA(1) != 'v' &&
138-
_input.LA(1) != 'w' &&
139-
_input.LA(1) != 'x' &&
140-
_input.LA(1) != 'y' &&
141-
_input.LA(1) != 'z' &&
142-
_input.LA(1) != 'A' &&
143-
_input.LA(1) != 'B' &&
144-
_input.LA(1) != 'C' &&
145-
_input.LA(1) != 'D' &&
146-
_input.LA(1) != 'E' &&
147-
_input.LA(1) != 'F' &&
148-
_input.LA(1) != 'G' &&
149-
_input.LA(1) != 'H' &&
150-
_input.LA(1) != 'I' &&
151-
_input.LA(1) != 'J' &&
152-
_input.LA(1) != 'K' &&
153-
_input.LA(1) != 'L' &&
154-
_input.LA(1) != 'M' &&
155-
_input.LA(1) != 'N' &&
156-
_input.LA(1) != 'O' &&
157-
_input.LA(1) != 'P' &&
158-
_input.LA(1) != 'Q' &&
159-
_input.LA(1) != 'R' &&
160-
_input.LA(1) != 'S' &&
161-
_input.LA(1) != 'T' &&
162-
_input.LA(1) != 'U' &&
163-
_input.LA(1) != 'V' &&
164-
_input.LA(1) != 'W' &&
165-
_input.LA(1) != 'X' &&
166-
_input.LA(1) != 'Y' &&
167-
_input.LA(1) != 'Z'
142+
!(_input.LA(1) >= 'a' && _input.LA(1) <= 'z') &&
143+
!(_input.LA(1) >= 'A' && _input.LA(1) <= 'Z')
168144
}? | ('.' [0-9][0-9_]*)? ([eE] [-+]? [0-9][0-9_]*)? SUFFIX?)
169145
;
170146

171147
LIT_STR
172148
: '"' ('\\\n' | '\\\r\n' | '\\' CHAR_ESCAPE | .)*? '"' SUFFIX?
173149
;
174150

175-
LIT_BINARY : 'b' LIT_STR SUFFIX?;
176-
LIT_BINARY_RAW : 'rb' LIT_STR_RAW SUFFIX?;
151+
LIT_BINARY : 'b' LIT_STR ;
152+
LIT_BINARY_RAW : 'b' LIT_STR_RAW ;
177153

178154
/* this is a bit messy */
179155

@@ -201,13 +177,19 @@ LIFETIME : '\'' IDENT ;
201177

202178
WHITESPACE : [ \r\n\t]+ ;
203179

204-
UNDOC_COMMENT : '////' ~[\r\n]* -> type(COMMENT) ;
180+
UNDOC_COMMENT : '////' ~[\n]* -> type(COMMENT) ;
205181
YESDOC_COMMENT : '///' ~[\r\n]* -> type(DOC_COMMENT) ;
206182
OUTER_DOC_COMMENT : '//!' ~[\r\n]* -> type(DOC_COMMENT) ;
207-
LINE_COMMENT : '//' ~[\r\n]* -> type(COMMENT) ;
183+
LINE_COMMENT : '//' ( ~[/\n] ~[\n]* )? -> type(COMMENT) ;
208184

209185
DOC_BLOCK_COMMENT
210186
: ('/**' ~[*] | '/*!') (DOC_BLOCK_COMMENT | .)*? '*/' -> type(DOC_COMMENT)
211187
;
212188

213189
BLOCK_COMMENT : '/*' (BLOCK_COMMENT | .)*? '*/' -> type(COMMENT) ;
190+
191+
/* these appear at the beginning of a file */
192+
193+
SHEBANG : '#!' { is_at(2) && _input.LA(1) != '[' }? ~[\r\n]* -> type(SHEBANG) ;
194+
195+
UTF8_BOM : '\ufeff' { is_at(1) }? -> skip ;

src/grammar/check.sh

+4-4
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ failed=0
1818
skipped=0
1919

2020
check() {
21-
grep --silent "// ignore-lexer-test" $1;
21+
grep --silent "// ignore-lexer-test" "$1";
2222

2323
# if it's *not* found...
2424
if [ $? -eq 1 ]; then
2525
cd $2 # This `cd` is so java will pick up RustLexer.class. I couldn't
26-
# figure out how to wrangle the CLASSPATH, just adding build/grammr didn't
27-
# seem to have anny effect.
26+
# figure out how to wrangle the CLASSPATH, just adding build/grammar
27+
# didn't seem to have any effect.
2828
if $3 RustLexer tokens -tokens < $1 | $4 $1 $5; then
2929
echo "pass: $1"
3030
passed=`expr $passed + 1`
@@ -39,7 +39,7 @@ check() {
3939
}
4040

4141
for file in $(find $1 -iname '*.rs' ! -path '*/test/compile-fail*'); do
42-
check $file $2 $3 $4 $5
42+
check "$file" $2 $3 $4 $5
4343
done
4444

4545
printf "\ntest result: "

src/grammar/verify.rs

+50-34
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11-
#![feature(plugin)]
12-
13-
#![allow(unstable)]
11+
#![feature(plugin, rustc_private, str_char, collections)]
1412

1513
extern crate syntax;
1614
extern crate rustc;
@@ -19,14 +17,18 @@ extern crate rustc;
1917
extern crate log;
2018

2119
use std::collections::HashMap;
22-
use std::io::File;
20+
use std::env;
21+
use std::fs::File;
22+
use std::io::{BufRead, Read};
23+
use std::path::Path;
2324

2425
use syntax::parse;
2526
use syntax::parse::lexer;
2627
use rustc::session::{self, config};
2728

2829
use syntax::ast;
2930
use syntax::ast::Name;
31+
use syntax::codemap;
3032
use syntax::codemap::Pos;
3133
use syntax::parse::token;
3234
use syntax::parse::lexer::TokenAndSpan;
@@ -108,6 +110,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
108110
"LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
109111
"LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
110112
"QUESTION" => token::Question,
113+
"SHEBANG" => token::Shebang(Name(0)),
111114
_ => continue,
112115
};
113116

@@ -166,24 +169,26 @@ fn count(lit: &str) -> usize {
166169
lit.chars().take_while(|c| *c == '#').count()
167170
}
168171

169-
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
172+
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
173+
has_bom: bool)
170174
-> TokenAndSpan {
171175
// old regex:
172176
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
173-
let start = s.find_str("[@").unwrap();
174-
let comma = start + s[start..].find_str(",").unwrap();
175-
let colon = comma + s[comma..].find_str(":").unwrap();
176-
let content_start = colon + s[colon..].find_str("='").unwrap();
177-
let content_end = content_start + s[content_start..].find_str("',<").unwrap();
178-
let toknum_end = content_end + s[content_end..].find_str(">,").unwrap();
177+
let start = s.find("[@").unwrap();
178+
let comma = start + s[start..].find(",").unwrap();
179+
let colon = comma + s[comma..].find(":").unwrap();
180+
let content_start = colon + s[colon..].find("='").unwrap();
181+
// Use rfind instead of find, because we don't want to stop at the content
182+
let content_end = content_start + s[content_start..].rfind("',<").unwrap();
183+
let toknum_end = content_end + s[content_end..].find(">,").unwrap();
179184

180185
let start = &s[comma + 1 .. colon];
181186
let end = &s[colon + 1 .. content_start];
182187
let content = &s[content_start + 2 .. content_end];
183188
let toknum = &s[content_end + 3 .. toknum_end];
184189

185-
let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
186-
toknum));
190+
let not_found = format!("didn't find token {:?} in the map", toknum);
191+
let proto_tok = tokens.get(toknum).expect(&not_found[..]);
187192

188193
let nm = parse::token::intern(content);
189194

@@ -209,24 +214,25 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
209214
ref t => t.clone()
210215
};
211216

212-
let offset = if real_tok == token::Eof
213-
{
217+
let start_offset = if real_tok == token::Eof {
214218
1
215219
} else {
216220
0
217221
};
218222

219-
let mut lo = start.parse::<u32>().unwrap() - offset;
220-
let mut hi = end.parse::<u32>().unwrap() + 1;
223+
let offset = if has_bom { 1 } else { 0 };
224+
225+
let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
226+
let mut hi = end.parse::<u32>().unwrap() + 1 - offset;
221227

222228
// Adjust the span: For each surrogate pair already encountered, subtract one position.
223229
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
224230
hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
225231

226-
let sp = syntax::codemap::Span {
227-
lo: syntax::codemap::BytePos(lo),
228-
hi: syntax::codemap::BytePos(hi),
229-
expn_id: syntax::codemap::NO_EXPANSION
232+
let sp = codemap::Span {
233+
lo: codemap::BytePos(lo),
234+
hi: codemap::BytePos(hi),
235+
expn_id: codemap::NO_EXPANSION
230236
};
231237

232238
TokenAndSpan {
@@ -245,10 +251,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
245251
}
246252
}
247253

248-
fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
254+
fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool {
249255
antlr_sp.expn_id == rust_sp.expn_id &&
250-
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
251-
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
256+
antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
257+
antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
252258
}
253259

254260
fn main() {
@@ -257,10 +263,15 @@ fn main() {
257263
r.next_token()
258264
}
259265

260-
let args = std::os::args();
266+
let mut args = env::args().skip(1);
267+
let filename = args.next().unwrap();
268+
if filename.find("parse-fail").is_some() {
269+
return;
270+
}
261271

262272
// Rust's lexer
263-
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
273+
let mut code = String::new();
274+
File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();
264275

265276
let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
266277
.filter(|&(_, c)| c as usize > 0xFFFF)
@@ -269,6 +280,8 @@ fn main() {
269280
.map(|(x, n)| x + n)
270281
.collect();
271282

283+
let has_bom = code.starts_with("\u{feff}");
284+
272285
debug!("Pairs: {:?}", surrogate_pairs_pos);
273286

274287
let options = config::basic_options();
@@ -281,15 +294,18 @@ fn main() {
281294
let ref cm = lexer.span_diagnostic.cm;
282295

283296
// ANTLR
284-
let mut token_file = File::open(&Path::new(args[2]));
285-
let token_map = parse_token_list(token_file.read_to_string().unwrap());
297+
let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
298+
let mut token_list = String::new();
299+
token_file.read_to_string(&mut token_list).unwrap();
300+
let token_map = parse_token_list(&token_list[..]);
286301

287-
let mut stdin = std::io::stdin();
288-
let mut lock = stdin.lock();
302+
let stdin = std::io::stdin();
303+
let lock = stdin.lock();
289304
let lines = lock.lines();
290-
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
291-
&token_map,
292-
&surrogate_pairs_pos[]));
305+
let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
306+
&token_map,
307+
&surrogate_pairs_pos[..],
308+
has_bom));
293309

294310
for antlr_tok in antlr_tokens {
295311
let rustc_tok = next(&mut lexer);
@@ -314,7 +330,7 @@ fn main() {
314330
}
315331
_ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
316332
},)*
317-
ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
333+
ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
318334
}
319335
)
320336
}

src/libcollections/fmt.rs

-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
10-
//
11-
// ignore-lexer-test FIXME #15679
1210

1311
//! Utilities for formatting and printing strings
1412
//!

0 commit comments

Comments
 (0)