@@ -8,9 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

-#![feature(plugin)]
-
-#![allow(unstable)]
+#![feature(plugin, rustc_private, str_char, collections)]

extern crate syntax;
extern crate rustc;
@@ -19,7 +17,10 @@ extern crate rustc;
extern crate log;

use std::collections::HashMap;
-use std::io::File;
+use std::env;
+use std::fs::File;
+use std::io::{BufRead, Read};
+use std::path::Path;

use syntax::parse;
use syntax::parse::lexer;
@@ -108,6 +109,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
            "LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
            "LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
            "QUESTION" => token::Question,
+            "SHEBANG" => token::Shebang(Name(0)),
            _ => continue,
        };

@@ -166,24 +168,26 @@ fn count(lit: &str) -> usize {
    lit.chars().take_while(|c| *c == '#').count()
}

-fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
+fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
+                     has_bom: bool)
                     -> TokenAndSpan {
    // old regex:
    // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
-    let start = s.find_str("[@").unwrap();
-    let comma = start + s[start..].find_str(",").unwrap();
-    let colon = comma + s[comma..].find_str(":").unwrap();
-    let content_start = colon + s[colon..].find_str("='").unwrap();
-    let content_end = content_start + s[content_start..].find_str("',<").unwrap();
-    let toknum_end = content_end + s[content_end..].find_str(">,").unwrap();
+    let start = s.find("[@").unwrap();
+    let comma = start + s[start..].find(",").unwrap();
+    let colon = comma + s[comma..].find(":").unwrap();
+    let content_start = colon + s[colon..].find("='").unwrap();
+    // Use rfind instead of find, because we don't want to stop at the content
+    let content_end = content_start + s[content_start..].rfind("',<").unwrap();
+    let toknum_end = content_end + s[content_end..].find(">,").unwrap();

    let start = &s[comma + 1 .. colon];
    let end = &s[colon + 1 .. content_start];
    let content = &s[content_start + 2 .. content_end];
    let toknum = &s[content_end + 3 .. toknum_end];

-    let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
-                                                      toknum));
+    let not_found = format!("didn't find token {:?} in the map", toknum);
+    let proto_tok = tokens.get(toknum).expect(&not_found[..]);

    let nm = parse::token::intern(content);

@@ -209,15 +213,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
        ref t => t.clone()
    };

-    let offset = if real_tok == token::Eof
-    {
+    let start_offset = if real_tok == token::Eof {
        1
    } else {
        0
    };

-    let mut lo = start.parse::<u32>().unwrap() - offset;
-    let mut hi = end.parse::<u32>().unwrap() + 1;
+    let offset = if has_bom { 1 } else { 0 };
+
+    let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
+    let mut hi = end.parse::<u32>().unwrap() + 1 - offset;

    // Adjust the span: For each surrogate pair already encountered, subtract one position.
    lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
@@ -247,8 +252,8 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {

fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
    antlr_sp.expn_id == rust_sp.expn_id &&
-        antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
-        antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
+        antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
+        antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
}

fn main() {
@@ -257,10 +262,15 @@ fn main() {
        r.next_token()
    }

-    let args = std::os::args();
+    let mut args = env::args().skip(1);
+    let filename = args.next().unwrap();
+    if filename.find("parse-fail").is_some() {
+        return;
+    }

    // Rust's lexer
-    let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
+    let mut code = String::new();
+    File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();

    let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
                                              .filter(|&(_, c)| c as usize > 0xFFFF)
@@ -269,6 +279,8 @@ fn main() {
                                              .map(|(x, n)| x + n)
                                              .collect();

+    let has_bom = code.starts_with("\u{feff}");
+
    debug!("Pairs: {:?}", surrogate_pairs_pos);

    let options = config::basic_options();
@@ -281,15 +293,18 @@ fn main() {
    let ref cm = lexer.span_diagnostic.cm;

    // ANTLR
-    let mut token_file = File::open(&Path::new(args[2]));
-    let token_map = parse_token_list(token_file.read_to_string().unwrap());
+    let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
+    let mut token_list = String::new();
+    token_file.read_to_string(&mut token_list).unwrap();
+    let token_map = parse_token_list(&token_list[..]);

-    let mut stdin = std::io::stdin();
-    let mut lock = stdin.lock();
+    let stdin = std::io::stdin();
+    let lock = stdin.lock();
    let lines = lock.lines();
-    let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
-                                                           &token_map,
-                                                           &surrogate_pairs_pos[]));
+    let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
+                                                       &token_map,
+                                                       &surrogate_pairs_pos[..],
+                                                       has_bom));

    for antlr_tok in antlr_tokens {
        let rustc_tok = next(&mut lexer);
@@ -314,7 +329,7 @@ fn main() {
                    }
                    _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
                },)*
-                ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
+                ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
            }
        )
    }
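
For reference, a minimal standalone sketch of the offset arithmetic this patch introduces. ANTLR's character positions include a leading BOM and (per the pre-existing comment) shift by one for each surrogate pair already encountered, so both must be subtracted before comparing against rustc's spans. The `adjust_span` and `is_eof` names are illustrative, not from the patch, and the `hi` surrogate-pair line mirrors the `lo` adjustment shown in the hunk context (the corresponding line falls outside it):

fn adjust_span(start: u32, end: u32, is_eof: bool, has_bom: bool,
               surrogate_pairs_pos: &[usize]) -> (u32, u32) {
    // Eof spans start one position past the last character.
    let start_offset = if is_eof { 1 } else { 0 };
    // A BOM occupies one ANTLR position but is skipped by rustc's lexer.
    let offset = if has_bom { 1 } else { 0 };

    let mut lo = start - start_offset - offset;
    let mut hi = end + 1 - offset;

    // For each surrogate pair already encountered, subtract one position.
    lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
    hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;

    (lo, hi)
}

fn main() {
    // With a BOM and one surrogate pair at position 5, the ANTLR span
    // starting at 7 and ending at 9 maps to rustc char positions (5, 8).
    assert_eq!(adjust_span(7, 9, false, true, &[5]), (5, 8));
}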