// option. This file may not be copied, modified, or distributed
// except according to those terms.

-#![feature(plugin)]
-
-#![allow(unstable)]
+#![feature(plugin, rustc_private, str_char, collections)]

extern crate syntax;
extern crate rustc;
@@ -19,14 +17,18 @@ extern crate rustc;
extern crate log;

use std::collections::HashMap;
-use std::io::File;
+use std::env;
+use std::fs::File;
+use std::io::{BufRead, Read};
+use std::path::Path;

use syntax::parse;
use syntax::parse::lexer;
use rustc::session::{self, config};

use syntax::ast;
use syntax::ast::Name;
+use syntax::codemap;
use syntax::codemap::Pos;
use syntax::parse::token;
use syntax::parse::lexer::TokenAndSpan;
@@ -108,6 +110,7 @@ fn parse_token_list(file: &str) -> HashMap<String, token::Token> {
            "LIT_BINARY" => token::Literal(token::Binary(Name(0)), None),
            "LIT_BINARY_RAW" => token::Literal(token::BinaryRaw(Name(0), 0), None),
            "QUESTION" => token::Question,
+            "SHEBANG" => token::Shebang(Name(0)),
            _ => continue,
        };
@@ -166,24 +169,26 @@ fn count(lit: &str) -> usize {
    lit.chars().take_while(|c| *c == '#').count()
}

-fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
+fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize],
+                     has_bom: bool)
                     -> TokenAndSpan {
    // old regex:
    // \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
-    let start = s.find_str("[@").unwrap();
-    let comma = start + s[start..].find_str(",").unwrap();
-    let colon = comma + s[comma..].find_str(":").unwrap();
-    let content_start = colon + s[colon..].find_str("='").unwrap();
-    let content_end = content_start + s[content_start..].find_str("',<").unwrap();
-    let toknum_end = content_end + s[content_end..].find_str(">,").unwrap();
+    let start = s.find("[@").unwrap();
+    let comma = start + s[start..].find(",").unwrap();
+    let colon = comma + s[comma..].find(":").unwrap();
+    let content_start = colon + s[colon..].find("='").unwrap();
+    // Use rfind instead of find, because we don't want to stop at the content
+    let content_end = content_start + s[content_start..].rfind("',<").unwrap();
+    let toknum_end = content_end + s[content_end..].find(">,").unwrap();

    let start = &s[comma + 1 .. colon];
    let end = &s[colon + 1 .. content_start];
    let content = &s[content_start + 2 .. content_end];
    let toknum = &s[content_end + 3 .. toknum_end];

-    let proto_tok = tokens.get(toknum).expect(format!("didn't find token {:?} in the map",
-                                                      toknum));
+    let not_found = format!("didn't find token {:?} in the map", toknum);
+    let proto_tok = tokens.get(toknum).expect(&not_found[..]);

    let nm = parse::token::intern(content);
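The `find` → `rfind` switch above is the behavioral fix in this hunk: if the token's quoted content itself contains the `',<` delimiter, a forward search stops inside the content, while a reverse search finds the real `',<toknum>,` suffix at the end of the record. A standalone sketch with a made-up ANTLR record (the payload is hypothetical):

    // The quoted content of this record contains "',<", so a forward
    // search for the delimiter stops too early; rfind gets the real one.
    fn main() {
        let s = "[@7,30:34='',<'',<70>,3:10]";
        let content_start = s.find("='").unwrap();
        assert_eq!(s[content_start..].find("',<").unwrap(), 2);  // inside the content
        assert_eq!(s[content_start..].rfind("',<").unwrap(), 6); // the actual delimiter
    }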
@@ -209,24 +214,25 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_
        ref t => t.clone()
    };

-    let offset = if real_tok == token::Eof
-    {
+    let start_offset = if real_tok == token::Eof {
        1
    } else {
        0
    };

-    let mut lo = start.parse::<u32>().unwrap() - offset;
-    let mut hi = end.parse::<u32>().unwrap() + 1;
+    let offset = if has_bom { 1 } else { 0 };
+
+    let mut lo = start.parse::<u32>().unwrap() - start_offset - offset;
+    let mut hi = end.parse::<u32>().unwrap() + 1 - offset;

    // Adjust the span: For each surrogate pair already encountered, subtract one position.
    lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
    hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;

-    let sp = syntax::codemap::Span {
-        lo: syntax::codemap::BytePos(lo),
-        hi: syntax::codemap::BytePos(hi),
-        expn_id: syntax::codemap::NO_EXPANSION
+    let sp = codemap::Span {
+        lo: codemap::BytePos(lo),
+        hi: codemap::BytePos(hi),
+        expn_id: codemap::NO_EXPANSION
    };

    TokenAndSpan {
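The surrogate-pair adjustment above compensates for ANTLR running on the JVM: Java reports positions in UTF-16 code units, so every character above U+FFFF occupies two slots, while the Rust side counts characters. `binary_search`'s `Err` index is exactly the number of pairs seen before a given position. A minimal self-contained sketch of the idea (not the tool itself):

    fn main() {
        let code = "a\u{1F600}b"; // the emoji needs a surrogate pair in UTF-16
        let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
            .filter(|&(_, c)| c as usize > 0xFFFF)
            .enumerate() // earlier pairs have already shifted later positions
            .map(|(x, (n, _))| x + n)
            .collect();
        let lo = 3; // 'b' as ANTLR sees it (UTF-16 position)
        let adjusted = lo - surrogate_pairs_pos.binary_search(&lo).unwrap_or_else(|x| x);
        assert_eq!(adjusted, 2); // 'b' as the Rust lexer sees it (char position)
    }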
@@ -245,10 +251,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
    }
}

-fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
+fn span_cmp(antlr_sp: codemap::Span, rust_sp: codemap::Span, cm: &codemap::CodeMap) -> bool {
    antlr_sp.expn_id == rust_sp.expn_id &&
-    antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
-    antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
+    antlr_sp.lo.to_usize() == cm.bytepos_to_file_charpos(rust_sp.lo).to_usize() &&
+    antlr_sp.hi.to_usize() == cm.bytepos_to_file_charpos(rust_sp.hi).to_usize()
}

fn main() {
@@ -257,10 +263,15 @@ fn main() {
        r.next_token()
    }

-    let args = std::os::args();
+    let mut args = env::args().skip(1);
+    let filename = args.next().unwrap();
+    if filename.find("parse-fail").is_some() {
+        return;
+    }

    // Rust's lexer
-    let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
+    let mut code = String::new();
+    File::open(&Path::new(&filename)).unwrap().read_to_string(&mut code).unwrap();

    let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
        .filter(|&(_, c)| c as usize > 0xFFFF)
@@ -269,6 +280,8 @@ fn main() {
        .map(|(x, n)| x + n)
        .collect();

+    let has_bom = code.starts_with("\u{feff}");
+
    debug!("Pairs: {:?}", surrogate_pairs_pos);

    let options = config::basic_options();
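`has_bom` feeds the new `offset` in `parse_antlr_token`: when the file starts with U+FEFF, the ANTLR lexer apparently counts the BOM as position 0 while rustc's lexer skips it, so every ANTLR position comes out one too high. A minimal sketch of that assumed off-by-one (illustrative only, not the grammar tooling):

    fn main() {
        let code = "\u{feff}fn";
        let has_bom = code.starts_with('\u{feff}');
        let antlr_pos = 1; // where `fn` starts if the BOM counts as a char
        let offset = if has_bom { 1 } else { 0 };
        assert_eq!(antlr_pos - offset, 0); // position once the BOM is dropped
    }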
@@ -281,15 +294,18 @@ fn main() {
    let ref cm = lexer.span_diagnostic.cm;

    // ANTLR
-    let mut token_file = File::open(&Path::new(args[2]));
-    let token_map = parse_token_list(token_file.read_to_string().unwrap());
+    let mut token_file = File::open(&Path::new(&args.next().unwrap())).unwrap();
+    let mut token_list = String::new();
+    token_file.read_to_string(&mut token_list).unwrap();
+    let token_map = parse_token_list(&token_list[..]);

-    let mut stdin = std::io::stdin();
-    let mut lock = stdin.lock();
+    let stdin = std::io::stdin();
+    let lock = stdin.lock();
    let lines = lock.lines();
-    let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
-                                                           &token_map,
-                                                           &surrogate_pairs_pos[]));
+    let antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
+                                                       &token_map,
+                                                       &surrogate_pairs_pos[..],
+                                                       has_bom));

    for antlr_tok in antlr_tokens {
        let rustc_tok = next(&mut lexer);
@@ -314,7 +330,7 @@ fn main() {
                }
                _ => panic!("{:?} is not {:?}", antlr_tok, rustc_tok)
            },)*
-            ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", rustc_tok, antlr_tok)
+            ref c => assert!(c == &antlr_tok.tok, "{:?} is not {:?}", antlr_tok, rustc_tok)
        }
    )
}
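For context, the invocation contract implied by `main` after this change: the first CLI argument is the Rust source file under test, the second is the ANTLR `.tokens` map, and the ANTLR lexer's token dump is piped in on stdin, one `[@...]` record per line. A hypothetical skeleton of just that plumbing (names are illustrative):

    use std::env;
    use std::io::BufRead;

    fn main() {
        let mut args = env::args().skip(1);
        let _source_file = args.next().expect("path to the .rs file under test");
        let _tokens_file = args.next().expect("path to the ANTLR .tokens map");
        let stdin = std::io::stdin();
        for line in stdin.lock().lines() {
            let _record = line.unwrap(); // e.g. [@0,0:1='fn',<12>,1:0]
        }
    }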