Skip to content

Commit 6153aae

Browse files
committed
auto merge of #5559 : jbclements/rust/change-to-tt-based-parsing, r=jbclements
Changes the parser to parse all streams into token-trees before hitting the parser proper, in preparation for hygiene. As an added bonus, it appears to speed up the parser (albeit by a totally imperceptible 1%). Also, many comments in the parser. Also, field renaming in token-trees (readme->forest, cur->stack).
2 parents 260d74d + f2e47cd commit 6153aae

File tree

6 files changed

+123
-34
lines changed

6 files changed

+123
-34
lines changed

src/librustc/driver/driver.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ pub fn parse_input(sess: Session, +cfg: ast::crate_cfg, input: input)
151151
-> @ast::crate {
152152
match input {
153153
file_input(ref file) => {
154-
parse::parse_crate_from_file(&(*file), cfg, sess.parse_sess)
154+
parse::parse_crate_from_file_using_tts(&(*file), cfg, sess.parse_sess)
155155
}
156156
str_input(ref src) => {
157157
// FIXME (#2319): Don't really want to box the source string

src/libsyntax/ext/tt/transcribe.rs

+29-28
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use core::vec;
2626
`~` */
2727
///an unzipping of `token_tree`s
2828
struct TtFrame {
29-
readme: @mut ~[ast::token_tree],
29+
forest: @mut ~[ast::token_tree],
3030
idx: uint,
3131
dotdotdoted: bool,
3232
sep: Option<Token>,
@@ -37,7 +37,7 @@ pub struct TtReader {
3737
sp_diag: @span_handler,
3838
interner: @ident_interner,
3939
// the unzipped tree:
40-
cur: @mut TtFrame,
40+
stack: @mut TtFrame,
4141
/* for MBE-style macro transcription */
4242
interpolations: LinearMap<ident, @named_match>,
4343
repeat_idx: ~[uint],
@@ -58,8 +58,8 @@ pub fn new_tt_reader(sp_diag: @span_handler,
5858
let r = @mut TtReader {
5959
sp_diag: sp_diag,
6060
interner: itr,
61-
cur: @mut TtFrame {
62-
readme: @mut src,
61+
stack: @mut TtFrame {
62+
forest: @mut src,
6363
idx: 0u,
6464
dotdotdoted: false,
6565
sep: None,
@@ -81,7 +81,7 @@ pub fn new_tt_reader(sp_diag: @span_handler,
8181

8282
fn dup_tt_frame(f: @mut TtFrame) -> @mut TtFrame {
8383
@mut TtFrame {
84-
readme: @mut (copy *f.readme),
84+
forest: @mut (copy *f.forest),
8585
idx: f.idx,
8686
dotdotdoted: f.dotdotdoted,
8787
sep: copy f.sep,
@@ -96,7 +96,7 @@ pub fn dup_tt_reader(r: @mut TtReader) -> @mut TtReader {
9696
@mut TtReader {
9797
sp_diag: r.sp_diag,
9898
interner: r.interner,
99-
cur: dup_tt_frame(r.cur),
99+
stack: dup_tt_frame(r.stack),
100100
interpolations: r.interpolations,
101101
repeat_idx: copy r.repeat_idx,
102102
repeat_len: copy r.repeat_len,
@@ -167,45 +167,46 @@ fn lockstep_iter_size(t: token_tree, r: &mut TtReader) -> lis {
167167
}
168168
}
169169

170-
170+
// return the next token from the TtReader.
171+
// EFFECT: advances the reader's token field
171172
pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
172173
let ret_val = TokenAndSpan {
173174
tok: copy r.cur_tok,
174175
sp: r.cur_span,
175176
};
176177
loop {
177178
{
178-
let cur = &mut *r.cur;
179-
let readme = &mut *cur.readme;
180-
if cur.idx < readme.len() {
179+
let stack = &mut *r.stack;
180+
let forest = &mut *stack.forest;
181+
if stack.idx < forest.len() {
181182
break;
182183
}
183184
}
184185

185186
/* done with this set; pop or repeat? */
186-
if ! r.cur.dotdotdoted
187+
if ! r.stack.dotdotdoted
187188
|| { *r.repeat_idx.last() == *r.repeat_len.last() - 1 } {
188189

189-
match r.cur.up {
190+
match r.stack.up {
190191
None => {
191192
r.cur_tok = EOF;
192193
return ret_val;
193194
}
194195
Some(tt_f) => {
195-
if r.cur.dotdotdoted {
196+
if r.stack.dotdotdoted {
196197
r.repeat_idx.pop();
197198
r.repeat_len.pop();
198199
}
199200

200-
r.cur = tt_f;
201-
r.cur.idx += 1u;
201+
r.stack = tt_f;
202+
r.stack.idx += 1u;
202203
}
203204
}
204205

205206
} else { /* repeat */
206-
r.cur.idx = 0u;
207+
r.stack.idx = 0u;
207208
r.repeat_idx[r.repeat_idx.len() - 1u] += 1u;
208-
match r.cur.sep {
209+
match r.stack.sep {
209210
Some(copy tk) => {
210211
r.cur_tok = tk; /* repeat same span, I guess */
211212
return ret_val;
@@ -216,21 +217,21 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
216217
}
217218
loop { /* because it's easiest, this handles `tt_delim` not starting
218219
with a `tt_tok`, even though it won't happen */
219-
match r.cur.readme[r.cur.idx] {
220+
match r.stack.forest[r.stack.idx] {
220221
tt_delim(copy tts) => {
221-
r.cur = @mut TtFrame {
222-
readme: @mut tts,
222+
r.stack = @mut TtFrame {
223+
forest: @mut tts,
223224
idx: 0u,
224225
dotdotdoted: false,
225226
sep: None,
226-
up: option::Some(r.cur)
227+
up: option::Some(r.stack)
227228
};
228229
// if this could be 0-length, we'd need to potentially recur here
229230
}
230231
tt_tok(sp, copy tok) => {
231232
r.cur_span = sp;
232233
r.cur_tok = tok;
233-
r.cur.idx += 1u;
234+
r.stack.idx += 1u;
234235
return ret_val;
235236
}
236237
tt_seq(sp, copy tts, copy sep, zerok) => {
@@ -256,17 +257,17 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
256257
once");
257258
}
258259

259-
r.cur.idx += 1u;
260+
r.stack.idx += 1u;
260261
return tt_next_token(r);
261262
} else {
262263
r.repeat_len.push(len);
263264
r.repeat_idx.push(0u);
264-
r.cur = @mut TtFrame {
265-
readme: @mut tts,
265+
r.stack = @mut TtFrame {
266+
forest: @mut tts,
266267
idx: 0u,
267268
dotdotdoted: true,
268269
sep: sep,
269-
up: Some(r.cur)
270+
up: Some(r.stack)
270271
};
271272
}
272273
}
@@ -280,13 +281,13 @@ pub fn tt_next_token(r: &mut TtReader) -> TokenAndSpan {
280281
(b) we actually can, since it's a token. */
281282
matched_nonterminal(nt_ident(sn,b)) => {
282283
r.cur_span = sp; r.cur_tok = IDENT(sn,b);
283-
r.cur.idx += 1u;
284+
r.stack.idx += 1u;
284285
return ret_val;
285286
}
286287
matched_nonterminal(ref other_whole_nt) => {
287288
r.cur_span = sp;
288289
r.cur_tok = INTERPOLATED(copy *other_whole_nt);
289-
r.cur.idx += 1u;
290+
r.stack.idx += 1u;
290291
return ret_val;
291292
}
292293
matched_seq(*) => {

src/libsyntax/parse/common.rs

+14-1
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ pub impl Parser {
159159
}
160160
}
161161

162+
// if the given word is not a keyword, signal an error.
163+
// if the next token is the given keyword, eat it and return
164+
// true. Otherwise, return false.
162165
fn eat_keyword(&self, word: &~str) -> bool {
163166
self.require_keyword(word);
164167
let is_kw = match *self.token {
@@ -169,6 +172,9 @@ pub impl Parser {
169172
is_kw
170173
}
171174

175+
// if the given word is not a keyword, signal an error.
176+
// if the next token is not the given word, signal an error.
177+
// otherwise, eat it.
172178
fn expect_keyword(&self, word: &~str) {
173179
self.require_keyword(word);
174180
if !self.eat_keyword(word) {
@@ -182,10 +188,12 @@ pub impl Parser {
182188
}
183189
}
184190

191+
// return true if the given string is a strict keyword
185192
fn is_strict_keyword(&self, word: &~str) -> bool {
186193
self.strict_keywords.contains(word)
187194
}
188195

196+
// signal an error if the current token is a strict keyword
189197
fn check_strict_keywords(&self) {
190198
match *self.token {
191199
token::IDENT(_, false) => {
@@ -196,16 +204,19 @@ pub impl Parser {
196204
}
197205
}
198206

207+
// signal an error if the given string is a strict keyword
199208
fn check_strict_keywords_(&self, w: &~str) {
200209
if self.is_strict_keyword(w) {
201210
self.fatal(fmt!("found `%s` in ident position", *w));
202211
}
203212
}
204213

214+
// return true if this is a reserved keyword
205215
fn is_reserved_keyword(&self, word: &~str) -> bool {
206216
self.reserved_keywords.contains(word)
207217
}
208218

219+
// signal an error if the current token is a reserved keyword
209220
fn check_reserved_keywords(&self) {
210221
match *self.token {
211222
token::IDENT(_, false) => {
@@ -216,14 +227,16 @@ pub impl Parser {
216227
}
217228
}
218229

230+
// signal an error if the given string is a reserved keyword
219231
fn check_reserved_keywords_(&self, w: &~str) {
220232
if self.is_reserved_keyword(w) {
221233
self.fatal(fmt!("`%s` is a reserved keyword", *w));
222234
}
223235
}
224236

225237
// expect and consume a GT. if a >> is seen, replace it
226-
// with a single > and continue.
238+
// with a single > and continue. If a GT is not seen,
239+
// signal an error.
227240
fn expect_gt(&self) {
228241
if *self.token == token::GT {
229242
self.bump();

src/libsyntax/parse/lexer.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ pub fn new_low_level_string_reader(span_diagnostic: @span_handler,
8080
last_pos: filemap.start_pos,
8181
col: CharPos(0),
8282
curr: initial_char,
83-
filemap: filemap, interner: itr,
83+
filemap: filemap,
84+
interner: itr,
8485
/* dummy values; not read */
8586
peek_tok: token::EOF,
8687
peek_span: codemap::dummy_sp()
@@ -150,6 +151,7 @@ impl reader for TtReader {
150151
}
151152

152153
// EFFECT: advance peek_tok and peek_span to refer to the next token.
154+
// EFFECT: update the interner, maybe.
153155
fn string_advance_token(r: @mut StringReader) {
154156
match (consume_whitespace_and_comments(r)) {
155157
Some(comment) => {
@@ -539,6 +541,9 @@ fn ident_continue(c: char) -> bool {
539541
|| (c > 'z' && char::is_XID_continue(c))
540542
}
541543

544+
// return the next token from the string
545+
// EFFECT: advances the input past that token
546+
// EFFECT: updates the interner
542547
fn next_token_inner(rdr: @mut StringReader) -> token::Token {
543548
let mut accum_str = ~"";
544549
let mut c = rdr.curr;

src/libsyntax/parse/mod.rs

+49-3
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,14 @@ pub mod classify;
4545
/// Reporting obsolete syntax
4646
pub mod obsolete;
4747

48+
// info about a parsing session.
49+
// This structure and the reader both have
50+
// an interner associated with them. If they're
51+
// not the same, bad things can happen.
4852
pub struct ParseSess {
49-
cm: @codemap::CodeMap,
53+
cm: @codemap::CodeMap, // better be the same as the one in the reader!
5054
next_id: node_id,
51-
span_diagnostic: @span_handler,
55+
span_diagnostic: @span_handler, // better be the same as the one in the reader!
5256
interner: @ident_interner,
5357
}
5458

@@ -90,6 +94,19 @@ pub fn parse_crate_from_file(
9094
// why is there no p.abort_if_errors here?
9195
}
9296

97+
pub fn parse_crate_from_file_using_tts(
98+
input: &Path,
99+
cfg: ast::crate_cfg,
100+
sess: @mut ParseSess
101+
) -> @ast::crate {
102+
let p = new_parser_from_file(sess, /*bad*/ copy cfg, input);
103+
let tts = p.parse_all_token_trees();
104+
new_parser_from_tts(sess,cfg,tts).parse_crate_mod(/*bad*/ copy cfg)
105+
// why is there no p.abort_if_errors here?
106+
}
107+
108+
109+
93110
pub fn parse_crate_from_source_str(
94111
name: ~str,
95112
source: @~str,
@@ -313,17 +330,46 @@ mod test {
313330
use std;
314331
use core::io;
315332
use core::option::None;
333+
use ast;
316334

317335
#[test] fn to_json_str<E : Encodable<std::json::Encoder>>(val: @E) -> ~str {
318336
do io::with_str_writer |writer| {
319337
val.encode(~std::json::Encoder(writer));
320338
}
321339
}
322340

341+
fn string_to_crate (source_str : @~str) -> @ast::crate {
342+
parse_crate_from_source_str(
343+
~"bogofile",
344+
source_str,
345+
~[],
346+
new_parse_sess(None))
347+
}
348+
349+
fn string_to_tt_to_crate (source_str : @~str) -> @ast::crate {
350+
let tts = parse_tts_from_source_str(
351+
~"bogofile",
352+
source_str,
353+
~[],
354+
new_parse_sess(None));
355+
new_parser_from_tts(new_parse_sess(None),~[],tts)
356+
.parse_crate_mod(~[])
357+
}
358+
359+
// make sure that parsing from TTs produces the same result
360+
// as parsing from strings
361+
#[test] fn tts_produce_the_same_result () {
362+
let source_str = @~"fn foo (x : int) { x; }";
363+
assert_eq!(string_to_tt_to_crate(source_str),
364+
string_to_crate(source_str));
365+
}
366+
367+
// check the contents of the tt manually:
323368
#[test] fn alltts () {
369+
let source_str = @~"fn foo (x : int) { x; }";
324370
let tts = parse_tts_from_source_str(
325371
~"bogofile",
326-
@~"fn foo (x : int) { x; }",
372+
source_str,
327373
~[],
328374
new_parse_sess(None));
329375
assert_eq!(

Comments (0 commit comments)