Skip to content

Commit 30a978d

Browse files
Refactor string escape sequence parsing (rescript-lang#431)
* Refactor parsing of string literals with a state machine * Only log string escape sequence errors during scanning * Remove unused string escape error messages in parsing. Errors are reported during scanning, we don't need to report them both. Ideally we should all do this in one pass…
1 parent fb2b7dd commit 30a978d

File tree

3 files changed

+75
-143
lines changed

3 files changed

+75
-143
lines changed

syntax/src/res_core.ml

Lines changed: 74 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,14 @@ let ifLetAttr = (Location.mknoloc "ns.iflet", Parsetree.PStr [])
137137
let suppressFragileMatchWarningAttr = (Location.mknoloc "warning", Parsetree.PStr [Ast_helper.Str.eval (Ast_helper.Exp.constant (Pconst_string ("-4", None)))])
138138
let makeBracesAttr loc = (Location.mkloc "ns.braces" loc, Parsetree.PStr [])
139139

140+
type stringLiteralState =
141+
| Start
142+
| Backslash
143+
| HexEscape
144+
| DecimalEscape
145+
| OctalEscape
146+
| EscapedLineBreak
147+
140148
type typDefOrExt =
141149
| TypeDef of {recFlag: Asttypes.rec_flag; types: Parsetree.type_declaration list}
142150
| TypeExt of Parsetree.type_extension
@@ -488,149 +496,74 @@ let parseStringLiteral s =
488496
let len = String.length s in
489497
let b = Buffer.create (String.length s) in
490498

491-
let rec loop i =
499+
let rec parse state i d =
492500
if i = len then
493-
()
501+
(match state with
502+
| HexEscape | DecimalEscape | OctalEscape -> false
503+
| _ -> true)
494504
else
495505
let c = String.unsafe_get s i in
496-
match c with
497-
| '\\' as c ->
498-
let nextIx = i + 1 in
499-
if nextIx < len then
500-
let nextChar = String.unsafe_get s nextIx in
501-
begin match nextChar with
502-
(* this is interesting:
503-
* let x = "foo\
504-
* bar"
505-
* The `\` escapes the newline, as if there was nothing here.
506-
* Essentialy transforming this piece of code in `let x = "foobar"`
507-
*
508-
* What is even more interesting is that any space or tabs after the
509-
* escaped newline are also dropped.
510-
* let x = "foo\
511-
* bar"
512-
* is the same as `let x = "foobar"`
513-
*)
514-
| '\010' | '\013' ->
515-
let i = ref (nextIx + 1) in
516-
while !i < len && (
517-
let c = String.unsafe_get s !i in
518-
c = ' ' || c = '\t'
519-
) do
520-
incr i
521-
done;
522-
loop !i
523-
| 'n' ->
524-
Buffer.add_char b '\010';
525-
loop (nextIx + 1)
526-
| 'r' ->
527-
Buffer.add_char b '\013';
528-
loop (nextIx + 1)
529-
| 'b' ->
530-
Buffer.add_char b '\008';
531-
loop (nextIx + 1)
532-
| 't' ->
533-
Buffer.add_char b '\009';
534-
loop (nextIx + 1)
535-
| '\\' as c ->
536-
Buffer.add_char b c;
537-
loop (nextIx + 1)
538-
| ' ' as c ->
539-
Buffer.add_char b c;
540-
loop (nextIx + 1)
541-
| '\'' as c ->
542-
Buffer.add_char b c;
543-
loop (nextIx + 1)
544-
| '\"' as c ->
545-
Buffer.add_char b c;
546-
loop (nextIx + 1)
547-
| '0' .. '9' ->
548-
if nextIx + 2 < len then
549-
let c0 = nextChar in
550-
let c1 = (String.unsafe_get s (nextIx + 1)) in
551-
let c2 = (String.unsafe_get s (nextIx + 2)) in
552-
let c =
553-
100 * (Char.code c0 - 48) +
554-
10 * (Char.code c1 - 48) +
555-
(Char.code c2 - 48)
556-
in
557-
if (c < 0 || c > 255) then (
558-
Buffer.add_char b '\\';
559-
Buffer.add_char b c0;
560-
Buffer.add_char b c1;
561-
Buffer.add_char b c2;
562-
loop (nextIx + 3)
563-
) else (
564-
Buffer.add_char b (Char.unsafe_chr c);
565-
loop (nextIx + 3)
566-
)
567-
else (
568-
Buffer.add_char b '\\';
569-
Buffer.add_char b nextChar;
570-
loop (nextIx + 1)
571-
)
572-
| 'o' ->
573-
if nextIx + 3 < len then
574-
let c0 = (String.unsafe_get s (nextIx + 1)) in
575-
let c1 = (String.unsafe_get s (nextIx + 2)) in
576-
let c2 = (String.unsafe_get s (nextIx + 3)) in
577-
let c =
578-
64 * (Char.code c0 - 48) +
579-
8 * (Char.code c1 - 48) +
580-
(Char.code c2 - 48)
581-
in
582-
if (c < 0 || c > 255) then (
583-
Buffer.add_char b '\\';
584-
Buffer.add_char b '0';
585-
Buffer.add_char b c0;
586-
Buffer.add_char b c1;
587-
Buffer.add_char b c2;
588-
loop (nextIx + 4)
589-
) else (
590-
Buffer.add_char b (Char.unsafe_chr c);
591-
loop (nextIx + 4)
592-
)
593-
else (
594-
Buffer.add_char b '\\';
595-
Buffer.add_char b nextChar;
596-
loop (nextIx + 1)
597-
)
598-
| 'x' as c ->
599-
if nextIx + 2 < len then
600-
let c0 = (String.unsafe_get s (nextIx + 1)) in
601-
let c1 = (String.unsafe_get s (nextIx + 2)) in
602-
let c = (16 * (hexValue c0)) + (hexValue c1) in
603-
if (c < 0 || c > 255) then (
604-
Buffer.add_char b '\\';
605-
Buffer.add_char b 'x';
606-
Buffer.add_char b c0;
607-
Buffer.add_char b c1;
608-
loop (nextIx + 3)
609-
) else (
610-
Buffer.add_char b (Char.unsafe_chr c);
611-
loop (nextIx + 3)
612-
)
613-
else (
614-
Buffer.add_char b '\\';
615-
Buffer.add_char b c;
616-
loop (nextIx + 2)
617-
)
618-
| _ ->
619-
Buffer.add_char b c;
620-
Buffer.add_char b nextChar;
621-
loop (nextIx + 1)
622-
end
623-
else (
624-
Buffer.add_char b c;
625-
()
626-
)
627-
| c ->
628-
Buffer.add_char b c;
629-
loop (i + 1)
506+
match state with
507+
| Start ->
508+
(match c with
509+
| '\\' -> parse Backslash (i + 1) d
510+
| c -> Buffer.add_char b c; parse Start (i + 1) d)
511+
| Backslash ->
512+
(match c with
513+
| 'n' -> Buffer.add_char b '\n'; parse Start (i + 1) d
514+
| 'r' -> Buffer.add_char b '\r'; parse Start (i + 1) d
515+
| 'b' -> Buffer.add_char b '\008'; parse Start (i + 1) d
516+
| 't' -> Buffer.add_char b '\009'; parse Start (i + 1) d
517+
| ('\\' | ' ' | '\'' | '"') as c -> Buffer.add_char b c; parse Start (i + 1) d
518+
| 'x' -> parse HexEscape (i + 1) 0
519+
| 'o' -> parse OctalEscape (i + 1) 0
520+
| '0' .. '9' -> parse DecimalEscape i 0
521+
| '\010' | '\013' -> parse EscapedLineBreak (i + 1) d
522+
| c -> Buffer.add_char b '\\'; Buffer.add_char b c; parse Start (i + 1) d)
523+
| HexEscape ->
524+
if d == 1 then
525+
let c0 = String.unsafe_get s (i - 1) in
526+
let c1 = String.unsafe_get s i in
527+
let c = (16 * (hexValue c0)) + (hexValue c1) in
528+
if c < 0 || c > 255 then false
529+
else (
530+
Buffer.add_char b (Char.unsafe_chr c);
531+
parse Start (i + 1) 0
532+
)
533+
else
534+
parse HexEscape (i + 1) (d + 1)
535+
| DecimalEscape ->
536+
if d == 2 then
537+
let c0 = String.unsafe_get s (i - 2) in
538+
let c1 = String.unsafe_get s (i - 1) in
539+
let c2 = String.unsafe_get s i in
540+
let c = 100 * (Char.code c0 - 48) + 10 * (Char.code c1 - 48) + (Char.code c2 - 48) in
541+
if c < 0 || c > 255 then false
542+
else (
543+
Buffer.add_char b (Char.unsafe_chr c);
544+
parse Start (i + 1) 0
545+
)
546+
else
547+
parse DecimalEscape (i + 1) (d + 1)
548+
| OctalEscape ->
549+
if d == 2 then
550+
let c0 = String.unsafe_get s (i - 2) in
551+
let c1 = String.unsafe_get s (i - 1) in
552+
let c2 = String.unsafe_get s i in
553+
let c = 64 * (Char.code c0 - 48) + 8 * (Char.code c1 - 48) + (Char.code c2 - 48) in
554+
if c < 0 || c > 255 then false
555+
else (
556+
Buffer.add_char b (Char.unsafe_chr c);
557+
parse Start (i + 1) 0
558+
)
559+
else
560+
parse OctalEscape (i + 1) (d + 1)
561+
| EscapedLineBreak ->
562+
(match c with
563+
| ' ' | '\t' -> parse EscapedLineBreak (i + 1) d
564+
| c -> Buffer.add_char b c; parse Start (i + 1) d)
630565
in
631-
loop 0;
632-
Buffer.contents b
633-
566+
if parse Start 0 0 then Buffer.contents b else s
634567

635568
let rec parseLident p =
636569
let recoverLident p =
@@ -696,8 +629,8 @@ let parseHashIdent ~startPos p =
696629
Parser.expect Hash p;
697630
match p.token with
698631
| String text ->
699-
Parser.next p;
700632
let text = if p.mode = ParseForTypeChecker then parseStringLiteral text else text in
633+
Parser.next p;
701634
(text, mkLoc startPos p.prevEndPos)
702635
| Int {i; suffix} ->
703636
let () = match suffix with
@@ -1207,8 +1140,8 @@ let rec parsePattern ?(alias=true) ?(or_=true) p =
12071140
) else (
12081141
let (ident, loc) = match p.token with
12091142
| String text ->
1210-
Parser.next p;
12111143
let text = if p.mode = ParseForTypeChecker then parseStringLiteral text else text in
1144+
Parser.next p;
12121145
(text, mkLoc startPos p.prevEndPos)
12131146
| Int {i; suffix} ->
12141147
let () = match suffix with

syntax/tests/parsing/errors/scanner/expected/escapeSequence.res.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020
unknown escape sequence
2121

2222
let x = "\\0"
23-
let x = "\\0AAA"
23+
let x = "\\oAAA"

syntax/tests/res_test.ml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,6 @@ module ParserApiTest = struct
173173
makeDefault();
174174
unixLf();
175175
windowsCrlf()
176-
177176
end
178177

179178
let () = OutcomePrinterTests.run()

0 commit comments

Comments
 (0)