Skip to content

Commit cfcea33

Browse files
Unicode support (rescript-lang#433)
* ## Unicode support This PR adds support for Unicode codepoints at the syntax level: ReScript source code is now unicode text encoded in UTF-8. Fixes rescript-lang/syntax#397 ### Codepoint literals A codepoint literal represents an integer value identifying a unicode code point. It is expressed as one or more characters enclosed in single quotes. Examples are `'x'`, `'\n'` or `'\u{00A9}'`. Multiple UTF-8-encoded bytes may represent a single integer value. ### String literals String literals are (possibly multi-byte) UTF-8 encoded character sequences between double quotes, as in `"fox"`. ### New escape sequences Both codepoint and string literals accept the following new escape sequences: 1) Unicode escape sequences Any character with a character code lower than 65536 can be escaped using the hexadecimal value of its character code, prefixed with `\u`. Unicode escapes are six characters long. They require exactly four characters following `\u`. If the hexadecimal character code is only one, two or three characters long, you’ll need to pad it with leading zeroes. Example: `'\u2665'` (Represents ♥) 2) Unicode codepoint escape sequences Any code point or character can be escaped using the hexadecimal value of its character code, prefixed with `\u{` and suffixed with `}`. This allows for code points up to 0x10FFFF, which is the highest code point defined by Unicode. Unicode code point escapes consist of at least five characters. At least one hexadecimal digit must be wrapped in `\u{…}`. There is no upper limit on the number of hex digits in use (for example, `'\u{000000000061}' == 'a'`). Example: `'\u{2318}'` (Represents ⌘) * Rename Character token to Codepoint token. Codepoint makes more sense with unicode. * Add comment about codepoint literal encoding for printer. * Parse all normal strings as {js||js} strings. The compiler processes these strings with js semantics. Previously {js||js} were interpreted as template literal strings.
The internal encoding has been changed to use an attribute (@res.template) to detect template literal strings.
1 parent aefb65b commit cfcea33

File tree

76 files changed

+911
-409
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

76 files changed

+911
-409
lines changed

syntax/.depend

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ src/res_comment.cmx : src/res_comment.cmi
1212
src/res_comment.cmi :
1313
src/res_comments_table.cmx : src/res_parsetree_viewer.cmx src/res_doc.cmx \
1414
src/res_comment.cmx
15-
src/res_core.cmx : src/res_token.cmx src/res_scanner.cmx src/res_printer.cmx \
16-
src/res_parser.cmx src/res_js_ffi.cmx src/res_grammar.cmx src/res_doc.cmx \
17-
src/res_diagnostics.cmx src/res_comments_table.cmx src/res_core.cmi
15+
src/res_core.cmx : src/res_utf8.cmx src/res_token.cmx src/res_scanner.cmx \
16+
src/res_printer.cmx src/res_parser.cmx src/res_js_ffi.cmx \
17+
src/res_grammar.cmx src/res_doc.cmx src/res_diagnostics.cmx \
18+
src/res_comments_table.cmx src/res_core.cmi
1819
src/res_core.cmi : src/res_parser.cmi
1920
src/res_diagnostics.cmx : src/res_token.cmx src/res_grammar.cmx \
2021
src/res_diagnostics_printing_utils.cmx src/res_diagnostics.cmi
@@ -60,16 +61,19 @@ src/res_parser.cmi : src/res_token.cmx src/res_scanner.cmi \
6061
src/res_comment.cmi
6162
src/res_parsetree_viewer.cmx : src/res_parsetree_viewer.cmi
6263
src/res_parsetree_viewer.cmi :
63-
src/res_printer.cmx : src/res_token.cmx src/res_parsetree_viewer.cmx \
64-
src/res_parens.cmx src/res_doc.cmx src/res_comments_table.cmx \
65-
src/res_comment.cmx src/res_printer.cmi
64+
src/res_printer.cmx : src/res_utf8.cmx src/res_token.cmx \
65+
src/res_parsetree_viewer.cmx src/res_parens.cmx src/res_doc.cmx \
66+
src/res_comments_table.cmx src/res_comment.cmx src/res_printer.cmi
6667
src/res_printer.cmi : src/res_doc.cmi src/res_comments_table.cmx \
6768
src/res_comment.cmi
6869
src/res_reporting.cmx : src/res_token.cmx src/res_grammar.cmx
69-
src/res_scanner.cmx : src/res_token.cmx src/res_diagnostics.cmx \
70-
src/res_comment.cmx src/res_scanner.cmi
70+
src/res_scanner.cmx : src/res_utf8.cmx src/res_token.cmx \
71+
src/res_diagnostics.cmx src/res_comment.cmx src/res_scanner.cmi
7172
src/res_scanner.cmi : src/res_token.cmx src/res_diagnostics.cmi
7273
src/res_token.cmx : src/res_comment.cmx
73-
tests/res_test.cmx : src/res_token.cmx src/res_parser.cmx \
74-
src/res_outcome_printer.cmx src/res_multi_printer.cmx src/res_io.cmx \
75-
src/res_driver.cmx src/res_core.cmx
74+
src/res_utf8.cmx : src/res_utf8.cmi
75+
src/res_utf8.cmi :
76+
tests/res_test.cmx : tests/res_utf8_test.cmx src/res_token.cmx \
77+
src/res_parser.cmx src/res_outcome_printer.cmx src/res_multi_printer.cmx \
78+
src/res_io.cmx src/res_driver.cmx src/res_core.cmx
79+
tests/res_utf8_test.cmx : src/res_utf8.cmx

syntax/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ API_FILES = \
2424
src/res_parsetree_viewer.cmx\
2525
src/res_parens.cmx\
2626
src/res_comments_table.cmx\
27+
src/res_utf8.cmx\
2728
src/res_printer.cmx\
2829
src/res_scanner.cmx\
2930
src/res_js_ffi.cmx\
@@ -40,7 +41,7 @@ API_FILES = \
4041

4142
CLI_FILES = $(API_FILES) src/res_cli.cmx
4243

43-
TEST_FILES = $(API_FILES) tests/res_test.cmx
44+
TEST_FILES = $(API_FILES) tests/res_utf8_test.cmx tests/res_test.cmx
4445

4546
.DEFAULT_GOAL := build-native
4647

@@ -74,7 +75,7 @@ test: reanalyze build-native lib/test.exe
7475
./lib/test.exe
7576
./test.sh
7677

77-
roundtrip-test: reanalyze bootstrap lib/test.exe
78+
roundtrip-test: reanalyze lib/test.exe
7879
./lib/test.exe
7980
ROUNDTRIP_TEST=1 ./test.sh
8081

syntax/benchmarks/Benchmark.ml

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ module Printer = Res_printer
66

77
module IO: sig
88
val readFile: string -> string
9-
val readStdin: unit -> string
109
end = struct
1110
(* random chunk size: 2^15, TODO: why do we guess randomly? *)
1211
let chunkSize = 32768
@@ -26,21 +25,6 @@ end = struct
2625
)
2726
in
2827
loop ()
29-
30-
let readStdin () =
31-
let buffer = Buffer.create chunkSize in
32-
let chunk = (Bytes.create [@doesNotRaise]) chunkSize in
33-
let rec loop () =
34-
let len = try input stdin chunk 0 chunkSize with Invalid_argument _ -> 0 in
35-
if len == 0 then (
36-
close_in_noerr stdin;
37-
Buffer.contents buffer
38-
) else (
39-
Buffer.add_subbytes buffer chunk 0 len;
40-
loop ()
41-
)
42-
in
43-
loop ()
4428
end
4529

4630
module Time: sig
@@ -188,29 +172,6 @@ end = struct
188172
done
189173
end
190174

191-
module Profile: sig
192-
val record : name:string -> (unit -> 'a) -> 'a
193-
val print: unit -> unit
194-
end = struct
195-
let state = Hashtbl.create 2
196-
197-
let record ~name f =
198-
let startTime = Time.now() in
199-
let result = f() in
200-
let endTime = Time.now() in
201-
202-
Hashtbl.add state name (Time.diff startTime endTime);
203-
result
204-
205-
let print () =
206-
let report = Hashtbl.fold (fun k v acc ->
207-
let line = Printf.sprintf "%s: %fms\n" k (Time.print v) in
208-
acc ^ line
209-
) state "\n\n"
210-
in
211-
print_endline report
212-
end
213-
214175
module Benchmarks: sig
215176
val run: unit -> unit
216177
end = struct

syntax/src/res_ast_conversion.ml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ let hasUncurriedAttribute attrs = List.exists (fun attr -> match attr with
323323
| _ -> false
324324
) attrs
325325

326+
let templateLiteralAttr = (Location.mknoloc "res.template", Parsetree.PStr [])
327+
326328
let normalize =
327329
let open Ast_mapper in
328330
{ default_mapper with
@@ -368,7 +370,7 @@ let normalize =
368370
in
369371
let s = Parsetree.Pconst_string ((escapeTemplateLiteral txt), newTag) in
370372
{p with
371-
ppat_attributes = mapper.attributes mapper p.ppat_attributes;
373+
ppat_attributes = templateLiteralAttr::(mapper.attributes mapper p.ppat_attributes);
372374
ppat_desc = Ppat_constant s
373375
}
374376
| _ ->
@@ -396,7 +398,7 @@ let normalize =
396398
in
397399
let s = Parsetree.Pconst_string ((escapeTemplateLiteral txt), newTag) in
398400
{expr with
399-
pexp_attributes = mapper.attributes mapper expr.pexp_attributes;
401+
pexp_attributes= templateLiteralAttr::(mapper.attributes mapper expr.pexp_attributes);
400402
pexp_desc = Pexp_constant s
401403
}
402404
| Pexp_apply (

syntax/src/res_ast_debugger.ml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,13 @@ module SexpAst = struct
143143
string txt;
144144
optChar tag;
145145
]
146-
| Pconst_char c ->
146+
| Pconst_char _ ->
147+
Sexp.list [
148+
Sexp.atom "Pconst_char";
149+
]
150+
| Pconst_string(_, Some "INTERNAL_RES_CHAR_CONTENTS") ->
147151
Sexp.list [
148152
Sexp.atom "Pconst_char";
149-
Sexp.atom (Char.escaped c);
150153
]
151154
| Pconst_string (txt, tag) ->
152155
Sexp.list [

syntax/src/res_cli.ml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ module ResClflags: sig
165165
val file: string ref
166166
val interface: bool ref
167167
val ppx: string ref
168+
val typechecker: bool ref
168169

169170
val parse: unit -> unit
170171
end = struct
@@ -176,6 +177,7 @@ end = struct
176177
let interface = ref false
177178
let ppx = ref ""
178179
let file = ref ""
180+
let typechecker = ref false
179181

180182
let usage = "\n**This command line is for the repo developer's testing purpose only. DO NOT use it in production**!\n\n" ^
181183
"Usage:\n rescript <options> <file>\n\n" ^
@@ -192,6 +194,7 @@ end = struct
192194
("-width", Arg.Int (fun w -> width := w), "Specify the line length for the printer (formatter)");
193195
("-interface", Arg.Unit (fun () -> interface := true), "Parse as interface");
194196
("-ppx", Arg.String (fun txt -> ppx := txt), "Apply a specific built-in ppx before parsing, none or jsx. Default: none");
197+
("-typechecker", Arg.Unit (fun () -> typechecker := true), "Parses the ast as it would be passed to the typechecker and not the printer")
195198
]
196199

197200
let parse () = Arg.parse spec (fun f -> file := f) usage
@@ -200,7 +203,7 @@ end
200203
module CliArgProcessor = struct
201204
type backend = Parser: ('diagnostics) Res_driver.parsingEngine -> backend [@@unboxed]
202205

203-
let processFile ~isInterface ~width ~recover ~origin ~target ~ppx filename =
206+
let processFile ~isInterface ~width ~recover ~origin ~target ~ppx ~typechecker filename =
204207
let len = String.length filename in
205208
let processInterface =
206209
isInterface || len > 0 && (String.get [@doesNotRaise]) filename (len - 1) = 'i'
@@ -233,7 +236,7 @@ module CliArgProcessor = struct
233236
in
234237

235238
let forPrinter = match target with
236-
| "res" | "sexp" -> true
239+
| "res" | "sexp" when not typechecker -> true
237240
| _ -> false
238241
in
239242

@@ -292,5 +295,6 @@ let [@raises Invalid_argument, Failure, exit] () =
292295
~target:!ResClflags.print
293296
~origin:!ResClflags.origin
294297
~ppx:!ResClflags.ppx
298+
~typechecker:!ResClflags.typechecker
295299
!ResClflags.file
296300
end

0 commit comments

Comments
 (0)