Skip to content

Commit d670834

Browse files
committed
Make internal encoding of locations aware of unicode
When some unicode characters are present on a line, the existing encoding of positions, based on the number of bytes since line start, is incorrect. This can be seen in e.g. error messages picked up in the editor (or on the command-line). This PR takes unicode into account. Even though the OCaml locations are byte-based, one can trick the system by encoding pos_cnum as: (number of bytes from file start to line start) + (number of utf16 code units since line start). Since the compiler's printer performs the subtraction pos_cnum - pos_bol, the utf16 character position is shown. Note that editors, vscode in particular, show you something in "Col", but their internal commands expect the correct utf16 character position, which is different.
1 parent 4fd5159 commit d670834

File tree

7 files changed

+49
-14
lines changed

7 files changed

+49
-14
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ These are only breaking changes for unformatted code.
5252
- Parser: fix location of variable when function definition `{v => ...}` is enclosed in braces https://github.com/rescript-lang/rescript-compiler/pull/5949
5353
- Fix issue with error messages for uncurried functions where expected and given type were swapped https://github.com/rescript-lang/rescript-compiler/pull/5973
5454
- Fix issue with integer overflow check https://github.com/rescript-lang/rescript-compiler/pull/6028
55+
- Make internal encoding of locations aware of unicode https://github.com/rescript-lang/rescript-compiler/pull/6073
5556

5657
#### :nail_care: Polish
5758

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
We've found a bug for you!
3+
/.../fixtures/unicode_location.res:1:43
4+
5+
1 │ let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b")
6+
2 │ // ^ character position 33 + 10
7+
│ (unicode symbols of length 2)
8+
9+
This has type: int
10+
Somewhere wanted: string
11+
12+
You can convert int to string with Belt.Int.toString.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b")
2+
// ^ character position 33 + 10 (unicode symbols of length 2)

res_syntax/src/res_core.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2469,6 +2469,7 @@ and parseAttributesAndBinding (p : Parser.t) =
24692469
let err = p.scanner.err in
24702470
let ch = p.scanner.ch in
24712471
let offset = p.scanner.offset in
2472+
let offset16 = p.scanner.offset16 in
24722473
let lineOffset = p.scanner.lineOffset in
24732474
let lnum = p.scanner.lnum in
24742475
let mode = p.scanner.mode in
@@ -2490,6 +2491,7 @@ and parseAttributesAndBinding (p : Parser.t) =
24902491
p.scanner.err <- err;
24912492
p.scanner.ch <- ch;
24922493
p.scanner.offset <- offset;
2494+
p.scanner.offset16 <- offset16;
24932495
p.scanner.lineOffset <- lineOffset;
24942496
p.scanner.lnum <- lnum;
24952497
p.scanner.mode <- mode;

res_syntax/src/res_parser.ml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ let lookahead p callback =
159159
let err = p.scanner.err in
160160
let ch = p.scanner.ch in
161161
let offset = p.scanner.offset in
162+
let offset16 = p.scanner.offset16 in
162163
let lineOffset = p.scanner.lineOffset in
163164
let lnum = p.scanner.lnum in
164165
let mode = p.scanner.mode in
@@ -177,6 +178,7 @@ let lookahead p callback =
177178
p.scanner.err <- err;
178179
p.scanner.ch <- ch;
179180
p.scanner.offset <- offset;
181+
p.scanner.offset16 <- offset16;
180182
p.scanner.lineOffset <- lineOffset;
181183
p.scanner.lnum <- lnum;
182184
p.scanner.mode <- mode;

res_syntax/src/res_scanner.ml

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ type t = {
1919
Diagnostics.category ->
2020
unit;
2121
mutable ch: charEncoding; (* current character *)
22-
mutable offset: int; (* character offset *)
22+
mutable offset: int; (* current byte offset *)
23+
mutable offset16: int;
24+
(* current number of utf16 code units since line start *)
2325
mutable lineOffset: int; (* current line offset *)
2426
mutable lnum: int; (* current line number *)
2527
mutable mode: mode list;
@@ -51,12 +53,11 @@ let position scanner =
5153
(* line number *)
5254
pos_lnum = scanner.lnum;
5355
(* offset of the beginning of the line (number
54-
of characters between the beginning of the scanner and the beginning
56+
of bytes between the beginning of the scanner and the beginning
5557
of the line) *)
5658
pos_bol = scanner.lineOffset;
57-
(* [pos_cnum] is the offset of the position (number of
58-
characters between the beginning of the scanner and the position). *)
59-
pos_cnum = scanner.offset;
59+
(* [pos_cnum - pos_bol] is the number of utf16 code units since line start *)
60+
pos_cnum = scanner.lineOffset + scanner.offset16;
6061
}
6162

6263
(* Small debugging util
@@ -95,19 +96,29 @@ let _printDebug ~startPos ~endPos scanner token =
9596

9697
let next scanner =
9798
let nextOffset = scanner.offset + 1 in
98-
(match scanner.ch with
99-
| '\n' ->
100-
scanner.lineOffset <- nextOffset;
101-
scanner.lnum <- scanner.lnum + 1
99+
let utf16len =
100+
match Ext_utf8.classify scanner.ch with
101+
| Single _ | Invalid -> 1
102+
| Leading (n, _) -> ( (((n + 1) / 2) [@doesNotRaise]))
103+
| Cont _ -> 0
104+
in
105+
let newline =
106+
scanner.ch = '\n'
102107
(* What about CRLF (\r + \n) on windows?
103-
* \r\n will always be terminated by a \n
104-
* -> we can just bump the line count on \n *)
105-
| _ -> ());
108+
\r\n will always be terminated by a \n
109+
-> we can just bump the line count on \n *)
110+
in
111+
if newline then (
112+
scanner.lineOffset <- nextOffset;
113+
scanner.offset16 <- 0;
114+
scanner.lnum <- scanner.lnum + 1)
115+
else scanner.offset16 <- scanner.offset16 + utf16len;
106116
if nextOffset < String.length scanner.src then (
107117
scanner.offset <- nextOffset;
108-
scanner.ch <- String.unsafe_get scanner.src scanner.offset)
118+
scanner.ch <- String.unsafe_get scanner.src nextOffset)
109119
else (
110120
scanner.offset <- String.length scanner.src;
121+
scanner.offset16 <- scanner.offset - scanner.lineOffset;
111122
scanner.ch <- hackyEOFChar)
112123

113124
let next2 scanner =
@@ -141,6 +152,7 @@ let make ~filename src =
141152
err = (fun ~startPos:_ ~endPos:_ _ -> ());
142153
ch = (if src = "" then hackyEOFChar else String.unsafe_get src 0);
143154
offset = 0;
155+
offset16 = 0;
144156
lineOffset = 0;
145157
lnum = 1;
146158
mode = [];
@@ -847,6 +859,7 @@ let rec scan scanner =
847859
| ch, _ ->
848860
next scanner;
849861
let offset = scanner.offset in
862+
let offset16 = scanner.offset16 in
850863
let codepoint, length =
851864
Res_utf8.decodeCodePoint scanner.offset scanner.src
852865
(String.length scanner.src)
@@ -863,6 +876,7 @@ let rec scan scanner =
863876
else (
864877
scanner.ch <- ch;
865878
scanner.offset <- offset;
879+
scanner.offset16 <- offset16;
866880
SingleQuote))
867881
| '!' -> (
868882
match (peek scanner, peek2 scanner) with

res_syntax/src/res_scanner.mli

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ type t = {
1111
Res_diagnostics.category ->
1212
unit;
1313
mutable ch: charEncoding; (* current character *)
14-
mutable offset: int; (* character offset *)
14+
mutable offset: int; (* current byte offset *)
15+
mutable offset16: int;
16+
(* current number of utf16 code units since line start *)
1517
mutable lineOffset: int; (* current line offset *)
1618
mutable lnum: int; (* current line number *)
1719
mutable mode: mode list;

0 commit comments

Comments
 (0)