Make intenral encoding of locations aware of unicode

cristianoc · cristianoc · commit d670834c8338 · 2023-03-14T15:22:42.000+01:00
When some unicode characters are present on a line, the existing encoding of positions, based on number of bytes since line start, is incorrect.
This can be seen in e.g. error messages picked up in the editor (or on the command-line).

This PR takes unicode into account. Even thought the ocaml locations are byte-based, one can trick the system by encoding as pos_cnum:
  (number of bytes from file start to line start) + (number of utf16 code units since line start)
Since the compiler's printer performs a subtraction, the utf16 character position is shown.
Notice that editors, vscode in particular, show you something in "Col", but its internal commands expect correct utf16 character which is different.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -52,6 +52,7 @@ These are only breaking changes for unformatted code.
 - Parser: fix location of variable when function definition `{v => ...}` is enclosed in braces https://github.com/rescript-lang/rescript-compiler/pull/5949
 - Fix issue with error messages for uncurried functions where expected and given type were swapped https://github.com/rescript-lang/rescript-compiler/pull/5973
 - Fix issue with integer overflow check https://github.com/rescript-lang/rescript-compiler/pull/6028
+- Make internal encoding of locations aware of unicode https://github.com/rescript-lang/rescript-compiler/pull/6073
 
 #### :nail_care: Polish
 
diff --git a/jscomp/build_tests/super_errors/expected/unicode_location.res.expected b/jscomp/build_tests/super_errors/expected/unicode_location.res.expected
@@ -0,0 +1,12 @@
+
+  [1;31mWe've found a bug for you![0m
+  [36m/.../fixtures/unicode_location.res[0m:[2m1:43[0m
+
+  [1;31m1[0m [2m│[0m let q = "💩💩💩💩💩💩💩💩�[1;31m�[0m��💩" ++ ("a" ++ 3 ++ "b")
+  2 [2m│[0m //                                         ^ character position 33 + 10 
+    [2m│[0m (unicode symbols of length 2)
+
+  This has type: [1;31mint[0m
+  Somewhere wanted: [1;33mstring[0m
+  
+  You can convert [1;33mint[0m to [1;33mstring[0m with [1;33mBelt.Int.toString[0m.
diff --git a/jscomp/build_tests/super_errors/fixtures/unicode_location.res b/jscomp/build_tests/super_errors/fixtures/unicode_location.res
@@ -0,0 +1,2 @@
+let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b")
+//                                         ^ character position 33 + 10 (unicode symbols of length 2)
diff --git a/res_syntax/src/res_core.ml b/res_syntax/src/res_core.ml
@@ -2469,6 +2469,7 @@ and parseAttributesAndBinding (p : Parser.t) =
   let err = p.scanner.err in
   let ch = p.scanner.ch in
   let offset = p.scanner.offset in
+  let offset16 = p.scanner.offset16 in
   let lineOffset = p.scanner.lineOffset in
   let lnum = p.scanner.lnum in
   let mode = p.scanner.mode in
@@ -2490,6 +2491,7 @@ and parseAttributesAndBinding (p : Parser.t) =
       p.scanner.err <- err;
       p.scanner.ch <- ch;
       p.scanner.offset <- offset;
+      p.scanner.offset16 <- offset16;
       p.scanner.lineOffset <- lineOffset;
       p.scanner.lnum <- lnum;
       p.scanner.mode <- mode;
diff --git a/res_syntax/src/res_parser.ml b/res_syntax/src/res_parser.ml
@@ -159,6 +159,7 @@ let lookahead p callback =
   let err = p.scanner.err in
   let ch = p.scanner.ch in
   let offset = p.scanner.offset in
+  let offset16 = p.scanner.offset16 in
   let lineOffset = p.scanner.lineOffset in
   let lnum = p.scanner.lnum in
   let mode = p.scanner.mode in
@@ -177,6 +178,7 @@ let lookahead p callback =
   p.scanner.err <- err;
   p.scanner.ch <- ch;
   p.scanner.offset <- offset;
+  p.scanner.offset16 <- offset16;
   p.scanner.lineOffset <- lineOffset;
   p.scanner.lnum <- lnum;
   p.scanner.mode <- mode;
diff --git a/res_syntax/src/res_scanner.ml b/res_syntax/src/res_scanner.ml
@@ -19,7 +19,9 @@ type t = {
     Diagnostics.category ->
     unit;
   mutable ch: charEncoding; (* current character *)
-  mutable offset: int; (* character offset *)
+  mutable offset: int; (* current byte offset *)
+  mutable offset16: int;
+      (* current number of utf16 code units since line start *)
   mutable lineOffset: int; (* current line offset *)
   mutable lnum: int; (* current line number *)
   mutable mode: mode list;
@@ -51,12 +53,11 @@ let position scanner =
       (* line number *)
       pos_lnum = scanner.lnum;
       (* offset of the beginning of the line (number
-         of characters between the beginning of the scanner and the beginning
+         of bytes between the beginning of the scanner and the beginning
          of the line) *)
       pos_bol = scanner.lineOffset;
-      (* [pos_cnum] is the offset of the position (number of
-         characters between the beginning of the scanner and the position). *)
-      pos_cnum = scanner.offset;
+      (* [pos_cnum - pos_bol]  is the number of utf16 code units since line start *)
+      pos_cnum = scanner.lineOffset + scanner.offset16;
     }
 
 (* Small debugging util
@@ -95,19 +96,29 @@ let _printDebug ~startPos ~endPos scanner token =
 
 let next scanner =
   let nextOffset = scanner.offset + 1 in
-  (match scanner.ch with
-  | '\n' ->
-    scanner.lineOffset <- nextOffset;
-    scanner.lnum <- scanner.lnum + 1
+  let utf16len =
+    match Ext_utf8.classify scanner.ch with
+    | Single _ | Invalid -> 1
+    | Leading (n, _) -> ( (((n + 1) / 2) [@doesNotRaise]))
+    | Cont _ -> 0
+  in
+  let newline =
+    scanner.ch = '\n'
     (* What about CRLF (\r + \n) on windows?
-     * \r\n will always be terminated by a \n
-     * -> we can just bump the line count on \n *)
-  | _ -> ());
+       \r\n will always be terminated by a \n
+       -> we can just bump the line count on \n *)
+  in
+  if newline then (
+    scanner.lineOffset <- nextOffset;
+    scanner.offset16 <- 0;
+    scanner.lnum <- scanner.lnum + 1)
+  else scanner.offset16 <- scanner.offset16 + utf16len;
   if nextOffset < String.length scanner.src then (
     scanner.offset <- nextOffset;
-    scanner.ch <- String.unsafe_get scanner.src scanner.offset)
+    scanner.ch <- String.unsafe_get scanner.src nextOffset)
   else (
     scanner.offset <- String.length scanner.src;
+    scanner.offset16 <- scanner.offset - scanner.lineOffset;
     scanner.ch <- hackyEOFChar)
 
 let next2 scanner =
@@ -141,6 +152,7 @@ let make ~filename src =
     err = (fun ~startPos:_ ~endPos:_ _ -> ());
     ch = (if src = "" then hackyEOFChar else String.unsafe_get src 0);
     offset = 0;
+    offset16 = 0;
     lineOffset = 0;
     lnum = 1;
     mode = [];
@@ -847,6 +859,7 @@ let rec scan scanner =
       | ch, _ ->
         next scanner;
         let offset = scanner.offset in
+        let offset16 = scanner.offset16 in
         let codepoint, length =
           Res_utf8.decodeCodePoint scanner.offset scanner.src
             (String.length scanner.src)
@@ -863,6 +876,7 @@ let rec scan scanner =
         else (
           scanner.ch <- ch;
           scanner.offset <- offset;
+          scanner.offset16 <- offset16;
           SingleQuote))
     | '!' -> (
       match (peek scanner, peek2 scanner) with
diff --git a/res_syntax/src/res_scanner.mli b/res_syntax/src/res_scanner.mli
@@ -11,7 +11,9 @@ type t = {
     Res_diagnostics.category ->
     unit;
   mutable ch: charEncoding; (* current character *)
-  mutable offset: int; (* character offset *)
+  mutable offset: int; (* current byte offset *)
+  mutable offset16: int;
+      (* current number of utf16 code units since line start *)
   mutable lineOffset: int; (* current line offset *)
   mutable lnum: int; (* current line number *)
   mutable mode: mode list;

-Original file line number
+Diff line change
@@ @@ -0,0 +1,12 @@ @@
++
 +  [1;31mWe've found a bug for you![0m
 +  [36m/.../fixtures/unicode_location.res[0m:[2m1:43[0m
++
 +  [1;31m1[0m [2m│[0m let q = "💩💩💩💩💩💩💩💩�[1;31m�[0m��💩" ++ ("a" ++ 3 ++ "b")
 +  2 [2m│[0m //                                         ^ character position 33 + 10
 +    [2m│[0m (unicode symbols of length 2)
++
 +  This has type: [1;31mint[0m
 +  Somewhere wanted: [1;33mstring[0m
++
 +  You can convert [1;33mint[0m to [1;33mstring[0m with [1;33mBelt.Int.toString[0m.
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+let q = "💩💩💩💩💩💩💩💩💩💩" ++ ("a" ++ 3 ++ "b")`
	`2`	`+// ^ character position 33 + 10 (unicode symbols of length 2)`