Add index field to Position

jamesdbrock · jamesdbrock · commit 7b31d812da55 · 2022-04-01T21:50:24.000+09:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,7 @@ Breaking changes:
   without causing issues with `<$>`.
 - Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock)
 - Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock)
+- Add the `index` field to `Position`. (#171 by @jamesdbrock)
 
 New features:
 
diff --git a/bench/Main.purs b/bench/Main.purs
@@ -57,7 +57,6 @@ import Bench.Json.Parsing as BenchParsing
 import Bench.Json.StringParser as BenchStringParser
 import Bench.Json.TestData (largeJson, mediumJson, smallJson)
 import Control.Monad.Trampoline (runTrampoline)
-import Control.Monad.Free (liftF)
 import Data.Array (fold, replicate)
 import Data.Either (either)
 import Data.List (many, manyRec)
diff --git a/src/Parsing/Indent.purs b/src/Parsing/Indent.purs
@@ -85,18 +85,6 @@ get' = do
 put' :: forall s. Position -> IndentParser s Unit
 put' p = lift (put p)
 
-sourceColumn :: Position -> Int
-sourceColumn (Position { line: _, column: c }) = c
-
-sourceLine :: Position -> Int
-sourceLine (Position { line: l, column: _ }) = l
-
-setSourceLine :: Position -> Int -> Position
-setSourceLine (Position { line: _, column: c }) l = Position { line: l, column: c }
-
-biAp :: forall a b c. (a -> b) -> (b -> b -> c) -> a -> a -> c
-biAp f c v1 v2 = c (f v1) (f v2)
-
 many1 :: forall s m a. ParserT s m a -> ParserT s m (List a)
 many1 p = lift2 Cons p (many p)
 
@@ -121,19 +109,17 @@ withBlock' = withBlock (flip const)
 -- | Parses only when indented past the level of the reference
 indented :: forall s. IndentParser s Unit
 indented = do
-  pos <- position
-  s <- get'
-  if biAp sourceColumn (<=) pos s then fail "not indented"
-  else do
-    put' $ setSourceLine s (sourceLine pos)
-    pure unit
+  Position p <- position
+  Position s <- get'
+  if p.column <= s.column then fail "not indented"
+  else put' $ Position { index: 0, line: p.line, column: s.column }
 
 -- | Same as `indented`, but does not change internal state
 indented' :: forall s. IndentParser s Unit
 indented' = do
-  pos <- position
-  s <- get'
-  if biAp sourceColumn (<=) pos s then fail "not indented" else pure unit
+  Position p <- position
+  Position s <- get'
+  if p.column <= s.column then fail "not indented" else pure unit
 
 -- | Parses only when indented past the level of the reference or on the same line
 sameOrIndented :: forall s. IndentParser s Unit
@@ -142,9 +128,9 @@ sameOrIndented = sameLine <|> indented
 -- | Parses only on the same line as the reference
 sameLine :: forall s. IndentParser s Unit
 sameLine = do
-  pos <- position
-  s <- get'
-  if biAp sourceLine (==) pos s then pure unit else fail "over one line"
+  Position p <- position
+  Position s <- get'
+  if p.line == s.line then pure unit else fail "over one line"
 
 -- | Parses a block of lines at the same indentation level
 block1 :: forall s a. IndentParser s a -> IndentParser s (List a)
@@ -169,9 +155,9 @@ withPos x = do
 -- | Ensures the current indentation level matches that of the reference
 checkIndent :: forall s. IndentParser s Unit
 checkIndent = do
-  s <- get'
-  p <- position
-  if biAp sourceColumn (==) p s then pure unit else fail "indentation doesn't match"
+  Position p <- position
+  Position s <- get'
+  if p.column == s.column then pure unit else fail "indentation doesn't match"
 
 -- | Run the result of an indentation sensitive parse
 runIndent :: forall a. State Position a -> a
diff --git a/src/Parsing/Pos.purs b/src/Parsing/Pos.purs
@@ -3,25 +3,30 @@ module Parsing.Pos where
 import Prelude
 
 import Data.Generic.Rep (class Generic)
+import Data.Show.Generic (genericShow)
 
 -- | `Position` represents the position of the parser in the input.
 -- |
--- | - `line` is the current line in the input
--- | - `column` is the column of the next character in the current line that will be parsed
+-- | - `index` is the position since the start of the input. Starts at 0.
+-- | - `line` is the current line in the input. Starts at 1.
+-- | - `column` is the column of the next character in the current line that
+-- |   will be parsed. Starts at 1.
 newtype Position = Position
-  { line :: Int
+  { index :: Int
+  , line :: Int
   , column :: Int
   }
 
-derive instance genericPosition :: Generic Position _
+derive instance Generic Position _
+instance Show Position where
+  show x = genericShow x
 
-instance showPosition :: Show Position where
-  show (Position { line: line, column: column }) =
-    "(Position { line: " <> show line <> ", column: " <> show column <> " })"
+instance Eq Position where
+  eq (Position l) (Position r) = l.index == r.index
 
-derive instance eqPosition :: Eq Position
-derive instance ordPosition :: Ord Position
+instance Ord Position where
+  compare (Position l) (Position r) = compare l.index r.index
 
 -- | The `Position` before any input has been parsed.
 initialPos :: Position
-initialPos = Position { line: 1, column: 1 }
+initialPos = Position { index: 0, line: 1, column: 1 }
diff --git a/src/Parsing/String.purs b/src/Parsing/String.purs
@@ -19,6 +19,20 @@
 -- | The other primitive parsers, which return `CodePoint` and `String` types,
 -- | can parse the full Unicode character set. All of the primitive parsers
 -- | in this module can be used together.
+-- |
+-- | ### Position
+-- |
+-- | In a `String` parser, the `Position {index}` counts the number of
+-- | unicode `CodePoint`s since the beginning of the input string.
+-- |
+-- | Each tab character (`0x09`) encountered in a `String` parser will advance
+-- | the `Position {column}` by 8.
+-- |
+-- | These patterns will advance the `Position {line}` by 1 and reset
+-- | the `Position {column}` to 1:
+-- | - newline (`0x0A`)
+-- | - carriage-return (`0x0D`)
+-- | - carriage-return-newline (`0x0D 0x0A`)
 module Parsing.String
   ( string
   , eof
@@ -187,14 +201,14 @@ updatePosString pos before after = case uncons before of
 -- | Updates a `Position` by adding the columns and lines in a
 -- | single `CodePoint`.
 updatePosSingle :: Position -> CodePoint -> String -> Position
-updatePosSingle (Position { line, column }) cp after = case fromEnum cp of
-  10 -> Position { line: line + 1, column: 1 } -- "\n"
+updatePosSingle (Position { index, line, column }) cp after = case fromEnum cp of
+  10 -> Position { index: index + 1, line: line + 1, column: 1 } -- "\n"
   13 ->
     case codePointAt 0 after of
-      Just nextCp | fromEnum nextCp == 10 -> Position { line, column } -- "\r\n" lookahead
-      _ -> Position { line: line + 1, column: 1 } -- "\r"
-  9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
-  _ -> Position { line, column: column + 1 }
+      Just nextCp | fromEnum nextCp == 10 -> Position { index: index + 1, line, column } -- "\r\n" lookahead
+      _ -> Position { index: index + 1, line: line + 1, column: 1 } -- "\r"
+  9 -> Position { index: index + 1, line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
+  _ -> Position { index: index + 1, line, column: column + 1 }
 
 -- | Combinator which returns both the result of a parse and the slice of
 -- | the input that was consumed while it was being parsed.
diff --git a/test/Main.purs b/test/Main.purs
@@ -128,7 +128,7 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (many1TillRec (string "a") (string "b"))
     "baa"
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   parseTest "a,a,a,b,a,a" (toUnfoldable [ "a", "a", "a" ]) $
     sepEndByRec (string "a") (string ",")
@@ -144,7 +144,7 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (sepEndBy1Rec (string "a") (string ","))
     "b,a,a"
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   -- 8 `div` (8 `div` 2) == 2
   parseTest "8x8x2" 2 $
@@ -156,7 +156,7 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (chainr1Rec digit (string "x" $> div))
     ""
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   -- (8 `div` 2) `div` 2 == 2
   parseTest "8x2x2" 2 $
@@ -168,15 +168,15 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (chainl1Rec digit (string "x" $> div))
     ""
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   parseTest "aaaabcd" "b"
     $ skipMany1Rec (string "a")
         *> string "b"
   parseErrorTestPosition
     (skipMany1Rec (string "a"))
     "bcd"
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   parseTest "aaaabcd" "b"
     $ skipManyRec (string "a")
@@ -190,7 +190,7 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (many1Rec (string "a"))
     ""
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
 
   parseTest "a,a,ab" (toUnfoldable [ "a", "a", "a" ])
     $ sepByRec (string "a") (string ",")
@@ -204,11 +204,11 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (sepBy1Rec (string "a") (string ","))
     ""
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
   parseErrorTestPosition
     (sepBy1Rec (string "a") (string ","))
     "a,"
-    (Position { line: 1, column: 3 })
+    (Position { index: 2, line: 1, column: 3 })
 
   parseTest "a,a,a,b" (toUnfoldable [ "a", "a", "a" ])
     $ endByRec (string "a") (string ",")
@@ -222,11 +222,11 @@ stackSafeLoopsTest = do
   parseErrorTestPosition
     (endBy1Rec (string "a") (string ","))
     ""
-    (Position { line: 1, column: 1 })
+    (Position { index: 0, line: 1, column: 1 })
   parseErrorTestPosition
     (endBy1Rec (string "a") (string ","))
     "a,a"
-    (Position { line: 1, column: 4 })
+    (Position { index: 3, line: 1, column: 4 })
 
 data TestToken = A | B
 
@@ -247,10 +247,7 @@ testTokenParser :: TokenParser
 testTokenParser = makeTokenParser haskellDef
 
 mkPos :: Int -> Position
-mkPos n = mkPos' n 1
-
-mkPos' :: Int -> Int -> Position
-mkPos' column line = Position { column: column, line: line }
+mkPos n = Position { index: n - 1, line: 1, column: n }
 
 type TestM = Effect Unit
 
@@ -577,12 +574,12 @@ main = do
   parseErrorTestPosition
     (many $ char 'f' *> char '?')
     "foo"
-    (Position { column: 2, line: 1 })
+    (Position { index: 1, column: 2, line: 1 })
 
   parseErrorTestPosition
     (satisfy (_ == '?'))
     "foo"
-    (Position { column: 1, line: 1 })
+    (Position { index: 0, column: 1, line: 1 })
 
   parseTest
     "foo"
@@ -607,17 +604,17 @@ main = do
 
   parseTest "rest" "rest" rest
   parseTest "rest" unit (rest *> eof)
-  parseTest "rest\nrest" (Position { line: 2, column: 5 }) (rest *> position)
+  parseTest "rest\nrest" (Position { index: 9, line: 2, column: 5 }) (rest *> position)
 
   parseErrorTestPosition
     (rest *> notFollowedBy eof)
     "aa\naa"
-    (Position { column: 3, line: 2 })
+    (Position { index: 5, column: 3, line: 2 })
 
   parseErrorTestPosition
-    anyChar
-    "𝅘𝅥𝅯"
-    (Position { column: 1, line: 1 })
+    (string "𝅘𝅥𝅘𝅥𝅮" *> string "𝅘𝅥𝅘𝅥𝅮")
+    "𝅘𝅥𝅘𝅥𝅮x𝅘𝅥𝅯"
+    (Position { index: 2, column: 3, line: 1 })
 
   parseTest "𝅘𝅥𝅘𝅥𝅮x𝅘𝅥𝅯" [ "𝅘𝅥", "𝅘𝅥𝅮", "x", "𝅘𝅥𝅯" ] do
     quarter <- anyCodePoint
@@ -633,8 +630,8 @@ main = do
 
   parseTest "abcd" "ab" $ takeN 2
   parseTest "abcd" "" $ takeN 0
-  parseErrorTestPosition (takeN 10) "abcd" (Position { column: 1, line: 1 })
-  parseErrorTestPosition (takeN (-1)) "abcd" (Position { column: 1, line: 1 })
+  parseErrorTestPosition (takeN 10) "abcd" (Position { index: 0, column: 1, line: 1 })
+  parseErrorTestPosition (takeN (-1)) "abcd" (Position { index: 0, column: 1, line: 1 })
 
   parseErrorTestMessage
     (noneOfCodePoints $ SCP.toCodePointArray "❓✅")
@@ -677,10 +674,10 @@ main = do
   parseTest "ababab" [ 'b', 'b', 'b' ] $ Array.many (char 'a' *> char 'b')
   parseTest "abaXab" [ 'b' ] $ Array.many (try (char 'a' *> char 'b'))
 
-  parseErrorTestPosition (string "abc") "bcd" (Position { column: 1, line: 1 })
-  parseErrorTestPosition (string "abc" *> eof) "abcdefg" (Position { column: 4, line: 1 })
-  parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { column: 1, line: 4 })
-  parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { column: 10, line: 1 })
+  parseErrorTestPosition (string "abc") "bcd" (Position { index: 0, column: 1, line: 1 })
+  parseErrorTestPosition (string "abc" *> eof) "abcdefg" (Position { index: 3, column: 4, line: 1 })
+  parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { index: 6, column: 1, line: 4 })
+  parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { index: 2, column: 10, line: 1 })
 
   log "\nTESTS number\n"