Use slices instead of cursors (#83)

chtenb · web-flow · commit 04fdd95b9ba2 · 2022-03-05T09:56:26.000+01:00
* improve performance of Char parsers with fixed domain
* improve performance of code unit string parser
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ Notable changes to this project are documented in this file. The format is based
 ## [Unreleased]
 
 Breaking changes:
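+- Issue #77: Fix CodePoint parser quadratic performance (#83 by @chtenb). The `PosString` type now holds the remaining unparsed substring and its position (`{ substring, position }`) instead of the full input string and a cursor (`{ str, pos }`). This change is breaking, but it triggers compile errors everywhere a `PosString` is constructed or its fields are accessed.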
 - Fix semantics of endBy and sepEndBy parser combinators (#84 by @chtenb)
 
 New features:
diff --git a/bench/Main.purs b/bench/Main.purs
@@ -59,21 +59,21 @@ main = do
     $ \_ -> runParser parse23AnyCharUnits string23_10000
 
   log "StringParser.runParser parse23DigitPoints"
-  benchWith 20
+  benchWith 200
     $ \_ -> runParser parse23DigitPoints string23_10000
   log "StringParser.runParser parse23DigitUnits"
   benchWith 200
     $ \_ -> runParser parse23DigitUnits string23_10000
 
   log "StringParser.runParser parse23StringPoints"
-  benchWith 20
+  benchWith 200
     $ \_ -> runParser parse23StringPoints string23_10000
   log "StringParser.runParser parse23StringUnits"
   benchWith 200
     $ \_ -> runParser parse23StringUnits string23_10000
 
   log "StringParser.runParser parse23RegexPoints"
-  benchWith 20
+  benchWith 200
     $ \_ -> runParser parse23RegexPoints string23_10000
   log "StringParser.runParser parse23RegexUnits"
   benchWith 200
diff --git a/src/Text/Parsing/StringParser.purs b/src/Text/Parsing/StringParser.purs
@@ -9,21 +9,19 @@ import Control.MonadPlus (class MonadPlus, class MonadZero, class Alternative)
 import Control.Monad.Rec.Class (class MonadRec, tailRecM, Step(..))
 import Control.Plus (class Plus, class Alt)
 import Control.Lazy (class Lazy)
-import Data.Bifunctor (lmap)
 import Data.Either (Either(..))
 
 -- | A position in an input string.
 type Pos = Int
 
--- | Strings are represented as a string with an index from the
+-- | Parser input is represented as the remaining substring with its index from the
 -- | start of the string.
 -- |
--- | `{ str: s, pos: n }` is interpreted as the substring of `s`
--- | starting at index n.
+-- | `{ substring: s, position: n }` is interpreted as the substring `s`
+-- | starting at index n of the original string.
 -- |
--- | This allows us to avoid repeatedly finding substrings
--- | every time we match a character.
-type PosString = { str :: String, pos :: Pos }
+-- | The position is kept for error messages and for detecting whether a failed parser consumed input (see `alt`).
+type PosString = { substring :: String, position :: Pos }
 
 -- | The type of parsing errors.
 type ParseError = { error :: String, pos :: Pos }
@@ -43,7 +41,7 @@ unParser (Parser p) = p
 -- | Run a parser for an input string. See also `printParserError`
 -- | and `unParser` for more flexible usages.
 runParser :: forall a. Parser a -> String -> Either ParseError a
-runParser (Parser p) s = map _.result (p { str: s, pos: 0 })
+runParser (Parser p) s = map _.result (p { substring: s, position: 0 })
 
 -- | Prints a ParseError's the error message and the position of the error.
 printParserError :: ParseError -> String
@@ -65,7 +63,7 @@ instance altParser :: Alt Parser where
   alt (Parser p1) (Parser p2) = Parser \s ->
     case p1 s of
       Left { error, pos }
-        | s.pos == pos -> p2 s
+        | s.position == pos -> p2 s
         | otherwise -> Left { error, pos }
       right -> right
 
@@ -92,17 +90,20 @@ instance monadRecParser :: MonadRec Parser where
     split { result: Done b, suffix } = Done { result: b, suffix }
 
 instance lazyParser :: Lazy (Parser a) where
-  defer f = Parser $ \str -> unParser (f unit) str
+  defer f = Parser \str -> unParser (f unit) str
 
 -- | Fail with the specified message.
 fail :: forall a. String -> Parser a
-fail error = Parser \{ pos } -> Left { pos, error }
+fail error = Parser \{ position } -> Left { pos: position, error }
 
 -- | In case of error, the default behavior is to backtrack if no input was consumed.
 -- |
 -- | `try p` backtracks even if input was consumed.
 try :: forall a. Parser a -> Parser a
-try (Parser p) = Parser \(s@{ pos }) -> lmap (_ { pos = pos }) (p s)
+try (Parser p) = Parser \s ->
+  case p s of
+    Left { error } -> Left { pos: s.position, error }
+    right -> right
 
 instance semigroupParser :: Semigroup a => Semigroup (Parser a) where
   append = lift2 append
diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs
@@ -31,45 +31,48 @@ import Data.Either (Either(..))
 import Data.Enum (fromEnum)
 import Data.Foldable (class Foldable, foldMap, elem, notElem)
 import Data.Maybe (Maybe(..))
-import Data.String.CodePoints (codePointAt, drop, indexOf', length)
-import Data.String.CodeUnits (singleton)
-import Data.String.Pattern (Pattern(..))
+import Data.String.CodePoints as SCP
+import Data.String.CodeUnits as SCU
 import Data.String.Regex as Regex
 import Data.String.Regex.Flags (noFlags)
 import Text.Parsing.StringParser (Parser(..), try, fail)
 import Text.Parsing.StringParser.Combinators (many, (<?>))
+import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser
 
 -- | Match the end of the file.
 eof :: Parser Unit
 eof = Parser \s ->
   case s of
-    { str, pos } | pos < length str -> Left { pos, error: "Expected EOF" }
+    { substring, position } | 0 < SCU.length substring -> Left { pos: position, error: "Expected EOF" }
     _ -> Right { result: unit, suffix: s }
 
 -- | Match any character.
 anyChar :: Parser Char
-anyChar = Parser \{ str, pos } ->
-  case codePointAt pos str of
+anyChar = Parser \{ substring, position } ->
+  case SCP.codePointAt 0 substring of
     Just cp -> case toChar cp of
-      Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } }
-      Nothing -> Left { pos, error: "CodePoint " <> show cp <> " is not a character" }
-    Nothing -> Left { pos, error: "Unexpected EOF" }
+      Just chr -> Right { result: chr, suffix: { substring: SCP.drop 1 substring, position: position + 1 } }
+      Nothing -> Left { pos: position, error: "CodePoint " <> show cp <> " is not a character" }
+    Nothing -> Left { pos: position, error: "Unexpected EOF" }
   where
   toChar = fromCharCode <<< fromEnum
 
 -- | Match any digit.
 anyDigit :: Parser Char
 anyDigit = try do
-  c <- anyChar
+  c <- CodeUnitsParser.anyChar
   if c >= '0' && c <= '9' then pure c
   else fail $ "Character " <> show c <> " is not a digit"
 
 -- | Match the specified string.
 string :: String -> Parser String
-string nt = Parser \s ->
-  case s of
-    { str, pos } | indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + length nt } }
-    { pos } -> Left { pos, error: "Expected '" <> nt <> "'." }
+string pattern = Parser \{ substring, position } ->
+  let
+    length = SCP.length pattern
+    { before, after } = SCP.splitAt length substring
+  in
+    if before == pattern then Right { result: pattern, suffix: { substring: after, position: position + length } }
+    else Left { pos: position, error: "Expected '" <> pattern <> "'." }
 
 -- | Match a character satisfying the given predicate.
 satisfy :: (Char -> Boolean) -> Parser Char
@@ -86,7 +89,7 @@ char c = satisfy (_ == c) <?> "Could not match character " <> show c
 whiteSpace :: Parser String
 whiteSpace = do
   cs <- many (satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t')
-  pure (foldMap singleton cs)
+  pure (foldMap SCU.singleton cs)
 
 -- | Skip many whitespace characters.
 skipSpaces :: Parser Unit
@@ -103,14 +106,14 @@ noneOf = satisfy <<< flip notElem
 -- | Match any lower case character.
 lowerCaseChar :: Parser Char
 lowerCaseChar = try do
-  c <- anyChar
+  c <- CodeUnitsParser.anyChar
   if toCharCode c `elem` (97 .. 122) then pure c
   else fail $ "Expected a lower case character but found " <> show c
 
 -- | Match any upper case character.
 upperCaseChar :: Parser Char
 upperCaseChar = try do
-  c <- anyChar
+  c <- CodeUnitsParser.anyChar
   if toCharCode c `elem` (65 .. 90) then pure c
   else fail $ "Expected an upper case character but found " <> show c
 
@@ -135,10 +138,9 @@ regex pat =
   pattern = "^(" <> pat <> ")"
 
   matchRegex :: Regex.Regex -> Parser String
-  matchRegex r = Parser \{ str, pos } -> do
-    let remainder = drop pos str
-    case NEA.head <$> Regex.match r remainder of
+  matchRegex r = Parser \{ substring, position } -> do
+    case NEA.head <$> Regex.match r substring of
       Just (Just matched) ->
-        Right { result: matched, suffix: { str, pos: pos + length matched } }
+        Right { result: matched, suffix: { substring: SCP.drop (SCP.length matched) substring, position: position + SCP.length matched } }
       _ ->
-        Left { pos, error: "no match" }
+        Left { pos: position, error: "no match" }
diff --git a/src/Text/Parsing/StringParser/CodeUnits.purs b/src/Text/Parsing/StringParser/CodeUnits.purs
@@ -32,7 +32,6 @@ import Data.Foldable (class Foldable, foldMap, elem, notElem)
 import Data.Maybe (Maybe(..))
 import Data.String.CodeUnits (charAt, singleton)
 import Data.String.CodeUnits as SCU
-import Data.String.Pattern (Pattern(..))
 import Data.String.Regex as Regex
 import Data.String.Regex.Flags (noFlags)
 import Text.Parsing.StringParser (Parser(..), try, fail)
@@ -42,15 +41,15 @@ import Text.Parsing.StringParser.Combinators (many, (<?>))
 eof :: Parser Unit
 eof = Parser \s ->
   case s of
-    { str, pos } | pos < SCU.length str -> Left { pos, error: "Expected EOF" }
+    { substring, position } | 0 < SCU.length substring -> Left { pos: position, error: "Expected EOF" }
     _ -> Right { result: unit, suffix: s }
 
 -- | Match any character.
 anyChar :: Parser Char
-anyChar = Parser \{ str, pos } ->
-  case charAt pos str of
-    Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } }
-    Nothing -> Left { pos, error: "Unexpected EOF" }
+anyChar = Parser \{ substring, position } ->
+  case charAt 0 substring of
+    Just chr -> Right { result: chr, suffix: { substring: SCU.drop 1 substring, position: position + 1 } }
+    Nothing -> Left { pos: position, error: "Unexpected EOF" }
 
 -- | Match any digit.
 anyDigit :: Parser Char
@@ -61,10 +60,13 @@ anyDigit = try do
 
 -- | Match the specified string.
 string :: String -> Parser String
-string nt = Parser \s ->
-  case s of
-    { str, pos } | SCU.indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + SCU.length nt } }
-    { pos } -> Left { pos, error: "Expected '" <> nt <> "'." }
+string pattern = Parser \{ substring, position } ->
+  let
+    length = SCU.length pattern
+    { before, after } = SCU.splitAt length substring
+  in
+    if before == pattern then Right { result: pattern, suffix: { substring: after, position: position + length } }
+    else Left { pos: position, error: "Expected '" <> pattern <> "'." }
 
 -- | Match a character satisfying the given predicate.
 satisfy :: (Char -> Boolean) -> Parser Char
@@ -130,10 +132,9 @@ regex pat =
   pattern = "^(" <> pat <> ")"
 
   matchRegex :: Regex.Regex -> Parser String
-  matchRegex r = Parser \{ str, pos } -> do
-    let remainder = SCU.drop pos str
-    case NEA.head <$> Regex.match r remainder of
+  matchRegex r = Parser \{ substring, position } -> do
+    case NEA.head <$> Regex.match r substring of
       Just (Just matched) ->
-        Right { result: matched, suffix: { str, pos: pos + SCU.length matched } }
+        Right { result: matched, suffix: { substring: SCU.drop (SCU.length matched) substring, position: position + SCU.length matched } }
       _ ->
-        Left { pos, error: "no match" }
+        Left { pos: position, error: "no match" }
diff --git a/src/Text/Parsing/StringParser/Combinators.purs b/src/Text/Parsing/StringParser/Combinators.purs
@@ -1,7 +1,6 @@
 -- | This module defines combinators for building string parsers.
 module Text.Parsing.StringParser.Combinators
-  ( lookAhead
-  , many
+  ( many
   , many1
   , withError
   , (<?>)
@@ -22,6 +21,7 @@ module Text.Parsing.StringParser.Combinators
   , choice
   , manyTill
   , many1Till
+  , lookAhead
   , module Control.Lazy
   ) where
 
diff --git a/test/CodePoints.purs b/test/CodePoints.purs
@@ -15,7 +15,7 @@ import Data.Unfoldable (replicate)
 import Effect (Effect)
 import Effect.Class.Console (log)
 import Test.Assert (assert', assert)
-import Text.Parsing.StringParser (Parser, runParser, try)
+import Text.Parsing.StringParser (ParseError, Parser(..), PosString, runParser, try)
 import Text.Parsing.StringParser.CodePoints (anyDigit, char, eof, string, anyChar, regex)
 import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between)
 import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser)
@@ -60,6 +60,9 @@ tryTest =
   try (string "aa" <> string "bb") <|>
     (string "aa" <> string "cc")
 
+testParser :: forall a. Parser a -> String -> Either ParseError { result :: a, suffix :: PosString }
+testParser (Parser p) s = p { substring: s, position: 0 }
+
 canParse :: forall a. Parser a -> String -> Boolean
 canParse p input = isRight $ runParser p input
 
@@ -69,6 +72,12 @@ parseFail p input = isLeft $ runParser p input
 expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean
 expectResult res p input = runParser p input == Right res
 
+expectPosition :: forall a. Int -> Parser a -> String -> Boolean
+expectPosition pos p input =
+  case testParser p input of
+    Right r -> r.suffix.position == pos
+    Left _ -> false
+
 testCodePoints :: Effect Unit
 testCodePoints = do
 
@@ -110,6 +119,8 @@ testCodePoints = do
   assert $ expectResult "\x458CA" (string "\x458CA" <* char ']' <* eof) "\x458CA]"
   assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof) "\x458CA)"
   assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof) "\xEEE2"
+  assert $ expectPosition 1 anyChar "\xEEE2"
+  assert $ expectPosition 1 anyChar "\x458CA"
 
   log "Running overflow tests (may take a while)"
 
diff --git a/test/Examples.purs b/test/Examples.purs
@@ -10,7 +10,7 @@ import Effect (Effect)
 import Effect.Console (log, logShow)
 import Text.Parsing.StringParser (Parser, fail, runParser, unParser)
 import Text.Parsing.StringParser.CodePoints (anyChar, char, eof, regex, skipSpaces, string)
-import Text.Parsing.StringParser.Combinators (between, endBy1, lookAhead, many, many1, sepBy1, (<?>))
+import Text.Parsing.StringParser.Combinators (between, lookAhead, endBy1, many, many1, sepBy1, (<?>))
 
 -- Serves only to make this file runnable
 main :: Effect Unit
@@ -234,7 +234,7 @@ doBoth parserName parser content = do
 doUnParser :: forall a. Show a => String -> Parser a -> String -> Effect Unit
 doUnParser parserName parser content = do
   log $ "(unParser) Parsing content with '" <> parserName <> "'"
-  case unParser parser { str: content, pos: 0 } of
+  case unParser parser { substring: content, position: 0 } of
     Left rec -> log $ "Position: " <> show rec.pos
       <>
         "\n\
diff --git a/test/Main.purs b/test/Main.purs
@@ -13,8 +13,8 @@ main = do
   log "Running basic spec test cases\n"
   runTestCases
 
-  log "\n\nTesting CodePoint parsing\n"
-  testCodePoints
-
   log "\n\nTesting CodeUnit parsing\n"
   testCodeUnits
+
+  log "\n\nTesting CodePoint parsing\n"
+  testCodePoints