Skip to content

Commit 04fdd95

Browse files
authored
Use slices instead of cursors (#83)
* improve performance of Char parsers with fixed domain * improve performance of code unit string parser
1 parent a8ecb39 commit 04fdd95

File tree

9 files changed

+76
-60
lines changed

9 files changed

+76
-60
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Notable changes to this project are documented in this file. The format is based
55
## [Unreleased]
66

77
Breaking changes:
8+
- Issue #77: Fix CodePoint parser quadratic performance (#83 by @chtenb). The parser now tracks the remaining unparsed substring. This change is breaking, but will trigger compile errors in all places where this definition is used.
89
- Fix semantics of endBy and sepEndBy parser combinators (#84 by @chtenb)
910

1011
New features:

bench/Main.purs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,21 +59,21 @@ main = do
5959
$ \_ -> runParser parse23AnyCharUnits string23_10000
6060

6161
log "StringParser.runParser parse23DigitPoints"
62-
benchWith 20
62+
benchWith 200
6363
$ \_ -> runParser parse23DigitPoints string23_10000
6464
log "StringParser.runParser parse23DigitUnits"
6565
benchWith 200
6666
$ \_ -> runParser parse23DigitUnits string23_10000
6767

6868
log "StringParser.runParser parse23StringPoints"
69-
benchWith 20
69+
benchWith 200
7070
$ \_ -> runParser parse23StringPoints string23_10000
7171
log "StringParser.runParser parse23StringUnits"
7272
benchWith 200
7373
$ \_ -> runParser parse23StringUnits string23_10000
7474

7575
log "StringParser.runParser parse23RegexPoints"
76-
benchWith 20
76+
benchWith 200
7777
$ \_ -> runParser parse23RegexPoints string23_10000
7878
log "StringParser.runParser parse23RegexUnits"
7979
benchWith 200

src/Text/Parsing/StringParser.purs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,19 @@ import Control.MonadPlus (class MonadPlus, class MonadZero, class Alternative)
99
import Control.Monad.Rec.Class (class MonadRec, tailRecM, Step(..))
1010
import Control.Plus (class Plus, class Alt)
1111
import Control.Lazy (class Lazy)
12-
import Data.Bifunctor (lmap)
1312
import Data.Either (Either(..))
1413

1514
-- | A position in an input string.
1615
type Pos = Int
1716

18-
-- | Strings are represented as a string with an index from the
17+
-- | Strings are represented as a substring with an index from the
1918
-- | start of the string.
2019
-- |
21-
-- | `{ str: s, pos: n }` is interpreted as the substring of `s`
22-
-- | starting at index n.
20+
-- | `{ substring: s, position: n }` is interpreted as the substring `s`
21+
-- | starting at index n of the original string.
2322
-- |
24-
-- | This allows us to avoid repeatedly finding substrings
25-
-- | every time we match a character.
26-
type PosString = { str :: String, pos :: Pos }
23+
-- | The position is only kept for error messaging.
24+
type PosString = { substring :: String, position :: Pos }
2725

2826
-- | The type of parsing errors.
2927
type ParseError = { error :: String, pos :: Pos }
@@ -43,7 +41,7 @@ unParser (Parser p) = p
4341
-- | Run a parser for an input string. See also `printParserError`
4442
-- | and `unParser` for more flexible usages.
4543
runParser :: forall a. Parser a -> String -> Either ParseError a
46-
runParser (Parser p) s = map _.result (p { str: s, pos: 0 })
44+
runParser (Parser p) s = map _.result (p { substring: s, position: 0 })
4745

4846
-- | Prints a ParseError's the error message and the position of the error.
4947
printParserError :: ParseError -> String
@@ -65,7 +63,7 @@ instance altParser :: Alt Parser where
6563
alt (Parser p1) (Parser p2) = Parser \s ->
6664
case p1 s of
6765
Left { error, pos }
68-
| s.pos == pos -> p2 s
66+
| s.position == pos -> p2 s
6967
| otherwise -> Left { error, pos }
7068
right -> right
7169

@@ -92,17 +90,20 @@ instance monadRecParser :: MonadRec Parser where
9290
split { result: Done b, suffix } = Done { result: b, suffix }
9391

9492
instance lazyParser :: Lazy (Parser a) where
95-
defer f = Parser $ \str -> unParser (f unit) str
93+
defer f = Parser \str -> unParser (f unit) str
9694

9795
-- | Fail with the specified message.
9896
fail :: forall a. String -> Parser a
99-
fail error = Parser \{ pos } -> Left { pos, error }
97+
fail error = Parser \{ position } -> Left { pos: position, error }
10098

10199
-- | In case of error, the default behavior is to backtrack if no input was consumed.
102100
-- |
103101
-- | `try p` backtracks even if input was consumed.
104102
try :: forall a. Parser a -> Parser a
105-
try (Parser p) = Parser \(s@{ pos }) -> lmap (_ { pos = pos }) (p s)
103+
try (Parser p) = Parser \s ->
104+
case p s of
105+
Left { error } -> Left { pos: s.position, error }
106+
right -> right
106107

107108
instance semigroupParser :: Semigroup a => Semigroup (Parser a) where
108109
append = lift2 append

src/Text/Parsing/StringParser/CodePoints.purs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,45 +31,48 @@ import Data.Either (Either(..))
3131
import Data.Enum (fromEnum)
3232
import Data.Foldable (class Foldable, foldMap, elem, notElem)
3333
import Data.Maybe (Maybe(..))
34-
import Data.String.CodePoints (codePointAt, drop, indexOf', length)
35-
import Data.String.CodeUnits (singleton)
36-
import Data.String.Pattern (Pattern(..))
34+
import Data.String.CodePoints as SCP
35+
import Data.String.CodeUnits as SCU
3736
import Data.String.Regex as Regex
3837
import Data.String.Regex.Flags (noFlags)
3938
import Text.Parsing.StringParser (Parser(..), try, fail)
4039
import Text.Parsing.StringParser.Combinators (many, (<?>))
40+
import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser
4141

4242
-- | Match the end of the file.
4343
eof :: Parser Unit
4444
eof = Parser \s ->
4545
case s of
46-
{ str, pos } | pos < length str -> Left { pos, error: "Expected EOF" }
46+
{ substring, position } | 0 < SCP.length substring -> Left { pos: position, error: "Expected EOF" }
4747
_ -> Right { result: unit, suffix: s }
4848

4949
-- | Match any character.
5050
anyChar :: Parser Char
51-
anyChar = Parser \{ str, pos } ->
52-
case codePointAt pos str of
51+
anyChar = Parser \{ substring, position } ->
52+
case SCP.codePointAt 0 substring of
5353
Just cp -> case toChar cp of
54-
Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } }
55-
Nothing -> Left { pos, error: "CodePoint " <> show cp <> " is not a character" }
56-
Nothing -> Left { pos, error: "Unexpected EOF" }
54+
Just chr -> Right { result: chr, suffix: { substring: SCP.drop 1 substring, position: position + 1 } }
55+
Nothing -> Left { pos: position, error: "CodePoint " <> show cp <> " is not a character" }
56+
Nothing -> Left { pos: position, error: "Unexpected EOF" }
5757
where
5858
toChar = fromCharCode <<< fromEnum
5959

6060
-- | Match any digit.
6161
anyDigit :: Parser Char
6262
anyDigit = try do
63-
c <- anyChar
63+
c <- CodeUnitsParser.anyChar
6464
if c >= '0' && c <= '9' then pure c
6565
else fail $ "Character " <> show c <> " is not a digit"
6666

6767
-- | Match the specified string.
6868
string :: String -> Parser String
69-
string nt = Parser \s ->
70-
case s of
71-
{ str, pos } | indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + length nt } }
72-
{ pos } -> Left { pos, error: "Expected '" <> nt <> "'." }
69+
string pattern = Parser \{ substring, position } ->
70+
let
71+
length = SCP.length pattern
72+
{ before, after } = SCP.splitAt length substring
73+
in
74+
if before == pattern then Right { result: pattern, suffix: { substring: after, position: position + length } }
75+
else Left { pos: position, error: "Expected '" <> pattern <> "'." }
7376

7477
-- | Match a character satisfying the given predicate.
7578
satisfy :: (Char -> Boolean) -> Parser Char
@@ -86,7 +89,7 @@ char c = satisfy (_ == c) <?> "Could not match character " <> show c
8689
whiteSpace :: Parser String
8790
whiteSpace = do
8891
cs <- many (satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t')
89-
pure (foldMap singleton cs)
92+
pure (foldMap SCU.singleton cs)
9093

9194
-- | Skip many whitespace characters.
9295
skipSpaces :: Parser Unit
@@ -103,14 +106,14 @@ noneOf = satisfy <<< flip notElem
103106
-- | Match any lower case character.
104107
lowerCaseChar :: Parser Char
105108
lowerCaseChar = try do
106-
c <- anyChar
109+
c <- CodeUnitsParser.anyChar
107110
if toCharCode c `elem` (97 .. 122) then pure c
108111
else fail $ "Expected a lower case character but found " <> show c
109112

110113
-- | Match any upper case character.
111114
upperCaseChar :: Parser Char
112115
upperCaseChar = try do
113-
c <- anyChar
116+
c <- CodeUnitsParser.anyChar
114117
if toCharCode c `elem` (65 .. 90) then pure c
115118
else fail $ "Expected an upper case character but found " <> show c
116119

@@ -135,10 +138,9 @@ regex pat =
135138
pattern = "^(" <> pat <> ")"
136139

137140
matchRegex :: Regex.Regex -> Parser String
138-
matchRegex r = Parser \{ str, pos } -> do
139-
let remainder = drop pos str
140-
case NEA.head <$> Regex.match r remainder of
141+
matchRegex r = Parser \{ substring, position } -> do
142+
case NEA.head <$> Regex.match r substring of
141143
Just (Just matched) ->
142-
Right { result: matched, suffix: { str, pos: pos + length matched } }
144+
Right { result: matched, suffix: { substring: SCP.drop (SCP.length matched) substring, position: position + SCP.length matched } }
143145
_ ->
144-
Left { pos, error: "no match" }
146+
Left { pos: position, error: "no match" }

src/Text/Parsing/StringParser/CodeUnits.purs

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ import Data.Foldable (class Foldable, foldMap, elem, notElem)
3232
import Data.Maybe (Maybe(..))
3333
import Data.String.CodeUnits (charAt, singleton)
3434
import Data.String.CodeUnits as SCU
35-
import Data.String.Pattern (Pattern(..))
3635
import Data.String.Regex as Regex
3736
import Data.String.Regex.Flags (noFlags)
3837
import Text.Parsing.StringParser (Parser(..), try, fail)
@@ -42,15 +41,15 @@ import Text.Parsing.StringParser.Combinators (many, (<?>))
4241
eof :: Parser Unit
4342
eof = Parser \s ->
4443
case s of
45-
{ str, pos } | pos < SCU.length str -> Left { pos, error: "Expected EOF" }
44+
{ substring, position } | 0 < SCU.length substring -> Left { pos: position, error: "Expected EOF" }
4645
_ -> Right { result: unit, suffix: s }
4746

4847
-- | Match any character.
4948
anyChar :: Parser Char
50-
anyChar = Parser \{ str, pos } ->
51-
case charAt pos str of
52-
Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } }
53-
Nothing -> Left { pos, error: "Unexpected EOF" }
49+
anyChar = Parser \{ substring, position } ->
50+
case charAt 0 substring of
51+
Just chr -> Right { result: chr, suffix: { substring: SCU.drop 1 substring, position: position + 1 } }
52+
Nothing -> Left { pos: position, error: "Unexpected EOF" }
5453

5554
-- | Match any digit.
5655
anyDigit :: Parser Char
@@ -61,10 +60,13 @@ anyDigit = try do
6160

6261
-- | Match the specified string.
6362
string :: String -> Parser String
64-
string nt = Parser \s ->
65-
case s of
66-
{ str, pos } | SCU.indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + SCU.length nt } }
67-
{ pos } -> Left { pos, error: "Expected '" <> nt <> "'." }
63+
string pattern = Parser \{ substring, position } ->
64+
let
65+
length = SCU.length pattern
66+
{ before, after } = SCU.splitAt length substring
67+
in
68+
if before == pattern then Right { result: pattern, suffix: { substring: after, position: position + length } }
69+
else Left { pos: position, error: "Expected '" <> pattern <> "'." }
6870

6971
-- | Match a character satisfying the given predicate.
7072
satisfy :: (Char -> Boolean) -> Parser Char
@@ -130,10 +132,9 @@ regex pat =
130132
pattern = "^(" <> pat <> ")"
131133

132134
matchRegex :: Regex.Regex -> Parser String
133-
matchRegex r = Parser \{ str, pos } -> do
134-
let remainder = SCU.drop pos str
135-
case NEA.head <$> Regex.match r remainder of
135+
matchRegex r = Parser \{ substring, position } -> do
136+
case NEA.head <$> Regex.match r substring of
136137
Just (Just matched) ->
137-
Right { result: matched, suffix: { str, pos: pos + SCU.length matched } }
138+
Right { result: matched, suffix: { substring: SCU.drop (SCU.length matched) substring, position: position + SCU.length matched } }
138139
_ ->
139-
Left { pos, error: "no match" }
140+
Left { pos: position, error: "no match" }

src/Text/Parsing/StringParser/Combinators.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
-- | This module defines combinators for building string parsers.
22
module Text.Parsing.StringParser.Combinators
3-
( lookAhead
4-
, many
3+
( many
54
, many1
65
, withError
76
, (<?>)
@@ -22,6 +21,7 @@ module Text.Parsing.StringParser.Combinators
2221
, choice
2322
, manyTill
2423
, many1Till
24+
, lookAhead
2525
, module Control.Lazy
2626
) where
2727

test/CodePoints.purs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import Data.Unfoldable (replicate)
1515
import Effect (Effect)
1616
import Effect.Class.Console (log)
1717
import Test.Assert (assert', assert)
18-
import Text.Parsing.StringParser (Parser, runParser, try)
18+
import Text.Parsing.StringParser (ParseError, Parser(..), PosString, runParser, try)
1919
import Text.Parsing.StringParser.CodePoints (anyDigit, char, eof, string, anyChar, regex)
2020
import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between)
2121
import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser)
@@ -60,6 +60,9 @@ tryTest =
6060
try (string "aa" <> string "bb") <|>
6161
(string "aa" <> string "cc")
6262

63+
testParser :: forall a. Parser a -> String -> Either ParseError { result :: a, suffix :: PosString }
64+
testParser (Parser p) s = p { substring: s, position: 0 }
65+
6366
canParse :: forall a. Parser a -> String -> Boolean
6467
canParse p input = isRight $ runParser p input
6568

@@ -69,6 +72,12 @@ parseFail p input = isLeft $ runParser p input
6972
expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean
7073
expectResult res p input = runParser p input == Right res
7174

75+
expectPosition :: forall a. Int -> Parser a -> String -> Boolean
76+
expectPosition pos p input =
77+
case testParser p input of
78+
Right r -> r.suffix.position == pos
79+
Left _ -> false
80+
7281
testCodePoints :: Effect Unit
7382
testCodePoints = do
7483

@@ -110,6 +119,8 @@ testCodePoints = do
110119
assert $ expectResult "\x458CA" (string "\x458CA" <* char ']' <* eof) "\x458CA]"
111120
assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof) "\x458CA)"
112121
assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof) "\xEEE2"
122+
assert $ expectPosition 1 anyChar "\xEEE2"
123+
assert $ expectPosition 1 anyChar "\x458CA"
113124

114125
log "Running overflow tests (may take a while)"
115126

test/Examples.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import Effect (Effect)
1010
import Effect.Console (log, logShow)
1111
import Text.Parsing.StringParser (Parser, fail, runParser, unParser)
1212
import Text.Parsing.StringParser.CodePoints (anyChar, char, eof, regex, skipSpaces, string)
13-
import Text.Parsing.StringParser.Combinators (between, endBy1, lookAhead, many, many1, sepBy1, (<?>))
13+
import Text.Parsing.StringParser.Combinators (between, lookAhead, endBy1, many, many1, sepBy1, (<?>))
1414

1515
-- Serves only to make this file runnable
1616
main :: Effect Unit
@@ -234,7 +234,7 @@ doBoth parserName parser content = do
234234
doUnParser :: forall a. Show a => String -> Parser a -> String -> Effect Unit
235235
doUnParser parserName parser content = do
236236
log $ "(unParser) Parsing content with '" <> parserName <> "'"
237-
case unParser parser { str: content, pos: 0 } of
237+
case unParser parser { substring: content, position: 0 } of
238238
Left rec -> log $ "Position: " <> show rec.pos
239239
<>
240240
"\n\

test/Main.purs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ main = do
1313
log "Running basic spec test cases\n"
1414
runTestCases
1515

16-
log "\n\nTesting CodePoint parsing\n"
17-
testCodePoints
18-
1916
log "\n\nTesting CodeUnit parsing\n"
2017
testCodeUnits
18+
19+
log "\n\nTesting CodePoint parsing\n"
20+
testCodePoints

0 commit comments

Comments
 (0)