Skip to content

Commit e3066f8

Browse files
committed
Change String.regex so it pre-compiles
1 parent 6013a3c commit e3066f8

File tree

5 files changed

+85
-101
lines changed

5 files changed

+85
-101
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Breaking changes:
2626
prevents a compiler error (i.e. `MixedAssociativityError`)
2727
without causing issues with `<$>`.
2828
- Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock)
29+
- Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock)
2930

3031
New features:
3132

bench/Json/Parsing.purs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ import Prelude hiding (between)
44

55
import Bench.Json.Common (Json(..))
66
import Control.Lazy (defer)
7+
import Data.Either (Either(..))
78
import Data.List (List)
89
import Data.Maybe (Maybe(..))
910
import Data.Number as Number
11+
import Data.String.Regex.Flags (noFlags)
1012
import Data.Tuple (Tuple(..))
13+
import Effect.Exception (throw)
14+
import Effect.Unsafe (unsafePerformEffect)
1115
import Parsing (ParserT, fail)
1216
import Parsing.Combinators (between, choice, sepBy, try)
1317
import Parsing.String (regex, skipSpaces, string)
18+
import Partial.Unsafe (unsafeCrashWith)
1419

1520
json :: forall m. Monad m => ParserT String m Json
1621
json = defer \_ ->
@@ -38,15 +43,18 @@ jsonArray = defer \_ ->
3843
json `sepBy` (try (skipSpaces *> string ","))
3944

4045
jsonString :: forall m. Monad m => ParserT String m String
41-
jsonString = between (string "\"") (string "\"") do
42-
regex {} """\\"|[^"]*"""
46+
jsonString = case regex """\\"|[^"]*""" noFlags of
47+
Left err -> unsafeCrashWith err
48+
Right p -> between (string "\"") (string "\"") p
4349

4450
jsonNumber :: forall m. Monad m => ParserT String m Number
45-
jsonNumber = do
46-
n <- regex {} """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?"""
47-
case Number.fromString n of
48-
Just n' -> pure n'
49-
Nothing -> fail "Expected number"
51+
jsonNumber = case regex """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" noFlags of
52+
Left err -> unsafeCrashWith err
53+
Right p -> do
54+
n <- p
55+
case Number.fromString n of
56+
Just n' -> pure n'
57+
Nothing -> fail "Expected number"
5058

5159
jsonBoolean :: forall m. Monad m => ParserT String m Boolean
5260
jsonBoolean = choice

spago.dhall

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
, "numbers"
1818
, "partial"
1919
, "prelude"
20-
, "record"
2120
, "strings"
2221
, "tailrec"
2322
, "transformers"

src/Parsing/String.purs

Lines changed: 46 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ module Parsing.String
3737
, noneOfCodePoints
3838
, match
3939
, regex
40-
, RegexFlagsRow
4140
, consumeWith
4241
) where
4342

@@ -55,12 +54,10 @@ import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton
5554
import Data.String as String
5655
import Data.String.CodeUnits as SCU
5756
import Data.String.Regex as Regex
58-
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
57+
import Data.String.Regex.Flags (RegexFlags)
5958
import Data.Tuple (Tuple(..), fst)
6059
import Partial.Unsafe (unsafePartial)
61-
import Prim.Row (class Nub, class Union)
62-
import Record (merge)
63-
import Parsing (ParseError(..), ParseState(..), ParserT(..), fail)
60+
import Parsing (ParseError(..), ParseState(..), ParserT(..))
6461
import Parsing.Combinators ((<?>), (<~?>))
6562
import Parsing.Pos (Position(..))
6663

@@ -229,101 +226,71 @@ match p = do
229226
-- boundary.
230227
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
231228

232-
-- | Parser which uses the `Data.String.Regex` module to match the regular
233-
-- | expression pattern passed as the `String`
234-
-- | argument to the parser.
229+
-- | Compile a regular expression string into a regular expression parser.
230+
-- |
231+
-- | This function will use the `Data.String.Regex.regex` function to compile and return a parser which can be used
232+
-- | in a `ParserT String m` monad.
235233
-- |
236234
-- | This parser will try to match the regular expression pattern starting
237235
-- | at the current parser position. On success, it will return the matched
238236
-- | substring.
239237
-- |
240-
-- | If the `Regex` pattern string fails to compile then this parser will fail.
241-
-- | (Note: It’s not possible to use a precompiled `Regex` because this parser
242-
-- | must set flags and make adjustments to the `Regex` pattern string.)
238+
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
239+
-- |
240+
-- | This function should be called outside the context of a `ParserT String m` monad, because this function might
241+
-- | fail with a `Left` RegExp compilation error message.
242+
-- | If you call this function inside of the `ParserT String m` monad and then `fail` the parse when the compilation fails,
243+
-- | then that could be confusing because a parser failure is supposed to indicate an invalid input string.
244+
-- | If the compilation failure occurs in an `alt` then the compilation failure might not be reported at all and instead
245+
-- | the input string would be parsed incorrectly.
243246
-- |
244247
-- | This parser may be useful for quickly consuming a large section of the
245-
-- | input `String`, because in a JavaScript runtime environment the `RegExp`
248+
-- | input `String`, because in a JavaScript runtime environment the RegExp
246249
-- | runtime is a lot faster than primitive parsers.
247250
-- |
248-
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
251+
-- | #### Example
252+
-- |
253+
-- | This example shows how to compile and run the `xMany` parser which will
254+
-- | capture the regular expression pattern `x*`.
255+
-- |
256+
-- | ```purescript
257+
-- | case regex "x*" noFlags of
258+
-- | Left compileError -> unsafeCrashWith $ "xMany failed to compile: " <> compileError
259+
-- | Right xMany -> runParser "xxxZ" do
260+
-- | xMany
261+
-- | ```
249262
-- |
250263
-- | #### Flags
251264
-- |
252-
-- | The `Record flags` argument to the parser is for `Regex` flags. Here are
253-
-- | the default flags.
265+
-- | Set `RegexFlags` with the `Semigroup` instance like this.
254266
-- |
255267
-- | ```purescript
256-
-- | { dotAll: true
257-
-- | ignoreCase: false
258-
-- | unicode: true
259-
-- | }
268+
-- | regex "x*" (dotAll <> ignoreCase)
260269
-- | ```
261270
-- |
262-
-- | To use the defaults, pass
263-
-- | `{}` as the flags argument. For case-insensitive pattern matching, pass
264-
-- | `{ignoreCase: true}` as the flags argument.
265-
-- |
266-
-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
267-
-- | nonsense in the context of parsing
268-
-- | and use of the other flags may cause strange behavior in the parser.
271+
-- | The `dotAll`, `unicode`, and `ignoreCase` flags might make sense for a `regex` parser. The other flags will
272+
-- | probably cause surprising behavior and you should avoid them.
269273
-- |
270274
-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
275+
regex :: forall m. String -> RegexFlags -> Either String (ParserT String m String)
276+
regex pattern flags =
277+
Regex.regex ("^(" <> pattern <> ")") flags <#> \regexobj ->
278+
consumeWith \input -> do
279+
case NonEmptyArray.head <$> Regex.match regexobj input of
280+
Just (Just consumed) -> do
281+
let remainder = SCU.drop (SCU.length consumed) input
282+
Right { value: consumed, consumed, remainder }
283+
_ ->
284+
Left "No Regex pattern match"
285+
286+
-- | Consume a portion of the input string while yielding a value.
271287
-- |
272-
-- | #### Example
288+
-- | Takes a consumption function which takes the remaining input `String`
289+
-- | as its argument and returns three fields:
273290
-- |
274-
-- | ```
275-
-- | runParser "ababXX" (regex {} "(ab)+")
276-
-- | ```
277-
-- | ```
278-
-- | (Right "abab")
279-
-- | ```
280-
regex
281-
:: forall m flags f_
282-
. Monad m
283-
=> Union flags RegexFlagsRow f_
284-
=> Nub f_ RegexFlagsRow
285-
=> Record flags
286-
-> String
287-
-> ParserT String m String
288-
regex flags pattern =
289-
-- Prefix a ^ to ensure the pattern only matches the current position in the parse
290-
case Regex.regex ("^(" <> pattern <> ")") flags' of
291-
Left paterr ->
292-
fail $ "Regex pattern error " <> paterr
293-
Right regexobj ->
294-
consumeWith \input -> do
295-
case NonEmptyArray.head <$> Regex.match regexobj input of
296-
Just (Just consumed) -> do
297-
let remainder = SCU.drop (SCU.length consumed) input
298-
Right { value: consumed, consumed, remainder }
299-
_ ->
300-
Left "No Regex pattern match"
301-
where
302-
flags' = RegexFlags
303-
( merge flags
304-
{ dotAll: true
305-
, global: false
306-
, ignoreCase: false
307-
, multiline: false
308-
, sticky: false
309-
, unicode: true
310-
} :: RegexFlagsRec
311-
)
312-
313-
-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
314-
type RegexFlagsRow =
315-
( dotAll :: Boolean
316-
, global :: Boolean
317-
, ignoreCase :: Boolean
318-
, multiline :: Boolean
319-
, sticky :: Boolean
320-
, unicode :: Boolean
321-
)
322-
323-
-- | Consumes a portion of the input string while yielding a value.
324291
-- | * `value` is the value to return.
325-
-- | * `consumed` is the input that was consumed and is used to update the parser position.
326-
-- | * `remainder` is the new input state.
292+
-- | * `consumed` is the input `String` that was consumed. It is used to update the parser position.
293+
-- | * `remainder` is the new remaining input `String`.
327294
consumeWith
328295
:: forall m a
329296
. (String -> Either String { value :: a, consumed :: String, remainder :: String })

test/Main.purs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
-- Run tests:
2+
--
3+
-- spago -x spago-dev.dhall test
4+
--
5+
16
module Test.Main where
27

38
import Prelude hiding (between, when)
@@ -16,6 +21,7 @@ import Data.Number (infinity, isNaN)
1621
import Data.String.CodePoints as SCP
1722
import Data.String.CodeUnits (fromCharArray, singleton)
1823
import Data.String.CodeUnits as SCU
24+
import Data.String.Regex.Flags (RegexFlags, ignoreCase, noFlags)
1925
import Data.Tuple (Tuple(..))
2026
import Effect (Effect)
2127
import Effect.Console (logShow)
@@ -26,7 +32,7 @@ import Parsing.Combinators (between, chainl, chainl1Rec, chainlRec, chainr1Rec,
2632
import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
2733
import Parsing.Language (haskellDef, haskellStyle, javaStyle)
2834
import Parsing.Pos (Position(..), initialPos)
29-
import Parsing.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
35+
import Parsing.String (anyChar, anyCodePoint, char, eof, regex, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
3036
import Parsing.String.Basic (intDecimal, number, letter)
3137
import Parsing.Token (TokenParser, makeTokenParser, match, token, when)
3238
import Parsing.Token as Parser.Token
@@ -94,6 +100,15 @@ manySatisfyTest = do
94100
_ <- char '?'
95101
pure (fromCharArray r)
96102

103+
mkRegexTest :: String -> String -> String -> RegexFlags -> (Parser String String -> Parser String String) -> Effect Unit
104+
mkRegexTest input expected pattern flags pars =
105+
case regex pattern flags of
106+
Left err -> assert' ("error: " <> show err) false
107+
Right p -> parseTest input expected $ pars p
108+
109+
110+
-- TODO everything is stack-safe now.
111+
--
97112
-- This test doesn't test the actual stack safety of these combinators, mainly
98113
-- because I don't know how to come up with an example guaranteed to be large
99114
-- enough to overflow the stack. But thankfully, their stack safety is more or
@@ -751,19 +766,13 @@ main = do
751766

752767
parseTest "-300" (-300) intDecimal
753768

754-
parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
755-
parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
756-
parseTest "regexregex" "regexregex" (regex {} "(regex)*")
757-
parseTest "regexregex" "regex" (regex {} "(^regex)*")
758-
parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")
759-
parseTest "regexcapregexcap" "regexcap" (regex {} "(?<CaptureGroupName>regexcap)")
760-
parseTest "regexcapregexcap" "regexcap" (regex {} "(((?<CaptureGroupName>(r)e(g)excap)))")
761-
762-
-- Maybe it is nonsense to allow multiline regex.
763-
-- Because an end-of-line regex pattern `$` will match but then the
764-
-- newline character will not be consumed.
765-
-- Also why does this test fail? I think it should succeed.
766-
-- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")
769+
mkRegexTest "regex-" "regex" "regex" noFlags (\p -> p <* char '-' <* eof)
770+
mkRegexTest "-regex" "regex" "regex" noFlags (\p -> char '-' *> p <* eof)
771+
mkRegexTest "regexregex" "regexregex" "(regex)*" noFlags identity
772+
mkRegexTest "regexregex" "regex" "(^regex)*" noFlags identity
773+
mkRegexTest "ReGeX" "ReGeX" "regex" ignoreCase identity
774+
mkRegexTest "regexcapregexcap" "regexcap" "(?<CaptureGroupName>regexcap)" noFlags identity
775+
mkRegexTest "regexcapregexcap" "regexcap" "(((?<CaptureGroupName>(r)e(g)excap)))" noFlags identity
767776

768777
stackSafeLoopsTest
769778

0 commit comments

Comments
 (0)