Change String.regex so it pre-compiles

jamesdbrock · jamesdbrock · commit e3066f8f8723 · 2022-04-02T17:52:08.000+09:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,7 @@ Breaking changes:
   prevents a compiler error (i.e. `MixedAssociativityError`)
   without causing issues with `<$>`.
 - Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock)
+- Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock)
 
 New features:
 
diff --git a/bench/Json/Parsing.purs b/bench/Json/Parsing.purs
@@ -4,13 +4,18 @@ import Prelude hiding (between)
 
 import Bench.Json.Common (Json(..))
 import Control.Lazy (defer)
+import Data.Either (Either(..))
 import Data.List (List)
 import Data.Maybe (Maybe(..))
 import Data.Number as Number
+import Data.String.Regex.Flags (noFlags)
 import Data.Tuple (Tuple(..))
+import Effect.Exception (throw)
+import Effect.Unsafe (unsafePerformEffect)
 import Parsing (ParserT, fail)
 import Parsing.Combinators (between, choice, sepBy, try)
 import Parsing.String (regex, skipSpaces, string)
+import Partial.Unsafe (unsafeCrashWith)
 
 json :: forall m. Monad m => ParserT String m Json
 json = defer \_ ->
@@ -38,15 +43,18 @@ jsonArray = defer \_ ->
     json `sepBy` (try (skipSpaces *> string ","))
 
 jsonString :: forall m. Monad m => ParserT String m String
-jsonString = between (string "\"") (string "\"") do
-  regex {} """\\"|[^"]*"""
+jsonString = case regex """\\"|[^"]*""" noFlags of
+  Left err -> unsafeCrashWith err
+  Right p -> between (string "\"") (string "\"") p
 
 jsonNumber :: forall m. Monad m => ParserT String m Number
-jsonNumber = do
-  n <- regex {} """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?"""
-  case Number.fromString n of
-    Just n' -> pure n'
-    Nothing -> fail "Expected number"
+jsonNumber = case regex """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" noFlags of
+  Left err -> unsafeCrashWith err
+  Right p -> do
+    n <- p
+    case Number.fromString n of
+      Just n' -> pure n'
+      Nothing -> fail "Expected number"
 
 jsonBoolean :: forall m. Monad m => ParserT String m Boolean
 jsonBoolean = choice
diff --git a/spago.dhall b/spago.dhall
@@ -17,7 +17,6 @@
   , "numbers"
   , "partial"
   , "prelude"
-  , "record"
   , "strings"
   , "tailrec"
   , "transformers"
diff --git a/src/Parsing/String.purs b/src/Parsing/String.purs
@@ -37,7 +37,6 @@ module Parsing.String
   , noneOfCodePoints
   , match
   , regex
-  , RegexFlagsRow
   , consumeWith
   ) where
 
@@ -55,12 +54,10 @@ import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton
 import Data.String as String
 import Data.String.CodeUnits as SCU
 import Data.String.Regex as Regex
-import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
+import Data.String.Regex.Flags (RegexFlags)
 import Data.Tuple (Tuple(..), fst)
 import Partial.Unsafe (unsafePartial)
-import Prim.Row (class Nub, class Union)
-import Record (merge)
-import Parsing (ParseError(..), ParseState(..), ParserT(..), fail)
+import Parsing (ParseError(..), ParseState(..), ParserT(..))
 import Parsing.Combinators ((<?>), (<~?>))
 import Parsing.Pos (Position(..))
 
@@ -229,101 +226,71 @@ match p = do
   -- boundary.
   pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
 
--- | Parser which uses the `Data.String.Regex` module to match the regular
--- | expression pattern passed as the `String`
--- | argument to the parser.
+-- | Compile a regular expression string into a regular expression parser.
+-- |
+-- | This function will use the `Data.String.Regex.regex` function to compile and return a parser which can be used
+-- | in a `ParserT String m` monad.
 -- |
 -- | This parser will try to match the regular expression pattern starting
 -- | at the current parser position. On success, it will return the matched
 -- | substring.
 -- |
--- | If the `Regex` pattern string fails to compile then this parser will fail.
--- | (Note: It’s not possible to use a precompiled `Regex` because this parser
--- | must set flags and make adjustments to the `Regex` pattern string.)
+-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
+-- |
+-- | This function should be called outside the context of a `ParserT String m` monad, because this function might
+-- | fail with a `Left` RegExp compilation error message.
+-- | If you call this function inside of the `ParserT String m` monad and then `fail` the parse when the compilation fails,
+-- | then that could be confusing because a parser failure is supposed to indicate an invalid input string.
+-- | If the compilation failure occurs in an `alt` then the compilation failure might not be reported at all and instead
+-- | the input string would be parsed incorrectly.
 -- |
 -- | This parser may be useful for quickly consuming a large section of the
--- | input `String`, because in a JavaScript runtime environment the `RegExp`
+-- | input `String`, because in a JavaScript runtime environment the RegExp
 -- | runtime is a lot faster than primitive parsers.
 -- |
--- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
+-- | #### Example
+-- |
+-- | This example shows how to compile and run the `xMany` parser which will
+-- | capture the regular expression pattern `x*`.
+-- |
+-- | ```purescript
+-- | case regex "x*" noFlags of
+-- |   Left compileError -> unsafeCrashWith $ "xMany failed to compile: " <> compileError
+-- |   Right xMany -> runParser "xxxZ" do
+-- |     xMany
+-- | ```
 -- |
 -- | #### Flags
 -- |
--- | The `Record flags` argument to the parser is for `Regex` flags. Here are
--- | the default flags.
+-- | Set `RegexFlags` with the `Semigroup` instance like this.
 -- |
 -- | ```purescript
--- | { dotAll: true
--- |   ignoreCase: false
--- |   unicode: true
--- | }
+-- | regex "x*" (dotAll <> ignoreCase)
 -- | ```
 -- |
--- | To use the defaults, pass
--- | `{}` as the flags argument. For case-insensitive pattern matching, pass
--- | `{ignoreCase: true}` as the flags argument.
--- |
--- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
--- | nonsense in the context of parsing
--- | and use of the other flags may cause strange behavior in the parser.
+-- | The `dotAll`, `unicode`, and `ignoreCase` flags might make sense for a `regex` parser. The other flags will
+-- | probably cause surprising behavior and you should avoid them.
 -- |
 -- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
+regex :: forall m. String -> RegexFlags -> Either String (ParserT String m String)
+regex pattern flags =
+  Regex.regex ("^(" <> pattern <> ")") flags <#> \regexobj ->
+    consumeWith \input -> do
+      case NonEmptyArray.head <$> Regex.match regexobj input of
+        Just (Just consumed) -> do
+          let remainder = SCU.drop (SCU.length consumed) input
+          Right { value: consumed, consumed, remainder }
+        _ ->
+          Left "No Regex pattern match"
+
+-- | Consume a portion of the input string while yielding a value.
 -- |
--- | #### Example
+-- | Takes a consumption function which takes the remaining input `String`
+-- | as its argument and returns three fields:
 -- |
--- | ```
--- | runParser "ababXX" (regex {} "(ab)+")
--- | ```
--- | ```
--- | (Right "abab")
--- | ```
-regex
-  :: forall m flags f_
-   . Monad m
-  => Union flags RegexFlagsRow f_
-  => Nub f_ RegexFlagsRow
-  => Record flags
-  -> String
-  -> ParserT String m String
-regex flags pattern =
-  -- Prefix a ^ to ensure the pattern only matches the current position in the parse
-  case Regex.regex ("^(" <> pattern <> ")") flags' of
-    Left paterr ->
-      fail $ "Regex pattern error " <> paterr
-    Right regexobj ->
-      consumeWith \input -> do
-        case NonEmptyArray.head <$> Regex.match regexobj input of
-          Just (Just consumed) -> do
-            let remainder = SCU.drop (SCU.length consumed) input
-            Right { value: consumed, consumed, remainder }
-          _ ->
-            Left "No Regex pattern match"
-  where
-  flags' = RegexFlags
-    ( merge flags
-        { dotAll: true
-        , global: false
-        , ignoreCase: false
-        , multiline: false
-        , sticky: false
-        , unicode: true
-        } :: RegexFlagsRec
-    )
-
--- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
-type RegexFlagsRow =
-  ( dotAll :: Boolean
-  , global :: Boolean
-  , ignoreCase :: Boolean
-  , multiline :: Boolean
-  , sticky :: Boolean
-  , unicode :: Boolean
-  )
-
--- | Consumes a portion of the input string while yielding a value.
 -- | * `value` is the value to return.
--- | * `consumed` is the input that was consumed and is used to update the parser position.
--- | * `remainder` is the new input state.
+-- | * `consumed` is the input `String` that was consumed. It is used to update the parser position.
+-- | * `remainder` is the new remaining input `String`.
 consumeWith
   :: forall m a
    . (String -> Either String { value :: a, consumed :: String, remainder :: String })
diff --git a/test/Main.purs b/test/Main.purs
@@ -1,3 +1,8 @@
+-- Run tests:
+--
+--     spago -x spago-dev.dhall test
+--
+
 module Test.Main where
 
 import Prelude hiding (between, when)
@@ -16,6 +21,7 @@ import Data.Number (infinity, isNaN)
 import Data.String.CodePoints as SCP
 import Data.String.CodeUnits (fromCharArray, singleton)
 import Data.String.CodeUnits as SCU
+import Data.String.Regex.Flags (RegexFlags, ignoreCase, noFlags)
 import Data.Tuple (Tuple(..))
 import Effect (Effect)
 import Effect.Console (logShow)
@@ -26,7 +32,7 @@ import Parsing.Combinators (between, chainl, chainl1Rec, chainlRec, chainr1Rec,
 import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
 import Parsing.Language (haskellDef, haskellStyle, javaStyle)
 import Parsing.Pos (Position(..), initialPos)
-import Parsing.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
+import Parsing.String (anyChar, anyCodePoint, char, eof, regex, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
 import Parsing.String.Basic (intDecimal, number, letter)
 import Parsing.Token (TokenParser, makeTokenParser, match, token, when)
 import Parsing.Token as Parser.Token
@@ -94,6 +100,15 @@ manySatisfyTest = do
   _ <- char '?'
   pure (fromCharArray r)
 
+mkRegexTest :: String -> String -> String -> RegexFlags -> (Parser String String -> Parser String String) -> Effect Unit
+mkRegexTest input expected pattern flags pars =
+  case regex pattern flags of
+    Left err -> assert' ("error: " <> show err) false
+    Right p -> parseTest input expected $ pars p
+
+
+-- TODO everything is stack-safe now.
+--
 -- This test doesn't test the actual stack safety of these combinators, mainly
 -- because I don't know how to come up with an example guaranteed to be large
 -- enough to overflow the stack. But thankfully, their stack safety is more or
@@ -751,19 +766,13 @@ main = do
 
   parseTest "-300" (-300) intDecimal
 
-  parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
-  parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
-  parseTest "regexregex" "regexregex" (regex {} "(regex)*")
-  parseTest "regexregex" "regex" (regex {} "(^regex)*")
-  parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")
-  parseTest "regexcapregexcap" "regexcap" (regex {} "(?<CaptureGroupName>regexcap)")
-  parseTest "regexcapregexcap" "regexcap" (regex {} "(((?<CaptureGroupName>(r)e(g)excap)))")
-
-  -- Maybe it is nonsense to allow multiline regex.
-  -- Because an end-of-line regex pattern `$` will match but then the
-  -- newline character will not be consumed.
-  -- Also why does this test fail? I think it should succeed.
-  -- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")
+  mkRegexTest "regex-" "regex" "regex" noFlags (\p -> p <* char '-' <* eof)
+  mkRegexTest "-regex" "regex" "regex" noFlags (\p -> char '-' *> p <* eof)
+  mkRegexTest "regexregex" "regexregex" "(regex)*" noFlags identity
+  mkRegexTest "regexregex" "regex" "(^regex)*" noFlags identity
+  mkRegexTest "ReGeX" "ReGeX" "regex" ignoreCase identity
+  mkRegexTest "regexcapregexcap" "regexcap" "(?<CaptureGroupName>regexcap)" noFlags identity
+  mkRegexTest "regexcapregexcap" "regexcap" "(((?<CaptureGroupName>(r)e(g)excap)))" noFlags identity
 
   stackSafeLoopsTest