Skip to content

Commit 43463e3

Browse files
authored
Merge pull request #170 from purescript-contrib/mkregex
New `regex` parser
2 parents 4ccd3cd + e828642 commit 43463e3

File tree

5 files changed

+103
-108
lines changed

5 files changed

+103
-108
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Breaking changes:
2626
prevents a compiler error (i.e. `MixedAssociativityError`)
2727
without causing issues with `<$>`.
2828
- Rename module prefix from `Text.Parsing.Parser` to `Parsing` (#169 by @jamesdbrock)
29+
- Delete the `regex` parser and replace it with `mkRegex`. (#170 by @jamesdbrock)
2930

3031
New features:
3132

bench/Json/Parsing.purs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@ import Prelude hiding (between)
44

55
import Bench.Json.Common (Json(..))
66
import Control.Lazy (defer)
7+
import Data.Either (Either(..))
78
import Data.List (List)
89
import Data.Maybe (Maybe(..))
910
import Data.Number as Number
11+
import Data.String.Regex.Flags (noFlags)
1012
import Data.Tuple (Tuple(..))
13+
import Effect.Exception (throw)
14+
import Effect.Unsafe (unsafePerformEffect)
1115
import Parsing (ParserT, fail)
1216
import Parsing.Combinators (between, choice, sepBy, try)
1317
import Parsing.String (regex, skipSpaces, string)
18+
import Partial.Unsafe (unsafeCrashWith)
1419

1520
json :: forall m. Monad m => ParserT String m Json
1621
json = defer \_ ->
@@ -38,15 +43,18 @@ jsonArray = defer \_ ->
3843
json `sepBy` (try (skipSpaces *> string ","))
3944

4045
jsonString :: forall m. Monad m => ParserT String m String
41-
jsonString = between (string "\"") (string "\"") do
42-
regex {} """\\"|[^"]*"""
46+
jsonString = case regex """\\"|[^"]*""" noFlags of
47+
Left err -> unsafeCrashWith err
48+
Right p -> between (string "\"") (string "\"") p
4349

4450
jsonNumber :: forall m. Monad m => ParserT String m Number
45-
jsonNumber = do
46-
n <- regex {} """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?"""
47-
case Number.fromString n of
48-
Just n' -> pure n'
49-
Nothing -> fail "Expected number"
51+
jsonNumber = case regex """(\+|-)?(\d+(\.\d*)?|\d*\.\d+)([eE](\+|-)?\d+)?""" noFlags of
52+
Left err -> unsafeCrashWith err
53+
Right p -> do
54+
n <- p
55+
case Number.fromString n of
56+
Just n' -> pure n'
57+
Nothing -> fail "Expected number"
5058

5159
jsonBoolean :: forall m. Monad m => ParserT String m Boolean
5260
jsonBoolean = choice

spago.dhall

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
, "numbers"
1818
, "partial"
1919
, "prelude"
20-
, "record"
2120
, "strings"
2221
, "tailrec"
2322
, "transformers"

src/Parsing/String.purs

Lines changed: 46 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ module Parsing.String
3737
, noneOfCodePoints
3838
, match
3939
, regex
40-
, RegexFlagsRow
4140
, consumeWith
4241
) where
4342

@@ -55,12 +54,10 @@ import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton
5554
import Data.String as String
5655
import Data.String.CodeUnits as SCU
5756
import Data.String.Regex as Regex
58-
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
57+
import Data.String.Regex.Flags (RegexFlags)
5958
import Data.Tuple (Tuple(..), fst)
6059
import Partial.Unsafe (unsafePartial)
61-
import Prim.Row (class Nub, class Union)
62-
import Record (merge)
63-
import Parsing (ParseError(..), ParseState(..), ParserT(..), fail)
60+
import Parsing (ParseError(..), ParseState(..), ParserT(..))
6461
import Parsing.Combinators ((<?>), (<~?>))
6562
import Parsing.Pos (Position(..))
6663

@@ -229,101 +226,71 @@ match p = do
229226
-- boundary.
230227
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
231228

232-
-- | Parser which uses the `Data.String.Regex` module to match the regular
233-
-- | expression pattern passed as the `String`
234-
-- | argument to the parser.
229+
-- | Compile a regular expression string into a regular expression parser.
230+
-- |
231+
-- | This function will use the `Data.String.Regex.regex` function to compile and return a parser which can be used
232+
-- | in a `ParserT String m` monad.
235233
-- |
236234
-- | This parser will try to match the regular expression pattern starting
237235
-- | at the current parser position. On success, it will return the matched
238236
-- | substring.
239237
-- |
240-
-- | If the `Regex` pattern string fails to compile then this parser will fail.
241-
-- | (Note: It’s not possible to use a precompiled `Regex` because this parser
242-
-- | must set flags and make adjustments to the `Regex` pattern string.)
238+
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
239+
-- |
240+
-- | This function should be called outside the context of a `ParserT String m` monad, because this function might
241+
-- | fail with a `Left` RegExp compilation error message.
242+
-- | If you call this function inside of the `ParserT String m` monad and then `fail` the parse when the compilation fails,
243+
-- | then that could be confusing because a parser failure is supposed to indicate an invalid input string.
244+
-- | If the compilation failure occurs in an `alt` then the compilation failure might not be reported at all and instead
245+
-- | the input string would be parsed incorrectly.
243246
-- |
244247
-- | This parser may be useful for quickly consuming a large section of the
245-
-- | input `String`, because in a JavaScript runtime environment the `RegExp`
248+
-- | input `String`, because in a JavaScript runtime environment the RegExp
246249
-- | runtime is a lot faster than primitive parsers.
247250
-- |
248-
-- | [*MDN Regular Expressions Cheatsheet*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions/Cheatsheet)
251+
-- | #### Example
252+
-- |
253+
-- | This example shows how to compile and run the `xMany` parser which will
254+
-- | capture the regular expression pattern `x*`.
255+
-- |
256+
-- | ```purescript
257+
-- | case regex "x*" noFlags of
258+
-- | Left compileError -> unsafeCrashWith $ "xMany failed to compile: " <> compileError
259+
-- | Right xMany -> runParser "xxxZ" do
260+
-- | xMany
261+
-- | ```
249262
-- |
250263
-- | #### Flags
251264
-- |
252-
-- | The `Record flags` argument to the parser is for `Regex` flags. Here are
253-
-- | the default flags.
265+
-- | Set `RegexFlags` with the `Semigroup` instance like this.
254266
-- |
255267
-- | ```purescript
256-
-- | { dotAll: true
257-
-- | ignoreCase: false
258-
-- | unicode: true
259-
-- | }
268+
-- | regex "x*" (dotAll <> ignoreCase)
260269
-- | ```
261270
-- |
262-
-- | To use the defaults, pass
263-
-- | `{}` as the flags argument. For case-insensitive pattern matching, pass
264-
-- | `{ignoreCase: true}` as the flags argument.
265-
-- |
266-
-- | The other `Data.String.Regex.Flags.RegexFlagsRec` fields are mostly
267-
-- | nonsense in the context of parsing
268-
-- | and use of the other flags may cause strange behavior in the parser.
271+
-- | The `dotAll`, `unicode`, and `ignoreCase` flags might make sense for a `regex` parser. The other flags will
272+
-- | probably cause surprising behavior and you should avoid them.
269273
-- |
270274
-- | [*MDN Advanced searching with flags*](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#advanced_searching_with_flags)
275+
regex :: forall m. String -> RegexFlags -> Either String (ParserT String m String)
276+
regex pattern flags =
277+
Regex.regex ("^(" <> pattern <> ")") flags <#> \regexobj ->
278+
consumeWith \input -> do
279+
case NonEmptyArray.head <$> Regex.match regexobj input of
280+
Just (Just consumed) -> do
281+
let remainder = SCU.drop (SCU.length consumed) input
282+
Right { value: consumed, consumed, remainder }
283+
_ ->
284+
Left "No Regex pattern match"
285+
286+
-- | Consume a portion of the input string while yielding a value.
271287
-- |
272-
-- | #### Example
288+
-- | Takes a consumption function which takes the remaining input `String`
289+
-- | as its argument and returns three fields:
273290
-- |
274-
-- | ```
275-
-- | runParser "ababXX" (regex {} "(ab)+")
276-
-- | ```
277-
-- | ```
278-
-- | (Right "abab")
279-
-- | ```
280-
regex
281-
:: forall m flags f_
282-
. Monad m
283-
=> Union flags RegexFlagsRow f_
284-
=> Nub f_ RegexFlagsRow
285-
=> Record flags
286-
-> String
287-
-> ParserT String m String
288-
regex flags pattern =
289-
-- Prefix a ^ to ensure the pattern only matches the current position in the parse
290-
case Regex.regex ("^(" <> pattern <> ")") flags' of
291-
Left paterr ->
292-
fail $ "Regex pattern error " <> paterr
293-
Right regexobj ->
294-
consumeWith \input -> do
295-
case NonEmptyArray.head <$> Regex.match regexobj input of
296-
Just (Just consumed) -> do
297-
let remainder = SCU.drop (SCU.length consumed) input
298-
Right { value: consumed, consumed, remainder }
299-
_ ->
300-
Left "No Regex pattern match"
301-
where
302-
flags' = RegexFlags
303-
( merge flags
304-
{ dotAll: true
305-
, global: false
306-
, ignoreCase: false
307-
, multiline: false
308-
, sticky: false
309-
, unicode: true
310-
} :: RegexFlagsRec
311-
)
312-
313-
-- | The fields from `Data.String.Regex.Flags.RegexFlagsRec`.
314-
type RegexFlagsRow =
315-
( dotAll :: Boolean
316-
, global :: Boolean
317-
, ignoreCase :: Boolean
318-
, multiline :: Boolean
319-
, sticky :: Boolean
320-
, unicode :: Boolean
321-
)
322-
323-
-- | Consumes a portion of the input string while yielding a value.
324291
-- | * `value` is the value to return.
325-
-- | * `consumed` is the input that was consumed and is used to update the parser position.
326-
-- | * `remainder` is the new input state.
292+
-- | * `consumed` is the input `String` that was consumed. It is used to update the parser position.
293+
-- | * `remainder` is the new remaining input `String`.
327294
consumeWith
328295
:: forall m a
329296
. (String -> Either String { value :: a, consumed :: String, remainder :: String })

test/Main.purs

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
-- Run tests:
2+
--
3+
-- spago -x spago-dev.dhall test
4+
--
5+
16
module Test.Main where
27

38
import Prelude hiding (between, when)
@@ -16,17 +21,18 @@ import Data.Number (infinity, isNaN)
1621
import Data.String.CodePoints as SCP
1722
import Data.String.CodeUnits (fromCharArray, singleton)
1823
import Data.String.CodeUnits as SCU
24+
import Data.String.Regex.Flags (RegexFlags, ignoreCase, noFlags)
1925
import Data.Tuple (Tuple(..))
2026
import Effect (Effect)
21-
import Effect.Console (logShow)
27+
import Effect.Console (log, logShow)
2228
import Partial.Unsafe (unsafePartial)
2329
import Test.Assert (assert')
2430
import Parsing (ParseError(..), Parser, ParserT, fail, parseErrorMessage, parseErrorPosition, position, region, runParser)
2531
import Parsing.Combinators (between, chainl, chainl1Rec, chainlRec, chainr1Rec, chainrRec, choice, endBy1, endBy1Rec, endByRec, many1Rec, many1TillRec, many1TillRec_, many1Till_, manyTillRec, manyTillRec_, manyTill_, notFollowedBy, optionMaybe, sepBy1, sepBy1Rec, sepByRec, sepEndBy1Rec, sepEndByRec, skipMany1Rec, skipManyRec, try, (<?>), (<??>), (<~?>))
2632
import Parsing.Expr (Assoc(..), Operator(..), buildExprParser)
2733
import Parsing.Language (haskellDef, haskellStyle, javaStyle)
2834
import Parsing.Pos (Position(..), initialPos)
29-
import Parsing.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, regex, rest, satisfy, string, takeN, whiteSpace)
35+
import Parsing.String (anyChar, anyCodePoint, char, eof, regex, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
3036
import Parsing.String.Basic (intDecimal, number, letter)
3137
import Parsing.Token (TokenParser, makeTokenParser, match, token, when)
3238
import Parsing.Token as Parser.Token
@@ -94,6 +100,14 @@ manySatisfyTest = do
94100
_ <- char '?'
95101
pure (fromCharArray r)
96102

103+
mkRegexTest :: String -> String -> String -> RegexFlags -> (Parser String String -> Parser String String) -> Effect Unit
104+
mkRegexTest input expected pattern flags pars =
105+
case regex pattern flags of
106+
Left err -> assert' ("error: " <> show err) false
107+
Right p -> parseTest input expected $ pars p
108+
109+
-- TODO everything is stack-safe now.
110+
--
97111
-- This test doesn't test the actual stack safety of these combinators, mainly
98112
-- because I don't know how to come up with an example guaranteed to be large
99113
-- enough to overflow the stack. But thankfully, their stack safety is more or
@@ -559,6 +573,7 @@ javaStyleTest = do
559573
main :: Effect Unit
560574
main = do
561575

576+
log "\nTESTS String\n"
562577
parseErrorTestPosition
563578
(many $ char 'f' *> char '?')
564579
"foo"
@@ -667,6 +682,8 @@ main = do
667682
parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { column: 1, line: 4 })
668683
parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { column: 10, line: 1 })
669684

685+
log "\nTESTS number\n"
686+
670687
parseTest "Infinity" infinity number
671688
parseTest "+Infinity" infinity number
672689
parseTest "-Infinity" (negate infinity) number
@@ -681,6 +698,7 @@ main = do
681698
parseTest "-6.0" (-6.0) number
682699
parseTest "+6.0" (6.0) number
683700

701+
log "\nTESTS Operator\n"
684702
-- test from issue #161
685703
-- all the below operators should play well together
686704
parseErrorTestMessage
@@ -749,24 +767,23 @@ main = do
749767
-- TODO This shows the current limitations of the number parser. Ideally this parse should fail.
750768
parseTest "1..3" 1.0 $ number <* eof
751769

770+
log "\nTESTS intDecimal\n"
752771
parseTest "-300" (-300) intDecimal
753772

754-
parseTest "regex-" "regex" (regex {} "regex" <* char '-' <* eof)
755-
parseTest "-regex" "regex" (char '-' *> regex {} "regex" <* eof)
756-
parseTest "regexregex" "regexregex" (regex {} "(regex)*")
757-
parseTest "regexregex" "regex" (regex {} "(^regex)*")
758-
parseTest "ReGeX" "ReGeX" (regex { ignoreCase: true } "regex")
759-
parseTest "regexcapregexcap" "regexcap" (regex {} "(?<CaptureGroupName>regexcap)")
760-
parseTest "regexcapregexcap" "regexcap" (regex {} "(((?<CaptureGroupName>(r)e(g)excap)))")
761-
762-
-- Maybe it is nonsense to allow multiline regex.
763-
-- Because an end-of-line regex pattern `$` will match but then the
764-
-- newline character will not be consumed.
765-
-- Also why does this test fail? I think it should succeed.
766-
-- parseTest "regex\nregex\n" "regex\nregex\n" (regex {dotAll: false, multiline: true} "(^regex$)+")
773+
log "\nTESTS Regex\n"
774+
mkRegexTest "regex-" "regex" "regex" noFlags (\regex -> regex <* char '-' <* eof)
775+
mkRegexTest "-regex" "regex" "regex" noFlags (\regex -> char '-' *> regex <* eof)
776+
mkRegexTest "regexregex" "regexregex" "(regex)*" noFlags identity
777+
mkRegexTest "regexregex" "regex" "(^regex)*" noFlags identity
778+
mkRegexTest "ReGeX" "ReGeX" "regex" ignoreCase identity
779+
mkRegexTest "regexcapregexcap" "regexcap" "(?<CaptureGroupName>regexcap)" noFlags identity
780+
mkRegexTest "regexcapregexcap" "regexcap" "(((?<CaptureGroupName>(r)e(g)excap)))" noFlags identity
767781

782+
log "\nTESTS Stack Safe Loops\n"
768783
stackSafeLoopsTest
769784

785+
log "\nTESTS Token Parser\n"
786+
770787
tokenParserIdentifierTest
771788
tokenParserReservedTest
772789
tokenParserOperatorTest
@@ -799,18 +816,21 @@ main = do
799816
tokenParserCommaSepTest
800817
tokenParserCommaSep1Test
801818

819+
log "\nTESTS Haskell Style\n"
802820
haskellStyleTest
821+
log "\nTESTS Java Style\n"
803822
javaStyleTest
804823

824+
log "\nTESTS region\n"
825+
let
826+
prependContext m' (ParseError m pos) = ParseError (m' <> m) pos
827+
p = region (prependContext "context1 ") $ do
828+
_ <- string "a"
829+
region (prependContext "context2 ") $ do
830+
string "b"
805831
case runParser "aa" p of
806832
Right _ -> assert' "error: ParseError expected!" false
807833
Left (ParseError message _) -> do
808834
let messageExpected = "context1 context2 Expected \"b\""
809835
assert' ("expected message: " <> messageExpected <> ", message: " <> message) (message == messageExpected)
810836
logShow messageExpected
811-
where
812-
prependContext m' (ParseError m pos) = ParseError (m' <> m) pos
813-
p = region (prependContext "context1 ") $ do
814-
_ <- string "a"
815-
region (prependContext "context2 ") $ do
816-
string "b"

0 commit comments

Comments
 (0)