-
Notifications
You must be signed in to change notification settings - Fork 20
Support CodeUnit and CodePoint parsing #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
-- | Primitive parsers for strings, parsing based on code units. | ||
-- | | ||
-- | These functions will be much faster than the `CodePoints` alternatives, but | ||
-- | will behave incorrectly when dealing with Unicode characters that consist | ||
-- | of multiple code units. | ||
module Text.Parsing.StringParser.CodeUnits | ||
( eof | ||
, anyChar | ||
, anyDigit | ||
, string | ||
, satisfy | ||
, char | ||
, whiteSpace | ||
, skipSpaces | ||
, oneOf | ||
, noneOf | ||
, lowerCaseChar | ||
, upperCaseChar | ||
, anyLetter | ||
, alphaNum | ||
, regex | ||
) where | ||
|
||
import Prelude | ||
|
||
import Control.Alt ((<|>)) | ||
import Data.Array ((..)) | ||
import Data.Array.NonEmpty as NEA | ||
import Data.Char (toCharCode) | ||
import Data.Either (Either(..)) | ||
import Data.Foldable (class Foldable, foldMap, elem, notElem) | ||
import Data.Maybe (Maybe(..)) | ||
import Data.String.CodeUnits (charAt, singleton) | ||
import Data.String.CodeUnits as SCU | ||
import Data.String.Pattern (Pattern(..)) | ||
import Data.String.Regex as Regex | ||
import Data.String.Regex.Flags (noFlags) | ||
import Text.Parsing.StringParser (Parser(..), ParseError(..), try, fail) | ||
import Text.Parsing.StringParser.Combinators (many, (<?>)) | ||
|
||
-- | Match the end of the file. | ||
eof :: Parser Unit | ||
eof = Parser \s -> | ||
case s of | ||
{ str, pos } | pos < SCU.length str -> Left { pos, error: ParseError "Expected EOF" } | ||
_ -> Right { result: unit, suffix: s } | ||
|
||
-- | Match any character. | ||
anyChar :: Parser Char | ||
anyChar = Parser \{ str, pos } -> | ||
case charAt pos str of | ||
Just chr -> Right { result: chr, suffix: { str, pos: pos + 1 } } | ||
Nothing -> Left { pos, error: ParseError "Unexpected EOF" } | ||
|
||
-- | Match any digit. | ||
anyDigit :: Parser Char | ||
anyDigit = try do | ||
c <- anyChar | ||
if c >= '0' && c <= '9' | ||
then pure c | ||
else fail $ "Character " <> show c <> " is not a digit" | ||
|
||
-- | Match the specified string. | ||
string :: String -> Parser String | ||
string nt = Parser \s -> | ||
case s of | ||
{ str, pos } | SCU.indexOf' (Pattern nt) pos str == Just pos -> Right { result: nt, suffix: { str, pos: pos + SCU.length nt } } | ||
{ pos } -> Left { pos, error: ParseError ("Expected '" <> nt <> "'.") } | ||
|
||
-- | Match a character satisfying the given predicate. | ||
satisfy :: (Char -> Boolean) -> Parser Char | ||
satisfy f = try do | ||
c <- anyChar | ||
if f c | ||
then pure c | ||
else fail $ "Character " <> show c <> " did not satisfy predicate" | ||
|
||
-- | Match the specified character. | ||
char :: Char -> Parser Char | ||
char c = satisfy (_ == c) <?> "Could not match character " <> show c | ||
|
||
-- | Match many whitespace characters. | ||
whiteSpace :: Parser String | ||
whiteSpace = do | ||
cs <- many (satisfy \ c -> c == '\n' || c == '\r' || c == ' ' || c == '\t') | ||
pure (foldMap singleton cs) | ||
|
||
-- | Skip many whitespace characters. | ||
skipSpaces :: Parser Unit | ||
skipSpaces = void whiteSpace | ||
|
||
-- | Match one of the characters in the foldable structure. | ||
oneOf :: forall f. Foldable f => f Char -> Parser Char | ||
oneOf = satisfy <<< flip elem | ||
|
||
-- | Match any character not in the foldable structure. | ||
noneOf :: forall f. Foldable f => f Char -> Parser Char | ||
noneOf = satisfy <<< flip notElem | ||
|
||
-- | Match any lower case character. | ||
lowerCaseChar :: Parser Char | ||
lowerCaseChar = try do | ||
c <- anyChar | ||
if toCharCode c `elem` (97 .. 122) | ||
then pure c | ||
else fail $ "Expected a lower case character but found " <> show c | ||
|
||
-- | Match any upper case character. | ||
upperCaseChar :: Parser Char | ||
upperCaseChar = try do | ||
c <- anyChar | ||
if toCharCode c `elem` (65 .. 90) | ||
then pure c | ||
else fail $ "Expected an upper case character but found " <> show c | ||
|
||
-- | Match any letter. | ||
anyLetter :: Parser Char | ||
anyLetter = lowerCaseChar <|> upperCaseChar <?> "Expected a letter" | ||
|
||
-- | Match a letter or a number. | ||
alphaNum :: Parser Char | ||
alphaNum = anyLetter <|> anyDigit <?> "Expected a letter or a number" | ||
|
||
-- | match the regular expression | ||
regex :: String -> Parser String | ||
regex pat = | ||
case Regex.regex pattern noFlags of | ||
Left _ -> | ||
fail $ "Text.Parsing.StringParser.String.regex': illegal regex " <> pat | ||
Right r -> | ||
matchRegex r | ||
where | ||
-- ensure the pattern only matches the current position in the parse | ||
pattern = | ||
case SCU.stripPrefix (Pattern "^") pat of | ||
Nothing -> | ||
"^" <> pat | ||
_ -> | ||
pat | ||
matchRegex :: Regex.Regex -> Parser String | ||
matchRegex r = | ||
Parser \{ str, pos } -> | ||
let | ||
remainder = SCU.drop pos str | ||
in | ||
case NEA.head <$> Regex.match r remainder of | ||
Just (Just matched) -> | ||
Right { result: matched, suffix: { str, pos: pos + SCU.length matched } } | ||
_ -> | ||
Left { pos, error: ParseError "no match" } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
module Test.CodePoints where | ||
|
||
import Prelude hiding (between) | ||
|
||
import Control.Alt ((<|>)) | ||
import Data.Either (isLeft, isRight, Either(..)) | ||
import Data.Foldable (fold) | ||
import Data.List (List(Nil), (:)) | ||
import Data.List.Lazy (take, repeat) | ||
import Data.List.NonEmpty (NonEmptyList(..)) | ||
import Data.NonEmpty ((:|)) | ||
import Data.String.CodeUnits (singleton) | ||
import Data.String.Common as SC | ||
import Data.Unfoldable (replicate) | ||
import Effect (Effect) | ||
import Test.Assert (assert', assert) | ||
import Text.Parsing.StringParser (Parser, runParser, try) | ||
import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) | ||
import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) | ||
import Text.Parsing.StringParser.CodePoints (anyDigit, eof, string, anyChar, regex) | ||
|
||
parens :: forall a. Parser a -> Parser a | ||
parens = between (string "(") (string ")") | ||
|
||
nested :: Parser Int | ||
nested = fix $ \p -> (do | ||
_ <- string "a" | ||
pure 0) <|> ((+) 1) <$> parens p | ||
|
||
opTest :: Parser String | ||
opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" | ||
|
||
digit :: Parser Int | ||
digit = string "0" $> 0 | ||
<|> string "1" $> 1 | ||
<|> string "2" $> 2 | ||
<|> string "3" $> 3 | ||
<|> string "4" $> 4 | ||
<|> string "5" $> 5 | ||
<|> string "6" $> 6 | ||
<|> string "7" $> 7 | ||
<|> string "8" $> 8 | ||
<|> string "9" $> 9 | ||
|
||
exprTest :: Parser Int | ||
exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] | ||
, [Infix (string "*" >>= \_ -> pure mul) AssocRight] | ||
, [Infix (string "-" >>= \_ -> pure sub) AssocRight] | ||
, [Infix (string "+" >>= \_ -> pure add) AssocRight] | ||
] digit | ||
|
||
tryTest :: Parser String | ||
-- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching | ||
tryTest = | ||
try (string "aa" <> string "bb") <|> | ||
(string "aa" <> string "cc") | ||
|
||
canParse :: forall a. Parser a -> String -> Boolean | ||
canParse p input = isRight $ runParser p input | ||
|
||
parseFail :: forall a. Parser a -> String -> Boolean | ||
parseFail p input = isLeft $ runParser p input | ||
|
||
expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean | ||
expectResult res p input = runParser p input == Right res | ||
|
||
testCodePoints :: Effect Unit | ||
testCodePoints = do | ||
assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a") | ||
assert' "many failing after" $ parseFail (do | ||
as <- many (string "a") | ||
eof | ||
pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" ) | ||
|
||
assert $ expectResult 3 nested "(((a)))" | ||
assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" | ||
assert $ parseFail (many1 (string "a")) "" | ||
assert $ canParse (parens (do | ||
_ <- string "a" | ||
optionMaybe $ string "b")) "(ab)" | ||
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" | ||
assert $ canParse (do | ||
as <- string "a" `endBy1` string "," | ||
eof | ||
pure as) "a,a,a," | ||
assert' "opTest" $ expectResult "abc" opTest "a+b+c" | ||
assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" | ||
assert' "tryTest "$ canParse tryTest "aacc" | ||
assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" | ||
assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" | ||
assert $ expectResult "aaaa" (regex "a+") "aaaab" | ||
assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" | ||
assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" | ||
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" | ||
assert $ parseFail (many1Till (string "a") (string "b")) "b" | ||
-- check against overflow | ||
assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" | ||
-- check correct order | ||
assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
module Test.CodeUnits where | ||
|
||
import Prelude hiding (between) | ||
|
||
import Control.Alt ((<|>)) | ||
import Data.Either (isLeft, isRight, Either(..)) | ||
import Data.Foldable (fold) | ||
import Data.List (List(Nil), (:)) | ||
import Data.List.Lazy (take, repeat) | ||
import Data.List.NonEmpty (NonEmptyList(..)) | ||
import Data.NonEmpty ((:|)) | ||
import Data.String.CodeUnits (singleton) | ||
import Data.String.Common as SC | ||
import Data.Unfoldable (replicate) | ||
import Effect (Effect) | ||
import Test.Assert (assert', assert) | ||
import Text.Parsing.StringParser (Parser, runParser, try) | ||
import Text.Parsing.StringParser.Combinators (many1, endBy1, sepBy1, optionMaybe, many, manyTill, many1Till, chainl, fix, between) | ||
import Text.Parsing.StringParser.Expr (Assoc(..), Operator(..), buildExprParser) | ||
import Text.Parsing.StringParser.CodeUnits (anyDigit, eof, string, anyChar, regex) | ||
|
||
parens :: forall a. Parser a -> Parser a | ||
parens = between (string "(") (string ")") | ||
|
||
nested :: Parser Int | ||
nested = fix $ \p -> (do | ||
_ <- string "a" | ||
pure 0) <|> ((+) 1) <$> parens p | ||
|
||
opTest :: Parser String | ||
opTest = chainl (singleton <$> anyChar) (string "+" $> append) "" | ||
|
||
digit :: Parser Int | ||
digit = string "0" $> 0 | ||
<|> string "1" $> 1 | ||
<|> string "2" $> 2 | ||
<|> string "3" $> 3 | ||
<|> string "4" $> 4 | ||
<|> string "5" $> 5 | ||
<|> string "6" $> 6 | ||
<|> string "7" $> 7 | ||
<|> string "8" $> 8 | ||
<|> string "9" $> 9 | ||
|
||
exprTest :: Parser Int | ||
exprTest = buildExprParser [ [Infix (string "/" >>= \_ -> pure div) AssocRight] | ||
, [Infix (string "*" >>= \_ -> pure mul) AssocRight] | ||
, [Infix (string "-" >>= \_ -> pure sub) AssocRight] | ||
, [Infix (string "+" >>= \_ -> pure add) AssocRight] | ||
] digit | ||
|
||
tryTest :: Parser String | ||
-- reduce the possible array of matches to 0 or 1 elements to aid Array pattern matching | ||
tryTest = | ||
try (string "aa" <> string "bb") <|> | ||
(string "aa" <> string "cc") | ||
|
||
canParse :: forall a. Parser a -> String -> Boolean | ||
canParse p input = isRight $ runParser p input | ||
|
||
parseFail :: forall a. Parser a -> String -> Boolean | ||
parseFail p input = isLeft $ runParser p input | ||
|
||
expectResult :: forall a. Eq a => a -> Parser a -> String -> Boolean | ||
expectResult res p input = runParser p input == Right res | ||
|
||
testCodeUnits :: Effect Unit | ||
testCodeUnits = do | ||
assert' "many should not blow the stack" $ canParse (many (string "a")) (SC.joinWith "" $ replicate 100000 "a") | ||
assert' "many failing after" $ parseFail (do | ||
as <- many (string "a") | ||
eof | ||
pure as) (SC.joinWith "" (replicate 100000 "a") <> "b" ) | ||
|
||
assert $ expectResult 3 nested "(((a)))" | ||
assert $ expectResult ("a":"a":"a":Nil) (many (string "a")) "aaa" | ||
assert $ parseFail (many1 (string "a")) "" | ||
assert $ canParse (parens (do | ||
_ <- string "a" | ||
optionMaybe $ string "b")) "(ab)" | ||
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (string "a" `sepBy1` string ",") "a,a,a" | ||
assert $ canParse (do | ||
as <- string "a" `endBy1` string "," | ||
eof | ||
pure as) "a,a,a," | ||
assert' "opTest" $ expectResult "abc" opTest "a+b+c" | ||
assert' "exprTest" $ expectResult (-3) exprTest "1*2+3/4-5" | ||
assert' "tryTest "$ canParse tryTest "aacc" | ||
assert $ expectResult (NonEmptyList ('0' :| '1':'2':'3':'4':Nil)) (many1 anyDigit) "01234/" | ||
assert $ expectResult (NonEmptyList ('5' :| '6':'7':'8':'9':Nil)) (many1 anyDigit) "56789:" | ||
assert $ expectResult "aaaa" (regex "a+") "aaaab" | ||
assert $ expectResult ("a":"a":"a":Nil) (manyTill (string "a") (string "b")) "aaab" | ||
assert $ expectResult Nil (manyTill (string "a") (string "b")) "b" | ||
assert $ expectResult (NonEmptyList ("a" :| "a":"a":Nil)) (many1Till (string "a") (string "b")) "aaab" | ||
assert $ parseFail (many1Till (string "a") (string "b")) "b" | ||
-- check against overflow | ||
assert $ canParse (many1Till (string "a") (string "and")) $ (fold <<< take 10000 $ repeat "a") <> "and" | ||
-- check correct order | ||
assert $ expectResult (NonEmptyList ('a' :| 'b':'c':Nil)) (many1Till anyChar (string "d")) "abcd" |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These will probably cause issues
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also any combinator with "Char" below should be removed/replaced in some way?