New module Text.Parsing.Parser.String.Basic

jamesdbrock · jamesdbrock · commit 0cfe422e491a · 2022-01-23T14:57:28.000+09:00
New functions
- `Parser.String.Basic.number`
- `Parser.String.Basic.intDecimal`

Moved the `Parser.Token` parsers `digit`, `hexDigit`, `octDigit`, `upper`,
`space`, `letter`, `alphaNum` into the new module `Parser.String.Basic`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,8 @@ New features:
 - `Parser.Combinators.many1Till_` (#143 by @jamesbrock)
 - `Parser.Combinators.manyTillRec_` (#143 by @jamesbrock)
 - `Parser.Combinators.many1TillRec_` (#143 by @jamesbrock)
+- `Parser.String.Basic.number` (#142 by @jamesbrock)
+- `Parser.String.Basic.intDecimal` (#142 by @jamesbrock)
 
 Bugfixes:
 
@@ -23,8 +25,11 @@ Bugfixes:
 
 Other improvements:
 
+- Moved the `Parser.Token` parsers `digit`, `hexDigit`, `octDigit`, `upper`,
+  `space`, `letter`, `alphaNum` into the new module `Parser.String.Basic`. (#142 by @jamesdbrock)
 - Documentation. (#140 by @jamesdbrock)
 - Documentation. (#143 by @jamesdbrock)
+- Documentation. (#142 by @jamesdbrock)
 
 ## [v8.1.0](https://github.com/purescript-contrib/purescript-parsing/releases/tag/v8.1.0) - 2022-01-10
 
diff --git a/README.md b/README.md
@@ -79,7 +79,7 @@ and then the parser will succeed and return `Right true`.
 
 ### More parsers
 
-There are other `String` parsers in the module `Text.Parsing.Parser.Token`, for example the parser `letter :: Parser String Char` which will accept any single alphabetic letter.
+There are other `String` parsers in the module `Text.Parsing.Parser.String.Basic`, for example the parser `letter :: Parser String Char` which will accept any single alphabetic letter.
 
 ### Parser combinators
 
diff --git a/bench/Main.purs b/bench/Main.purs
@@ -55,7 +55,7 @@ import Effect.Unsafe (unsafePerformEffect)
 import Performance.Minibench (benchWith)
 import Text.Parsing.Parser (Parser, runParser)
 import Text.Parsing.Parser.String (string)
-import Text.Parsing.Parser.Token (digit)
+import Text.Parsing.Parser.String.Basic (digit)
 import Text.Parsing.StringParser as StringParser
 import Text.Parsing.StringParser.CodePoints as StringParser.CodePoints
 import Text.Parsing.StringParser.CodeUnits as StringParser.CodeUnits
diff --git a/spago.dhall b/spago.dhall
@@ -12,6 +12,7 @@
   , "math"
   , "maybe"
   , "newtype"
+  , "numbers"
   , "prelude"
   , "strings"
   , "tailrec"
diff --git a/src/Text/Parsing/Parser/Expr.purs b/src/Text/Parsing/Parser/Expr.purs
@@ -1,3 +1,7 @@
+-- | This module is a port of the Haskell
+-- | [__Text.Parsec.Expr__](https://hackage.haskell.org/package/docs/Text-Parsec-Expr.html)
+-- | module.
+
 module Text.Parsing.Parser.Expr
   ( Assoc(..)
   , Operator(..)
diff --git a/src/Text/Parsing/Parser/Indent.purs b/src/Text/Parsing/Parser/Indent.purs
@@ -1,5 +1,7 @@
--- | This is purescript-port of Text.Parsing.Indent
--- | <https://hackage.haskell.org/package/indents-0.3.3/docs/Text-Parsec-Indent.html>, 05.07.2016.
+-- | This module is a port of the Haskell
+-- | [__Text.Parsec.Indent__](https://hackage.haskell.org/package/indents-0.3.3/docs/Text-Parsec-Indent.html)
+-- | module from 2016-05-07.
+-- |
 -- | A module to construct indentation aware parsers. Many programming
 -- | language have indentation based syntax rules e.g. python and Haskell.
 -- | This module exports combinators to create such parsers.
diff --git a/src/Text/Parsing/Parser/Language.purs b/src/Text/Parsing/Parser/Language.purs
@@ -1,3 +1,6 @@
+-- | This module is a port of the Haskell
+-- | [__Text.Parsec.Language__](https://hackage.haskell.org/package/parsec/docs/Text-Parsec-Language.html)
+-- | module.
 module Text.Parsing.Parser.Language
   ( haskellDef
   , haskell
@@ -11,7 +14,8 @@ import Prelude
 import Control.Alt ((<|>))
 import Text.Parsing.Parser (ParserT)
 import Text.Parsing.Parser.String (char, oneOf)
-import Text.Parsing.Parser.Token (GenLanguageDef(..), LanguageDef, TokenParser, alphaNum, letter, makeTokenParser, unGenLanguageDef)
+import Text.Parsing.Parser.String.Basic (alphaNum, letter)
+import Text.Parsing.Parser.Token (GenLanguageDef(..), LanguageDef, TokenParser, makeTokenParser, unGenLanguageDef)
 
 -----------------------------------------------------------
 -- Styles: haskellStyle, javaStyle
diff --git a/src/Text/Parsing/Parser/String/Basic.purs b/src/Text/Parsing/Parser/String/Basic.purs
@@ -0,0 +1,119 @@
+-- | Basic `String` parsers derived from primitive `String` parsers.
+-- |
+-- | Note: In the future, the
+-- | __noneOf__, __noneOfCodePoints__, __oneOf__, __oneOfCodePoints__, __skipSpaces__, __whiteSpace__
+-- | should be moved into this module and removed from the
+-- | __Text.Parsing.Parser.String__ module, because they are not primitive parsers.
+module Text.Parsing.Parser.String.Basic
+  ( digit
+  , hexDigit
+  , octDigit
+  , letter
+  , space
+  , upper
+  , alphaNum
+  , intDecimal
+  , number
+  , module Text.Parsing.Parser.String
+  ) where
+
+import Prelude
+
+import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
+import Data.Int as Data.Int
+import Data.Maybe (Maybe(..))
+import Data.Number (infinity, nan)
+import Data.Number as Data.Number
+import Data.String (CodePoint)
+import Data.String.CodePoints (codePointFromChar)
+import Data.Tuple (Tuple(..))
+import Text.Parsing.Parser (ParserT, fail)
+import Text.Parsing.Parser.Combinators (choice, skipMany, (<?>))
+import Text.Parsing.Parser.String (noneOf, noneOfCodePoints, oneOf, oneOfCodePoints, skipSpaces, whiteSpace)
+import Text.Parsing.Parser.String as Parser.String
+
+-- | Parse a digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
+digit :: forall m. Monad m => ParserT String m Char
+digit = satisfyCP isDecDigit <?> "digit"
+
+-- | Parse a hex digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isHexDigit`.
+hexDigit :: forall m. Monad m => ParserT String m Char
+hexDigit = satisfyCP isHexDigit <?> "hex digit"
+
+-- | Parse an octal digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isOctDigit`.
+octDigit :: forall m. Monad m => ParserT String m Char
+octDigit = satisfyCP isOctDigit <?> "oct digit"
+
+-- | Parse an uppercase letter.  Matches any char that satisfies `Data.CodePoint.Unicode.isUpper`.
+upper :: forall m. Monad m => ParserT String m Char
+upper = satisfyCP isUpper <?> "uppercase letter"
+
+-- | Parse a space character.  Matches any char that satisfies `Data.CodePoint.Unicode.isSpace`.
+space :: forall m. Monad m => ParserT String m Char
+space = satisfyCP isSpace <?> "space"
+
+-- | Parse an alphabetical character.  Matches any char that satisfies `Data.CodePoint.Unicode.isAlpha`.
+letter :: forall m. Monad m => ParserT String m Char
+letter = satisfyCP isAlpha <?> "letter"
+
+-- | Parse an alphabetical or numerical character.
+-- | Matches any char that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
+alphaNum :: forall m. Monad m => ParserT String m Char
+alphaNum = satisfyCP isAlphaNum <?> "letter or digit"
+
+-- | Parser based on the __Data.Number.fromString__ function.
+-- |
+-- | This should be the inverse of `show :: String -> Number`.
+-- |
+-- | Examples of strings which can be parsed by this parser:
+-- | * `"3"`
+-- | * `"3.0"`
+-- | * `"0.3"`
+-- | * `"-0.3"`
+-- | * `"+0.3"`
+-- | * `"-3e-1"`
+-- | * `"-3.0E-1.0"`
+-- | * `"NaN"`
+-- | * `"-Infinity"`
+number :: forall m. Monad m => ParserT String m Number
+-- TODO because the JavaScript parseFloat function will successfully parse
+-- a Number up until it doesn't understand something and then return
+-- the partially parsed Number, this parser will sometimes consume more
+-- String that it actually parses. Example "1..3" will parse as 1.0.
+-- So this needs improvement.
+number =
+  choice
+    [ Parser.String.string "Infinity" *> pure infinity
+    , Parser.String.string "+Infinity" *> pure infinity
+    , Parser.String.string "-Infinity" *> pure (negate infinity)
+    , Parser.String.string "NaN" *> pure nan
+    , do
+        Tuple section _ <- Parser.String.match do
+          _ <- Parser.String.oneOf [ '+', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
+          skipMany $ Parser.String.oneOf [ 'e', 'E', '+', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
+        -- https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/parseFloat
+        case Data.Number.fromString section of
+          Nothing -> fail $ "Could not parse Number " <> section
+          Just x -> pure x
+    ]
+
+-- | Parser based on the __Data.Int.fromString__ function.
+-- |
+-- | This should be the inverse of `show :: String -> Int`.
+-- |
+-- | Examples of strings which can be parsed by this parser:
+-- | * `"3"`
+-- | * `"-3"`
+-- | * `"+300"`
+intDecimal :: forall m. Monad m => ParserT String m Int
+intDecimal = do
+  Tuple section _ <- Parser.String.match do
+    _ <- Parser.String.oneOf [ '+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
+    skipMany $ Parser.String.oneOf [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ]
+  case Data.Int.fromString section of
+    Nothing -> fail $ "Could not parse Int " <> section
+    Just x -> pure x
+
+-- | Helper function
+satisfyCP :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m Char
+satisfyCP p = Parser.String.satisfy (p <<< codePointFromChar)
diff --git a/src/Text/Parsing/Parser/Token.purs b/src/Text/Parsing/Parser/Token.purs
@@ -1,4 +1,8 @@
 -- | Functions for working with streams of tokens.
+-- |
+-- | This module is a port of the Haskell
+-- | [__Text.Parsec.Token__](https://hackage.haskell.org/package/docs/Text-Parsec-Token.html)
+-- | module.
 
 module Text.Parsing.Parser.Token
   ( token
@@ -11,14 +15,7 @@ module Text.Parsing.Parser.Token
   , TokenParser
   , GenTokenParser
   , makeTokenParser
-  -- should these be exported?  Maybe they should go in a different module?
-  , digit
-  , hexDigit
-  , octDigit
-  , upper
-  , space
-  , letter
-  , alphaNum
+  , module Text.Parsing.Parser.String.Basic
   ) where
 
 import Prelude hiding (between, when)
@@ -28,7 +25,7 @@ import Control.Monad.State (get, gets, modify_)
 import Control.MonadPlus (guard, (<|>))
 import Data.Array as Array
 import Data.Char (fromCharCode, toCharCode)
-import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
+import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isSpace)
 import Data.Either (Either(..))
 import Data.Foldable (foldl, foldr)
 import Data.Identity (Identity)
@@ -37,7 +34,7 @@ import Data.List (List(..))
 import Data.List as List
 import Data.List.NonEmpty (NonEmptyList)
 import Data.Maybe (Maybe(..), maybe)
-import Data.String (CodePoint, null, toLower)
+import Data.String (null, toLower)
 import Data.String.CodePoints (codePointFromChar)
 import Data.String.CodeUnits (singleton, toChar) as CodeUnits
 import Data.String.CodeUnits as SCU
@@ -48,6 +45,8 @@ import Text.Parsing.Parser (ParseState(..), ParserT, consume, fail)
 import Text.Parsing.Parser.Combinators (between, choice, notFollowedBy, option, sepBy, sepBy1, skipMany, skipMany1, try, tryRethrow, (<?>), (<??>))
 import Text.Parsing.Parser.Pos (Position)
 import Text.Parsing.Parser.String (char, noneOf, oneOf, satisfy, satisfyCodePoint, string)
+import Text.Parsing.Parser.String.Basic as Basic
+import Text.Parsing.Parser.String.Basic (digit, hexDigit, octDigit, upper, space, letter, alphaNum)
 
 -- | A parser which returns the first token in the stream.
 token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a
@@ -475,7 +474,7 @@ makeTokenParser (LanguageDef languageDef) =
   escapeEmpty = char '&'
 
   escapeGap :: ParserT String m Char
-  escapeGap = Array.some space *> char '\\' <?> "end of string gap"
+  escapeGap = Array.some Basic.space *> char '\\' <?> "end of string gap"
 
   -- -- escape codes
   escapeCode :: ParserT String m Char
@@ -485,16 +484,16 @@ makeTokenParser (LanguageDef languageDef) =
   charControl :: ParserT String m Char
   charControl = do
     _ <- char '^'
-    code <- upper
+    code <- Basic.upper
     case fromCharCode (toCharCode code - toCharCode 'A' + 1) of
       Just c -> pure c
       Nothing -> fail "invalid character code (should not happen)"
 
   charNum :: ParserT String m Char
   charNum = do
     code <- decimal
-      <|> (char 'o' *> number 8 octDigit)
-      <|> (char 'x' *> number 16 hexDigit)
+      <|> (char 'o' *> number 8 Basic.octDigit)
+      <|> (char 'x' *> number 16 Basic.hexDigit)
     if code > 0x10FFFF then fail "invalid escape sequence"
     else case fromCharCode code of
       Just c -> pure c
@@ -646,7 +645,7 @@ makeTokenParser (LanguageDef languageDef) =
   fraction :: ParserT String m Number
   fraction = "fraction" <??> do
     _ <- char '.'
-    digits <- Array.some digit <?> "fraction"
+    digits <- Array.some Basic.digit <?> "fraction"
     maybe (fail "not digit") pure $ foldr op (Just 0.0) digits
     where
     op :: Char -> Maybe Number -> Maybe Number
@@ -688,13 +687,13 @@ makeTokenParser (LanguageDef languageDef) =
       (hexadecimal <|> octal <|> decimal <|> pure 0) <?> ""
 
   decimal :: ParserT String m Int
-  decimal = number 10 digit
+  decimal = number 10 Basic.digit
 
   hexadecimal :: ParserT String m Int
-  hexadecimal = oneOf [ 'x', 'X' ] *> number 16 hexDigit
+  hexadecimal = oneOf [ 'x', 'X' ] *> number 16 Basic.hexDigit
 
   octal :: ParserT String m Int
-  octal = oneOf [ 'o', 'O' ] *> number 8 octDigit
+  octal = oneOf [ 'o', 'O' ] *> number 8 Basic.octDigit
 
   number :: Int -> ParserT String m Char -> ParserT String m Int
   number base baseDigit = do
@@ -878,38 +877,3 @@ inCommentSingle (LanguageDef languageDef) =
   startEnd :: Array Char
   startEnd = SCU.toCharArray languageDef.commentEnd <> SCU.toCharArray languageDef.commentStart
 
--------------------------------------------------------------------------
--- Helper functions that should maybe go in Text.Parsing.Parser.String --
--------------------------------------------------------------------------
-
-satisfyCP :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m Char
-satisfyCP p = satisfy (p <<< codePointFromChar)
-
--- | Parse a digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
-digit :: forall m. Monad m => ParserT String m Char
-digit = satisfyCP isDecDigit <?> "digit"
-
--- | Parse a hex digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isHexDigit`.
-hexDigit :: forall m. Monad m => ParserT String m Char
-hexDigit = satisfyCP isHexDigit <?> "hex digit"
-
--- | Parse an octal digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isOctDigit`.
-octDigit :: forall m. Monad m => ParserT String m Char
-octDigit = satisfyCP isOctDigit <?> "oct digit"
-
--- | Parse an uppercase letter.  Matches any char that satisfies `Data.CodePoint.Unicode.isUpper`.
-upper :: forall m. Monad m => ParserT String m Char
-upper = satisfyCP isUpper <?> "uppercase letter"
-
--- | Parse a space character.  Matches any char that satisfies `Data.CodePoint.Unicode.isSpace`.
-space :: forall m. Monad m => ParserT String m Char
-space = satisfyCP isSpace <?> "space"
-
--- | Parse an alphabetical character.  Matches any char that satisfies `Data.CodePoint.Unicode.isAlpha`.
-letter :: forall m. Monad m => ParserT String m Char
-letter = satisfyCP isAlpha <?> "letter"
-
--- | Parse an alphabetical or numerical character.
--- | Matches any char that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
-alphaNum :: forall m. Monad m => ParserT String m Char
-alphaNum = satisfyCP isAlphaNum <?> "letter or digit"
diff --git a/test/Main.purs b/test/Main.purs
@@ -11,6 +11,7 @@ import Data.List (List(..), fromFoldable, many)
 import Data.List.NonEmpty (cons, cons')
 import Data.List.NonEmpty as NE
 import Data.Maybe (Maybe(..), fromJust)
+import Data.Number (infinity, isNaN)
 import Data.String.CodePoints as SCP
 import Data.String.CodeUnits (fromCharArray, singleton)
 import Data.String.CodeUnits as SCU
@@ -25,7 +26,8 @@ import Text.Parsing.Parser.Expr (Assoc(..), Operator(..), buildExprParser)
 import Text.Parsing.Parser.Language (haskellDef, haskellStyle, javaStyle)
 import Text.Parsing.Parser.Pos (Position(..), initialPos)
 import Text.Parsing.Parser.String (anyChar, anyCodePoint, char, eof, noneOfCodePoints, oneOfCodePoints, rest, satisfy, string, takeN, whiteSpace)
-import Text.Parsing.Parser.Token (TokenParser, letter, makeTokenParser, match, token, when)
+import Text.Parsing.Parser.String.Basic (intDecimal, number, letter)
+import Text.Parsing.Parser.Token (TokenParser, makeTokenParser, match, token, when)
 import Text.Parsing.Parser.Token as Parser.Token
 
 parens :: forall m a. Monad m => ParserT String m a -> ParserT String m a
@@ -655,6 +657,32 @@ main = do
   parseErrorTestPosition (string "a\nb\nc\n" *> eof) "a\nb\nc\nd\n" (Position { column: 1, line: 4 })
   parseErrorTestPosition (string "\ta" *> eof) "\tab" (Position { column: 10, line: 1 })
 
+  parseTest "Infinity" infinity number
+  parseTest "+Infinity" infinity number
+  parseTest "-Infinity" (negate infinity) number
+  parseErrorTestPosition number "+xxx" (mkPos 2)
+
+  parseTest "-3.0E-1.0" (-0.3) number
+
+  -- test from issue #73
+  parseTest "0.7531531167929774" 0.7531531167929774 number
+
+  -- test from issue #115
+  parseTest "-6.0" (-6.0) number
+  parseTest "+6.0" (6.0) number
+
+  -- we can't test "NaN" with `parseTest` because nan doesn't compare equal
+  case runParser "NaN" number of
+    Right actual -> do
+      assert' ("expected: NaN, actual: " <> show actual) (isNaN actual)
+      logShow actual
+    Left err -> assert' ("error: " <> show err) false
+
+  -- TODO This shows the current limitations of the number parser. Ideally this parse should fail.
+  parseTest "1..3" 1.0 $ number <* eof
+
+  parseTest "-300" (-300) intDecimal
+
   stackSafeLoopsTest
 
   tokenParserIdentifierTest