|
1 | 1 | -- | Primitive parsers for working with an input stream of type `String`.
|
2 |
| - |
3 |
| -module Text.Parsing.Parser.String where |
| 2 | +-- | In most JavaScript runtime environments, the `String` is encoded |
| 3 | +-- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but |
| 4 | +-- | these primitive parsers should work with any runtime encoding. |
| 5 | +-- | |
| 6 | +-- | The primitive parsers which return `Char` will only succeed when the character |
| 7 | +-- | being parsed is a code point in the |
| 8 | +-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane) |
| 9 | +-- | (the “BMP”). These parsers can be convenient because of the good support |
| 10 | +-- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`. |
| 11 | +-- | |
| 12 | +-- | The other primitive parsers, which return `CodePoint` and `String` types, |
| 13 | +-- | can parse the full Unicode character set. All of the primitive parsers |
| 14 | +-- | in this module can be used together. |
| 15 | +module Text.Parsing.Parser.String |
| 16 | +( string |
| 17 | +, eof |
| 18 | +, anyChar |
| 19 | +, anyCodePoint |
| 20 | +, satisfy |
| 21 | +, satisfyCodePoint |
| 22 | +, char |
| 23 | +, whiteSpace |
| 24 | +, skipSpaces |
| 25 | +, oneOf |
| 26 | +, noneOf |
| 27 | +, match |
| 28 | +, digit |
| 29 | +, hexDigit |
| 30 | +, octDigit |
| 31 | +, upper |
| 32 | +, space |
| 33 | +, letter |
| 34 | +, alphaNum |
| 35 | +) |
| 36 | +where |
4 | 37 |
|
5 | 38 | import Prelude hiding (between)
|
6 | 39 |
|
7 |
| -import Control.Monad.State (gets, modify_) |
8 |
| -import Data.Array (many) |
9 |
| -import Data.Foldable (elem, notElem) |
| 40 | +import Control.Monad.State (get, put) |
| 41 | +import Data.Array (notElem) |
| 42 | +import Data.Char (fromCharCode) |
| 43 | +import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper) |
| 44 | +import Data.Foldable (elem) |
10 | 45 | import Data.Maybe (Maybe(..))
|
11 |
| -import Data.Newtype (wrap) |
12 |
| -import Data.String (Pattern) |
13 |
| -import Data.String as S |
| 46 | +import Data.String (CodePoint, Pattern(..), codePointFromChar, null, stripPrefix, uncons) |
14 | 47 | import Data.String.CodeUnits as SCU
|
| 48 | +import Data.Tuple (Tuple(..), fst) |
15 | 49 | import Text.Parsing.Parser (ParseState(..), ParserT, fail)
|
16 |
| -import Text.Parsing.Parser.Combinators (tryRethrow, (<?>)) |
17 |
| -import Text.Parsing.Parser.Pos (updatePosString) |
18 |
| - |
19 |
| --- | This class exists to abstract over streams which support the string-like |
20 |
| --- | operations which this modules needs. |
21 |
| -class StringLike s where |
22 |
| - drop :: Int -> s -> s |
23 |
| - stripPrefix :: Pattern -> s -> Maybe s |
24 |
| - null :: s -> Boolean |
25 |
| - uncons :: s -> Maybe { head :: Char, tail :: s } |
26 |
| - |
27 |
| -instance stringLikeString :: StringLike String where |
28 |
| - uncons = SCU.uncons |
29 |
| - drop = S.drop |
30 |
| - stripPrefix = S.stripPrefix |
31 |
| - null = S.null |
| 50 | +import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>)) |
| 51 | +import Text.Parsing.Parser.Pos (Position(..)) |
| 52 | +import Unsafe.Coerce (unsafeCoerce) |
32 | 53 |
|
33 | 54 | -- | Match end-of-file.
|
34 |
| -eof :: forall s m. StringLike s => Monad m => ParserT s m Unit |
| 55 | +eof :: forall m. Monad m => ParserT String m Unit |
35 | 56 | eof = do
|
36 |
| - input <- gets \(ParseState input _ _) -> input |
| 57 | + ParseState input _ _ <- get |
37 | 58 | unless (null input) (fail "Expected EOF")
|
38 | 59 |
|
39 | 60 | -- | Match the specified string.
|
40 |
| -string :: forall s m. StringLike s => Monad m => String -> ParserT s m String |
| 61 | +string :: forall m. Monad m => String -> ParserT String m String |
41 | 62 | string str = do
|
42 |
| - input <- gets \(ParseState input _ _) -> input |
43 |
| - case stripPrefix (wrap str) input of |
| 63 | + ParseState input position _ <- get |
| 64 | + case stripPrefix (Pattern str) input of |
44 | 65 | Just remainder -> do
|
45 |
| - modify_ \(ParseState _ position _) -> |
46 |
| - ParseState remainder |
47 |
| - (updatePosString position str) |
48 |
| - true |
| 66 | + put $ ParseState remainder (updatePosString position str) true |
49 | 67 | pure str
|
50 | 68 | _ -> fail ("Expected " <> show str)
|
51 | 69 |
|
52 |
| --- | Match any character. |
53 |
| -anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char |
54 |
| -anyChar = do |
55 |
| - input <- gets \(ParseState input _ _) -> input |
| 70 | +-- | Match any BMP `Char`. |
| 71 | +-- | Parser will fail if the character is not in the Basic Multilingual Plane. |
| 72 | +anyChar :: forall m. Monad m => ParserT String m Char |
| 73 | +anyChar = tryRethrow do |
| 74 | + cp :: Int <- unsafeCoerce <$> anyCodePoint |
| 75 | + -- the `fromCharCode` function doesn't check if this is beyond the |
| 76 | + -- BMP, so we check that ourselves. |
| 77 | + -- https://github.com/purescript/purescript-strings/issues/153 |
| 78 | + if cp > 65535 -- BMP |
| 79 | + then fail "Not a Char" |
| 80 | + else case fromCharCode cp of |
| 81 | + Nothing -> fail "Not a Char" |
| 82 | + Just c -> pure c |
| 83 | + |
| 84 | +-- | Match any Unicode character. |
| 85 | +-- | Always succeeds. |
| 86 | +anyCodePoint :: forall m. Monad m => ParserT String m CodePoint |
| 87 | +anyCodePoint = do |
| 88 | + -- ParseState input _ _ <- get |
| 89 | + ParseState input position _ <- get |
56 | 90 | case uncons input of
|
57 | 91 | Nothing -> fail "Unexpected EOF"
|
58 | 92 | Just { head, tail } -> do
|
59 |
| - modify_ \(ParseState _ position _) -> |
60 |
| - ParseState tail |
61 |
| - (updatePosString position (SCU.singleton head)) |
62 |
| - true |
| 93 | + put $ ParseState tail (updatePosSingle position head) true |
63 | 94 | pure head
|
64 | 95 |
|
65 |
| --- | Match a character satisfying the specified predicate. |
66 |
| -satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char |
| 96 | +-- | Match a BMP `Char` satisfying the predicate. |
| 97 | +satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char |
67 | 98 | satisfy f = tryRethrow do
|
68 | 99 | c <- anyChar
|
69 |
| - if f c then pure c |
70 |
| - else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate" |
| 100 | + if f c |
| 101 | + then pure c |
| 102 | + else fail "Predicate unsatisfied" |
71 | 103 |
|
72 |
| --- | Match the specified character |
73 |
| -char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char |
| 104 | +-- | Match a Unicode character satisfying the predicate. |
| 105 | +satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint |
| 106 | +satisfyCodePoint f = tryRethrow do |
| 107 | + c <- anyCodePoint |
| 108 | + if f c |
| 109 | + then pure c |
| 110 | + else fail "Predicate unsatisfied" |
| 111 | + |
| 112 | +-- | Match the specified BMP `Char`. |
| 113 | +char :: forall m. Monad m => Char -> ParserT String m Char |
74 | 114 | char c = satisfy (_ == c) <?> show c
|
75 | 115 |
|
76 |
| --- | Match zero or more whitespace characters. |
77 |
| -whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String |
78 |
| -whiteSpace = do |
79 |
| - cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t' |
80 |
| - pure $ SCU.fromCharArray cs |
| 116 | +-- | Match zero or more whitespace characters satisfying |
| 117 | +-- | `Data.CodePoint.Unicode.isSpace`. |
| 118 | +whiteSpace :: forall m. Monad m => ParserT String m String |
| 119 | +whiteSpace = fst <$> match skipSpaces |
81 | 120 |
|
82 | 121 | -- | Skip whitespace characters.
|
83 |
| -skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit |
84 |
| -skipSpaces = void whiteSpace |
| 122 | +skipSpaces :: forall m. Monad m => ParserT String m Unit |
| 123 | +skipSpaces = skipMany (satisfyCodePoint isSpace) |
85 | 124 |
|
86 |
| --- | Match one of the characters in the array. |
87 |
| -oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char |
| 125 | +-- | Match one of the BMP `Char`s in the array. |
| 126 | +oneOf :: forall m. Monad m => Array Char -> ParserT String m Char |
88 | 127 | oneOf ss = satisfy (flip elem ss) <?> ("one of " <> show ss)
|
89 | 128 |
|
90 |
| --- | Match any character not in the array. |
91 |
| -noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char |
| 129 | +-- | Match any BMP `Char` not in the array. |
| 130 | +noneOf :: forall m. Monad m => Array Char -> ParserT String m Char |
92 | 131 | noneOf ss = satisfy (flip notElem ss) <?> ("none of " <> show ss)
|
| 132 | + |
| 133 | +-- | Updates a `Position` by adding the columns and lines in `String`. |
| 134 | +updatePosString :: Position -> String -> Position |
| 135 | +updatePosString pos str = case uncons str of |
| 136 | + Nothing -> pos |
| 137 | + Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive |
| 138 | + |
| 139 | +-- | Updates a `Position` by adding the columns and lines in a |
| 140 | +-- | single `CodePoint`. |
| 141 | +updatePosSingle :: Position -> CodePoint -> Position |
| 142 | +updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of |
| 143 | + 10 -> Position { line: line + 1, column: 1 } -- "\n" |
| 144 | + 13 -> Position { line: line + 1, column: 1 } -- "\r" |
| 145 | + 9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns? |
| 146 | + _ -> Position { line, column: column + 1 } |
| 147 | + |
| 148 | +-- | Combinator which returns both the result of a parse and the portion of |
| 149 | +-- | the input that was consumed while it was being parsed. |
| 150 | +match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a) |
| 151 | +match p = do |
| 152 | + ParseState input1 _ _ <- get |
| 153 | + x <- p |
| 154 | + ParseState input2 _ _ <- get |
| 155 | + -- We use the `SCU.length`, which is in units of “code units” |
| 156 | + -- instead of `Data.String.length`. which is in units of “code points”. |
| 157 | + -- This is more efficient, and it will be correct as long as we can assume |
| 158 | + -- the invariant that the `ParseState input` always begins on a code point |
| 159 | + -- boundary. |
| 160 | + pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x |
| 161 | + |
| 162 | +-- Helper function. |
| 163 | +satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char |
| 164 | +satisfyCP p = satisfy (p <<< codePointFromChar) |
| 165 | + |
| 166 | +-- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`. |
| 167 | +digit :: forall m . Monad m => ParserT String m Char |
| 168 | +digit = satisfyCP isDecDigit <?> "digit" |
| 169 | + |
| 170 | +-- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`. |
| 171 | +hexDigit :: forall m . Monad m => ParserT String m Char |
| 172 | +hexDigit = satisfyCP isHexDigit <?> "hex digit" |
| 173 | + |
| 174 | +-- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`. |
| 175 | +octDigit :: forall m . Monad m => ParserT String m Char |
| 176 | +octDigit = satisfyCP isOctDigit <?> "oct digit" |
| 177 | + |
| 178 | +-- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`. |
| 179 | +upper :: forall m . Monad m => ParserT String m Char |
| 180 | +upper = satisfyCP isUpper <?> "uppercase letter" |
| 181 | + |
| 182 | +-- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`. |
| 183 | +space :: forall m . Monad m => ParserT String m Char |
| 184 | +space = satisfyCP isSpace <?> "space" |
| 185 | + |
| 186 | +-- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`. |
| 187 | +letter :: forall m . Monad m => ParserT String m Char |
| 188 | +letter = satisfyCP isAlpha <?> "letter" |
| 189 | + |
| 190 | +-- | Parse an alphabetical or numerical character. |
| 191 | +-- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`. |
| 192 | +alphaNum :: forall m . Monad m => ParserT String m Char |
| 193 | +alphaNum = satisfyCP isAlphaNum <?> "letter or digit" |
0 commit comments