1
1
-- | Primitive parsers for working with an input stream of type `String`.
2
-
3
- module Text.Parsing.Parser.String where
2
+ -- |
3
+ -- | These primitive parsers all operate on primitive `String` inputs.
4
+ -- | In most JavaScript runtime environments, the `String` is encoded
5
+ -- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
6
+ -- | these primitive parsers should work with any runtime encoding.
7
+ -- |
8
+ -- | The primitive parsers which return `Char` will only succeed when the character
9
+ -- | being parsed is a code point in the
10
+ -- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
11
+ -- | (the “BMP”). These parsers can be convenient because of the good support
12
+ -- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
13
+ -- |
14
+ -- | The other primitive parsers, which return `CodePoint` and `String` types,
15
+ -- | can parse the full Unicode character set. All of the primitive parsers
16
+ -- | in this module can be used together.
17
+ module Text.Parsing.Parser.String
18
+ ( string
19
+ , eof
20
+ , anyChar
21
+ , anyCodePoint
22
+ , satisfy
23
+ , satisfyCodePoint
24
+ , char
25
+ , whiteSpace
26
+ , skipSpaces
27
+ , oneOf
28
+ , noneOf
29
+ , match
30
+ , digit
31
+ , hexDigit
32
+ , octDigit
33
+ , upper
34
+ , space
35
+ , letter
36
+ , alphaNum
37
+ )
38
+ where
4
39
5
40
import Prelude hiding (between )
6
41
7
- import Control.Monad.State (gets , modify_ )
8
- import Data.Array (many )
9
- import Data.Foldable (elem , notElem )
42
+ import Control.Monad.State (get , gets , modify_ )
43
+ import Data.Array (notElem )
44
+ import Data.Char (fromCharCode )
45
+ import Data.CodePoint.Unicode (isAlpha , isAlphaNum , isDecDigit , isHexDigit , isOctDigit , isSpace , isUpper )
46
+ import Data.Foldable (elem )
10
47
import Data.Maybe (Maybe (..))
11
48
import Data.Newtype (wrap )
12
- import Data.String (Pattern )
13
- import Data.String as S
49
+ import Data.String (CodePoint , codePointFromChar , null , stripPrefix , uncons )
14
50
import Data.String.CodeUnits as SCU
51
+ import Data.Tuple (Tuple (..), fst )
15
52
import Text.Parsing.Parser (ParseState (..), ParserT , fail )
16
- import Text.Parsing.Parser.Combinators (tryRethrow , (<?>))
17
- import Text.Parsing.Parser.Pos (updatePosString )
18
-
19
- -- | This class exists to abstract over streams which support the string-like
20
- -- | operations which this modules needs.
21
- class StringLike s where
22
- drop :: Int -> s -> s
23
- stripPrefix :: Pattern -> s -> Maybe s
24
- null :: s -> Boolean
25
- uncons :: s -> Maybe { head :: Char , tail :: s }
26
-
27
- instance stringLikeString :: StringLike String where
28
- uncons = SCU .uncons
29
- drop = S .drop
30
- stripPrefix = S .stripPrefix
31
- null = S .null
53
+ import Text.Parsing.Parser.Combinators (skipMany , tryRethrow , (<?>))
54
+ import Text.Parsing.Parser.Pos (Position (..))
55
+ import Unsafe.Coerce (unsafeCoerce )
32
56
33
57
-- | Match end-of-file.
34
- eof :: forall s m . StringLike s => Monad m => ParserT s m Unit
58
+ eof :: forall m . Monad m => ParserT String m Unit
35
59
eof = do
36
60
input <- gets \(ParseState input _ _) -> input
37
61
unless (null input) (fail " Expected EOF" )
38
62
39
63
-- | Match the specified string.
40
- string :: forall s m . StringLike s => Monad m => String -> ParserT s m String
64
+ string :: forall m . Monad m => String -> ParserT String m String
41
65
string str = do
42
66
input <- gets \(ParseState input _ _) -> input
43
67
case stripPrefix (wrap str) input of
@@ -49,44 +73,127 @@ string str = do
49
73
pure str
50
74
_ -> fail (" Expected " <> show str)
51
75
52
- -- | Match any character.
53
- anyChar :: forall s m . StringLike s => Monad m => ParserT s m Char
54
- anyChar = do
76
+ -- | Match any BMP `Char`.
77
+ -- | Parser will fail if the character is not in the Basic Multilingual Plane.
78
+ anyChar :: forall m . Monad m => ParserT String m Char
79
+ anyChar = tryRethrow do
80
+ cp :: Int <- unsafeCoerce <$> anyCodePoint
81
+ -- the `fromCharCode` function doesn't check if this is beyond the
82
+ -- BMP, so we check that ourselves.
83
+ -- https://github.com/purescript/purescript-strings/issues/153
84
+ if cp > 65535 -- BMP
85
+ then fail " Not a Char"
86
+ else case fromCharCode cp of
87
+ Nothing -> fail " Not a Char"
88
+ Just c -> pure c
89
+
90
+ -- | Match any Unicode character.
91
+ -- | Always succeeds.
92
+ anyCodePoint :: forall m . Monad m => ParserT String m CodePoint
93
+ anyCodePoint = do
55
94
input <- gets \(ParseState input _ _) -> input
56
95
case uncons input of
57
96
Nothing -> fail " Unexpected EOF"
58
97
Just { head, tail } -> do
59
98
modify_ \(ParseState _ position _) ->
60
- ParseState tail
61
- (updatePosString position (SCU .singleton head))
62
- true
99
+ ParseState tail (updatePosSingle position head) true
63
100
pure head
64
101
65
- -- | Match a character satisfying the specified predicate.
66
- satisfy :: forall s m . StringLike s => Monad m => (Char -> Boolean ) -> ParserT s m Char
102
+ -- | Match a BMP `Char` satisfying the predicate.
103
+ satisfy :: forall m . Monad m => (Char -> Boolean ) -> ParserT String m Char
67
104
satisfy f = tryRethrow do
68
105
c <- anyChar
69
- if f c then pure c
70
- else fail $ " Character '" <> SCU .singleton c <> " ' did not satisfy predicate"
106
+ if f c
107
+ then pure c
108
+ else fail " Predicate unsatisfied"
109
+
110
+ -- | Match a Unicode character satisfying the predicate.
111
+ satisfyCodePoint :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m CodePoint
112
+ satisfyCodePoint f = tryRethrow do
113
+ c <- anyCodePoint
114
+ if f c
115
+ then pure c
116
+ else fail " Predicate unsatisfied"
71
117
72
- -- | Match the specified character
73
- char :: forall s m . StringLike s => Monad m => Char -> ParserT s m Char
118
+ -- | Match the specified BMP `Char`.
119
+ char :: forall m . Monad m => Char -> ParserT String m Char
74
120
char c = satisfy (_ == c) <?> show c
75
121
76
- -- | Match zero or more whitespace characters.
77
- whiteSpace :: forall s m . StringLike s => Monad m => ParserT s m String
78
- whiteSpace = do
79
- cs <- many $ satisfy \c -> c == ' \n ' || c == ' \r ' || c == ' ' || c == ' \t '
80
- pure $ SCU .fromCharArray cs
122
+ -- | Match zero or more whitespace characters satisfying
123
+ -- | `Data.CodePoint.Unicode.isSpace`.
124
+ whiteSpace :: forall m . Monad m => ParserT String m String
125
+ whiteSpace = fst <$> match skipSpaces
81
126
82
127
-- | Skip whitespace characters.
83
- skipSpaces :: forall s m . StringLike s => Monad m => ParserT s m Unit
84
- skipSpaces = void whiteSpace
128
+ skipSpaces :: forall m . Monad m => ParserT String m Unit
129
+ skipSpaces = skipMany (satisfyCodePoint isSpace)
85
130
86
- -- | Match one of the characters in the array.
87
- oneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
131
+ -- | Match one of the BMP `Char`s in the array.
132
+ oneOf :: forall m . Monad m => Array Char -> ParserT String m Char
88
133
oneOf ss = satisfy (flip elem ss) <?> (" one of " <> show ss)
89
134
90
- -- | Match any character not in the array.
91
- noneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
135
+ -- | Match any BMP `Char` not in the array.
136
+ noneOf :: forall m . Monad m => Array Char -> ParserT String m Char
92
137
noneOf ss = satisfy (flip notElem ss) <?> (" none of " <> show ss)
138
+
139
+ -- | Updates a `Position` by adding the columns and lines in `String`.
140
+ updatePosString :: Position -> String -> Position
141
+ updatePosString pos str = case uncons str of
142
+ Nothing -> pos
143
+ Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
144
+
145
+ -- | Updates a `Position` by adding the columns and lines in a
146
+ -- | single `CodePoint`.
147
+ updatePosSingle :: Position -> CodePoint -> Position
148
+ updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
149
+ 10 -> Position { line: line + 1 , column: 1 } -- "\n"
150
+ 13 -> Position { line: line + 1 , column: 1 } -- "\r"
151
+ 9 -> Position { line, column: column + 8 - ((column - 1 ) `mod` 8 ) } -- "\t" Who says that one tab is 8 columns?
152
+ _ -> Position { line, column: column + 1 }
153
+
154
+ -- | Combinator which returns both the result of a parse and the portion of
155
+ -- | the input that was consumed while it was being parsed.
156
+ match :: forall m a . Monad m => ParserT String m a -> ParserT String m (Tuple String a )
157
+ match p = do
158
+ ParseState input1 _ _ <- get
159
+ x <- p
160
+ ParseState input2 _ _ <- get
161
+ -- We use the `SCU.length`, which is in units of “code units”
162
+ -- instead of `Data.String.length`. which is in units of “code points”.
163
+ -- This is more efficient, and it will be correct as long as we can assume
164
+ -- the invariant that the `ParseState input` always begins on a code point
165
+ -- boundary.
166
+ pure $ Tuple (SCU .take (SCU .length input1 - SCU .length input2) input1) x
167
+
168
+ -- Helper function.
169
+ satisfyCP :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m Char
170
+ satisfyCP p = satisfy (p <<< codePointFromChar)
171
+
172
+ -- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
173
+ digit :: forall m . Monad m => ParserT String m Char
174
+ digit = satisfyCP isDecDigit <?> " digit"
175
+
176
+ -- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
177
+ hexDigit :: forall m . Monad m => ParserT String m Char
178
+ hexDigit = satisfyCP isHexDigit <?> " hex digit"
179
+
180
+ -- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
181
+ octDigit :: forall m . Monad m => ParserT String m Char
182
+ octDigit = satisfyCP isOctDigit <?> " oct digit"
183
+
184
+ -- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
185
+ upper :: forall m . Monad m => ParserT String m Char
186
+ upper = satisfyCP isUpper <?> " uppercase letter"
187
+
188
+ -- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
189
+ space :: forall m . Monad m => ParserT String m Char
190
+ space = satisfyCP isSpace <?> " space"
191
+
192
+ -- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
193
+ letter :: forall m . Monad m => ParserT String m Char
194
+ letter = satisfyCP isAlpha <?> " letter"
195
+
196
+ -- | Parse an alphabetical or numerical character.
197
+ -- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
198
+ alphaNum :: forall m . Monad m => ParserT String m Char
199
+ alphaNum = satisfyCP isAlphaNum <?> " letter or digit"
0 commit comments