1
1
-- | Primitive parsers for working with an input stream of type `String`.
2
-
3
- module Text.Parsing.Parser.String where
2
+ -- |
3
+ -- | These primitive parsers all operate on primitive `String` inputs.
4
+ -- | In most JavaScript runtime environments, the `String` is encoded
5
+ -- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
6
+ -- | these primitive parsers should work with any runtime encoding.
7
+ -- |
8
+ -- | The primitive parsers which return `Char` will only succeed when the character
9
+ -- | being parsed is a code point in the
10
+ -- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
11
+ -- | (the “BMP”). These parsers can be convenient because of the good support
12
+ -- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
13
+ -- |
14
+ -- | The other primitive parsers, which return `CodePoint` and `String` types,
15
+ -- | can parse the full Unicode character set. All of the primitive parsers
16
+ -- | in this module can be used together.
17
+ module Text.Parsing.Parser.String
18
+ ( string
19
+ , eof
20
+ , anyChar
21
+ , anyCodePoint
22
+ , satisfy
23
+ , satisfyCodePoint
24
+ , char
25
+ , whiteSpace
26
+ , skipSpaces
27
+ , oneOf
28
+ , noneOf
29
+ , match
30
+ , digit
31
+ , hexDigit
32
+ , octDigit
33
+ , upper
34
+ , space
35
+ , letter
36
+ , alphaNum
37
+ )
38
+ where
4
39
5
40
import Prelude hiding (between )
6
41
7
- import Control.Monad.State (gets , modify_ )
8
- import Data.Array (many )
9
- import Data.Foldable (elem , notElem )
42
+ import Control.Monad.State (get , gets , modify_ )
43
+ import Data.Array (notElem )
44
+ import Data.Char (fromCharCode )
45
+ import Data.CodePoint.Unicode (isAlpha , isAlphaNum , isDecDigit , isHexDigit , isOctDigit , isSpace , isUpper )
46
+ import Data.Foldable (elem )
10
47
import Data.Maybe (Maybe (..))
11
48
import Data.Newtype (wrap )
12
- import Data.String (Pattern )
13
- import Data.String as S
49
+ import Data.String (CodePoint , codePointFromChar , null , stripPrefix , uncons )
14
50
import Data.String.CodeUnits as SCU
51
+ import Data.Tuple (Tuple (..), fst )
15
52
import Text.Parsing.Parser (ParseState (..), ParserT , fail )
16
- import Text.Parsing.Parser.Combinators (tryRethrow , (<?>))
17
- import Text.Parsing.Parser.Pos (updatePosString )
18
-
19
- -- | This class exists to abstract over streams which support the string-like
20
- -- | operations which this modules needs.
21
- class StringLike s where
22
- drop :: Int -> s -> s
23
- stripPrefix :: Pattern -> s -> Maybe s
24
- null :: s -> Boolean
25
- uncons :: s -> Maybe { head :: Char , tail :: s }
26
-
27
- instance stringLikeString :: StringLike String where
28
- uncons = SCU .uncons
29
- drop = S .drop
30
- stripPrefix = S .stripPrefix
31
- null = S .null
53
+ import Text.Parsing.Parser.Combinators (skipMany , tryRethrow , (<?>))
54
+ import Text.Parsing.Parser.Pos (Position (..))
55
+ import Unsafe.Coerce (unsafeCoerce )
32
56
33
57
-- | Match end-of-file.
34
- eof :: forall s m . StringLike s => Monad m => ParserT s m Unit
58
+ eof :: forall m . Monad m => ParserT String m Unit
35
59
eof = do
36
60
input <- gets \(ParseState input _ _) -> input
37
61
unless (null input) (fail " Expected EOF" )
38
62
39
63
-- | Match the specified string.
40
- string :: forall s m . StringLike s => Monad m => String -> ParserT s m String
64
+ string :: forall m . Monad m => String -> ParserT String m String
41
65
string str = do
42
66
input <- gets \(ParseState input _ _) -> input
43
67
case stripPrefix (wrap str) input of
@@ -49,44 +73,125 @@ string str = do
49
73
pure str
50
74
_ -> fail (" Expected " <> show str)
51
75
52
- -- | Match any character.
53
- anyChar :: forall s m . StringLike s => Monad m => ParserT s m Char
54
- anyChar = do
76
+ -- | Match any BMP `Char`.
77
+ anyChar :: forall m . Monad m => ParserT String m Char
78
+ anyChar = tryRethrow do
79
+ cp :: Int <- unsafeCoerce <$> anyCodePoint
80
+ -- the `fromCharCode` function doesn't check if this is beyond the
81
+ -- BMP, so we check that ourselves.
82
+ -- https://github.com/purescript/purescript-strings/issues/153
83
+ if cp > 65535 -- BMP
84
+ then fail " Not a Char"
85
+ else case fromCharCode cp of
86
+ Nothing -> fail " Not a Char"
87
+ Just c -> pure c
88
+
89
+ -- | Match any Unicode character.
90
+ anyCodePoint :: forall m . Monad m => ParserT String m CodePoint
91
+ anyCodePoint = do
55
92
input <- gets \(ParseState input _ _) -> input
56
93
case uncons input of
57
94
Nothing -> fail " Unexpected EOF"
58
95
Just { head, tail } -> do
59
96
modify_ \(ParseState _ position _) ->
60
- ParseState tail
61
- (updatePosString position (SCU .singleton head))
62
- true
97
+ ParseState tail (updatePosSingle position head) true
63
98
pure head
64
99
65
- -- | Match a character satisfying the specified predicate.
66
- satisfy :: forall s m . StringLike s => Monad m => (Char -> Boolean ) -> ParserT s m Char
100
+ -- | Match a BMP `Char` satisfying the specified predicate.
101
+ satisfy :: forall m . Monad m => (Char -> Boolean ) -> ParserT String m Char
67
102
satisfy f = tryRethrow do
68
103
c <- anyChar
69
- if f c then pure c
70
- else fail $ " Character '" <> SCU .singleton c <> " ' did not satisfy predicate"
104
+ if f c
105
+ then pure c
106
+ else fail " Predicate unsatisfied"
107
+
108
+ -- | Match a Unicode character satisfying the specified predicate.
109
+ satisfyCodePoint :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m CodePoint
110
+ satisfyCodePoint f = tryRethrow do
111
+ c <- anyCodePoint
112
+ if f c
113
+ then pure c
114
+ else fail " Predicate unsatisfied"
71
115
72
- -- | Match the specified character
73
- char :: forall s m . StringLike s => Monad m => Char -> ParserT s m Char
116
+ -- | Match the specified BMP `Char`.
117
+ char :: forall m . Monad m => Char -> ParserT String m Char
74
118
char c = satisfy (_ == c) <?> show c
75
119
76
- -- | Match zero or more whitespace characters.
77
- whiteSpace :: forall s m . StringLike s => Monad m => ParserT s m String
78
- whiteSpace = do
79
- cs <- many $ satisfy \c -> c == ' \n ' || c == ' \r ' || c == ' ' || c == ' \t '
80
- pure $ SCU .fromCharArray cs
120
+ -- | Match zero or more whitespace characters satisfying
121
+ -- | `Data.CodePoint.Unicode.isSpace`.
122
+ whiteSpace :: forall m . Monad m => ParserT String m String
123
+ whiteSpace = fst <$> match skipSpaces
81
124
82
125
-- | Skip whitespace characters.
83
- skipSpaces :: forall s m . StringLike s => Monad m => ParserT s m Unit
84
- skipSpaces = void whiteSpace
126
+ skipSpaces :: forall m . Monad m => ParserT String m Unit
127
+ skipSpaces = skipMany (satisfyCodePoint isSpace)
85
128
86
- -- | Match one of the characters in the array.
87
- oneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
129
+ -- | Match one of the BMP `Char`s in the array.
130
+ oneOf :: forall m . Monad m => Array Char -> ParserT String m Char
88
131
oneOf ss = satisfy (flip elem ss) <?> (" one of " <> show ss)
89
132
90
- -- | Match any character not in the array.
91
- noneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
133
+ -- | Match any BMP `Char` not in the array.
134
+ noneOf :: forall m . Monad m => Array Char -> ParserT String m Char
92
135
noneOf ss = satisfy (flip notElem ss) <?> (" none of " <> show ss)
136
+
137
+ -- | Updates a `Position` by adding the columns and lines in `String`.
138
+ updatePosString :: Position -> String -> Position
139
+ updatePosString pos str = case uncons str of
140
+ Nothing -> pos
141
+ Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
142
+
143
+ -- | Updates a `Position` by adding the columns and lines in a
144
+ -- | single `CodePoint`.
145
+ updatePosSingle :: Position -> CodePoint -> Position
146
+ updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
147
+ 10 -> Position { line: line + 1 , column: 1 } -- "\n"
148
+ 13 -> Position { line: line + 1 , column: 1 } -- "\r"
149
+ 9 -> Position { line, column: column + 8 - ((column - 1 ) `mod` 8 ) } -- "\t" Who says that one tab is 8 columns?
150
+ _ -> Position { line, column: column + 1 }
151
+
152
+ -- | Combinator which returns both the result of a parse and the portion of
153
+ -- | the input that was consumed while it was being parsed.
154
+ match :: forall m a . Monad m => ParserT String m a -> ParserT String m (Tuple String a )
155
+ match p = do
156
+ ParseState input1 _ _ <- get
157
+ x <- p
158
+ ParseState input2 _ _ <- get
159
+ -- We use the `SCU.length`, which is in units of “code units”
160
+ -- instead of `Data.String.length`. which is in units of “code points”.
161
+ -- This is more efficient, and it will be correct as long as we can assume
162
+ -- the invariant that the `ParseState input` always begins on a code point
163
+ -- boundary.
164
+ pure $ Tuple (SCU .take (SCU .length input1 - SCU .length input2) input1) x
165
+
166
+ -- Helper function.
167
+ satisfyCP :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m Char
168
+ satisfyCP p = satisfy (p <<< codePointFromChar)
169
+
170
+ -- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
171
+ digit :: forall m . Monad m => ParserT String m Char
172
+ digit = satisfyCP isDecDigit <?> " digit"
173
+
174
+ -- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
175
+ hexDigit :: forall m . Monad m => ParserT String m Char
176
+ hexDigit = satisfyCP isHexDigit <?> " hex digit"
177
+
178
+ -- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
179
+ octDigit :: forall m . Monad m => ParserT String m Char
180
+ octDigit = satisfyCP isOctDigit <?> " oct digit"
181
+
182
+ -- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
183
+ upper :: forall m . Monad m => ParserT String m Char
184
+ upper = satisfyCP isUpper <?> " uppercase letter"
185
+
186
+ -- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
187
+ space :: forall m . Monad m => ParserT String m Char
188
+ space = satisfyCP isSpace <?> " space"
189
+
190
+ -- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
191
+ letter :: forall m . Monad m => ParserT String m Char
192
+ letter = satisfyCP isAlpha <?> " letter"
193
+
194
+ -- | Parse an alphabetical or numerical character.
195
+ -- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
196
+ alphaNum :: forall m . Monad m => ParserT String m Char
197
+ alphaNum = satisfyCP isAlphaNum <?> " letter or digit"
0 commit comments