1
1
-- | Primitive parsers for working with an input stream of type `String`.
2
-
3
- module Text.Parsing.Parser.String where
2
+ -- | In most JavaScript runtime environments, the `String` is encoded
3
+ -- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
4
+ -- | these primitive parsers should work with any runtime encoding.
5
+ -- |
6
+ -- | The primitive parsers which return `Char` will only succeed when the character
7
+ -- | being parsed is a code point in the
8
+ -- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
9
+ -- | (the “BMP”). These parsers can be convenient because of the good support
10
+ -- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
11
+ -- |
12
+ -- | The other primitive parsers, which return `CodePoint` and `String` types,
13
+ -- | can parse the full Unicode character set. All of the primitive parsers
14
+ -- | in this module can be used together.
15
+ module Text.Parsing.Parser.String
16
+ ( string
17
+ , eof
18
+ , anyChar
19
+ , anyCodePoint
20
+ , satisfy
21
+ , satisfyCodePoint
22
+ , char
23
+ , whiteSpace
24
+ , skipSpaces
25
+ , oneOf
26
+ , noneOf
27
+ , match
28
+ , digit
29
+ , hexDigit
30
+ , octDigit
31
+ , upper
32
+ , space
33
+ , letter
34
+ , alphaNum
35
+ )
36
+ where
4
37
5
38
import Prelude hiding (between )
6
39
7
- import Control.Monad.State (gets , modify_ )
8
- import Data.Array (many )
9
- import Data.Foldable (elem , notElem )
40
+ import Control.Monad.State (get , put )
41
+ import Data.Array (notElem )
42
+ import Data.Char (fromCharCode )
43
+ import Data.CodePoint.Unicode (isAlpha , isAlphaNum , isDecDigit , isHexDigit , isOctDigit , isSpace , isUpper )
44
+ import Data.Foldable (elem )
10
45
import Data.Maybe (Maybe (..))
11
- import Data.Newtype (wrap )
12
- import Data.String (Pattern )
13
- import Data.String as S
46
+ import Data.String (CodePoint , Pattern (..), codePointFromChar , null , stripPrefix , uncons )
14
47
import Data.String.CodeUnits as SCU
48
+ import Data.Tuple (Tuple (..), fst )
15
49
import Text.Parsing.Parser (ParseState (..), ParserT , fail )
16
- import Text.Parsing.Parser.Combinators (tryRethrow , (<?>))
17
- import Text.Parsing.Parser.Pos (updatePosString )
18
-
19
- -- | This class exists to abstract over streams which support the string-like
20
- -- | operations which this modules needs.
21
- class StringLike s where
22
- drop :: Int -> s -> s
23
- stripPrefix :: Pattern -> s -> Maybe s
24
- null :: s -> Boolean
25
- uncons :: s -> Maybe { head :: Char , tail :: s }
26
-
27
- instance stringLikeString :: StringLike String where
28
- uncons = SCU .uncons
29
- drop = S .drop
30
- stripPrefix = S .stripPrefix
31
- null = S .null
50
+ import Text.Parsing.Parser.Combinators (skipMany , tryRethrow , (<?>))
51
+ import Text.Parsing.Parser.Pos (Position (..))
52
+ import Unsafe.Coerce (unsafeCoerce )
32
53
33
54
-- | Match end-of-file.
34
- eof :: forall s m . StringLike s => Monad m => ParserT s m Unit
55
+ eof :: forall m . Monad m => ParserT String m Unit
35
56
eof = do
36
- input <- gets \( ParseState input _ _) -> input
57
+ ParseState input _ _ <- get
37
58
unless (null input) (fail " Expected EOF" )
38
59
39
60
-- | Match the specified string.
40
- string :: forall s m . StringLike s => Monad m => String -> ParserT s m String
61
+ string :: forall m . Monad m => String -> ParserT String m String
41
62
string str = do
42
- input <- gets \( ParseState input _ _) -> input
43
- case stripPrefix (wrap str) input of
63
+ ParseState input position _ <- get
64
+ case stripPrefix (Pattern str) input of
44
65
Just remainder -> do
45
- modify_ \(ParseState _ position _) ->
46
- ParseState remainder
47
- (updatePosString position str)
48
- true
66
+ put $ ParseState remainder (updatePosString position str) true
49
67
pure str
50
68
_ -> fail (" Expected " <> show str)
51
69
52
- -- | Match any character.
53
- anyChar :: forall s m . StringLike s => Monad m => ParserT s m Char
54
- anyChar = do
55
- input <- gets \(ParseState input _ _) -> input
70
+ -- | Match any BMP `Char`.
71
+ -- | Parser will fail if the character is not in the Basic Multilingual Plane.
72
+ anyChar :: forall m . Monad m => ParserT String m Char
73
+ anyChar = tryRethrow do
74
+ cp :: Int <- deconstructCodePoint <$> anyCodePoint
75
+ -- the `fromCharCode` function doesn't check if this is beyond the
76
+ -- BMP, so we check that ourselves.
77
+ -- https://github.com/purescript/purescript-strings/issues/153
78
+ if cp > 65535 -- BMP
79
+ then fail " Not a Char"
80
+ else case fromCharCode cp of
81
+ Nothing -> fail " Not a Char"
82
+ Just c -> pure c
83
+
84
+ -- | Match any Unicode character.
85
+ -- | Always succeeds.
86
+ anyCodePoint :: forall m . Monad m => ParserT String m CodePoint
87
+ anyCodePoint = do
88
+ ParseState input position _ <- get
56
89
case uncons input of
57
90
Nothing -> fail " Unexpected EOF"
58
91
Just { head, tail } -> do
59
- modify_ \(ParseState _ position _) ->
60
- ParseState tail
61
- (updatePosString position (SCU .singleton head))
62
- true
92
+ put $ ParseState tail (updatePosSingle position head) true
63
93
pure head
64
94
65
- -- | Match a character satisfying the specified predicate.
66
- satisfy :: forall s m . StringLike s => Monad m => (Char -> Boolean ) -> ParserT s m Char
95
+ -- | Match a BMP `Char` satisfying the predicate.
96
+ satisfy :: forall m . Monad m => (Char -> Boolean ) -> ParserT String m Char
67
97
satisfy f = tryRethrow do
68
98
c <- anyChar
69
- if f c then pure c
70
- else fail $ " Character '" <> SCU .singleton c <> " ' did not satisfy predicate"
99
+ if f c
100
+ then pure c
101
+ else fail " Predicate unsatisfied"
71
102
72
- -- | Match the specified character
73
- char :: forall s m . StringLike s => Monad m => Char -> ParserT s m Char
103
+ -- | Match a Unicode character satisfying the predicate.
104
+ satisfyCodePoint :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m CodePoint
105
+ satisfyCodePoint f = tryRethrow do
106
+ c <- anyCodePoint
107
+ if f c
108
+ then pure c
109
+ else fail " Predicate unsatisfied"
110
+
111
+ -- | Match the specified BMP `Char`.
112
+ char :: forall m . Monad m => Char -> ParserT String m Char
74
113
char c = satisfy (_ == c) <?> show c
75
114
76
- -- | Match zero or more whitespace characters.
77
- whiteSpace :: forall s m . StringLike s => Monad m => ParserT s m String
78
- whiteSpace = do
79
- cs <- many $ satisfy \c -> c == ' \n ' || c == ' \r ' || c == ' ' || c == ' \t '
80
- pure $ SCU .fromCharArray cs
115
+ -- | Match zero or more whitespace characters satisfying
116
+ -- | `Data.CodePoint.Unicode.isSpace`.
117
+ whiteSpace :: forall m . Monad m => ParserT String m String
118
+ whiteSpace = fst <$> match skipSpaces
81
119
82
120
-- | Skip whitespace characters.
83
- skipSpaces :: forall s m . StringLike s => Monad m => ParserT s m Unit
84
- skipSpaces = void whiteSpace
121
+ skipSpaces :: forall m . Monad m => ParserT String m Unit
122
+ skipSpaces = skipMany (satisfyCodePoint isSpace)
85
123
86
- -- | Match one of the characters in the array.
87
- oneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
124
+ -- | Match one of the BMP `Char`s in the array.
125
+ oneOf :: forall m . Monad m => Array Char -> ParserT String m Char
88
126
oneOf ss = satisfy (flip elem ss) <?> (" one of " <> show ss)
89
127
90
- -- | Match any character not in the array.
91
- noneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
128
+ -- | Match any BMP `Char` not in the array.
129
+ noneOf :: forall m . Monad m => Array Char -> ParserT String m Char
92
130
noneOf ss = satisfy (flip notElem ss) <?> (" none of " <> show ss)
131
+
132
+ -- | Updates a `Position` by adding the columns and lines in `String`.
133
+ updatePosString :: Position -> String -> Position
134
+ updatePosString pos str = case uncons str of
135
+ Nothing -> pos
136
+ Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
137
+
138
+ -- | Updates a `Position` by adding the columns and lines in a
139
+ -- | single `CodePoint`.
140
+ updatePosSingle :: Position -> CodePoint -> Position
141
+ updatePosSingle (Position {line,column}) cp = case deconstructCodePoint cp of
142
+ 10 -> Position { line: line + 1 , column: 1 } -- "\n"
143
+ 13 -> Position { line: line + 1 , column: 1 } -- "\r"
144
+ 9 -> Position { line, column: column + 8 - ((column - 1 ) `mod` 8 ) } -- "\t" Who says that one tab is 8 columns?
145
+ _ -> Position { line, column: column + 1 }
146
+
147
+ -- | Combinator which returns both the result of a parse and the portion of
148
+ -- | the input that was consumed while it was being parsed.
149
+ match :: forall m a . Monad m => ParserT String m a -> ParserT String m (Tuple String a )
150
+ match p = do
151
+ ParseState input1 _ _ <- get
152
+ x <- p
153
+ ParseState input2 _ _ <- get
154
+ -- We use the `SCU.length`, which is in units of “code units”
155
+ -- instead of `Data.String.length`. which is in units of “code points”.
156
+ -- This is more efficient, and it will be correct as long as we can assume
157
+ -- the invariant that the `ParseState input` always begins on a code point
158
+ -- boundary.
159
+ pure $ Tuple (SCU .take (SCU .length input1 - SCU .length input2) input1) x
160
+
161
+ -- Helper function.
162
+ satisfyCP :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m Char
163
+ satisfyCP p = satisfy (p <<< codePointFromChar)
164
+
165
+ -- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
166
+ digit :: forall m . Monad m => ParserT String m Char
167
+ digit = satisfyCP isDecDigit <?> " digit"
168
+
169
+ -- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
170
+ hexDigit :: forall m . Monad m => ParserT String m Char
171
+ hexDigit = satisfyCP isHexDigit <?> " hex digit"
172
+
173
+ -- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
174
+ octDigit :: forall m . Monad m => ParserT String m Char
175
+ octDigit = satisfyCP isOctDigit <?> " oct digit"
176
+
177
+ -- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
178
+ upper :: forall m . Monad m => ParserT String m Char
179
+ upper = satisfyCP isUpper <?> " uppercase letter"
180
+
181
+ -- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
182
+ space :: forall m . Monad m => ParserT String m Char
183
+ space = satisfyCP isSpace <?> " space"
184
+
185
+ -- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
186
+ letter :: forall m . Monad m => ParserT String m Char
187
+ letter = satisfyCP isAlpha <?> " letter"
188
+
189
+ -- | Parse an alphabetical or numerical character.
190
+ -- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
191
+ alphaNum :: forall m . Monad m => ParserT String m Char
192
+ alphaNum = satisfyCP isAlphaNum <?> " letter or digit"
193
+
194
+ -- The CodePoint newtype constructor is not exported, so here's a helper.
195
+ -- This will break at runtime if the definition of CodePoint ever changes
196
+ -- to something other than `newtype CodePoint = CodePoint Int`.
197
+ deconstructCodePoint :: CodePoint -> Int
198
+ deconstructCodePoint = unsafeCoerce
0 commit comments