Skip to content

Commit 0b605a7

Browse files
committed
Unicode correctness
Correctly handle UTF-16 surrogate pairs in `String`s. We are not quite making the default `CodePoint`, as was discussed in purescript-contrib#76 (comment) . Rather we are keeping most of the current API and making it work properly with astral Unicode. We keep the `Char` parsers for ergonomic reasons. For example the parser `char :: forall s m. Monad m => Char -> ParserT s m Char`. This parser is usually called with a literal like `char 'a'`. It would be annoying to call this parser with `char (codePointFromChar 'a')`. Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing `CodePoint`s. To make this library handle Unicode correctly, it is necessary to delete the `StringLike` class. `StringLike` has no laws, and during the five years of its life, no-one on Github has ever written another instance of `StringLike`. https://github.com/search?l=&q=StringLike+language%3APureScript&type=code Move `updatePosString` to the `Text.Parsing.Parser.String` module and don't export it. Add the `match` combinator. Change the definition of `whiteSpace` and `skipSpaces` to `Data.CodePoint.Unicode.isSpace`. Move the character class parsers from `Text.Parsing.Parser.Token` module into the `Text.Parsing.Parser.String` module. All prior tests pass with no modifications. Add a few new tests.
1 parent 61db534 commit 0b605a7

File tree

6 files changed

+191
-132
lines changed

6 files changed

+191
-132
lines changed

spago.dhall

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
, "transformers"
2121
, "tuples"
2222
, "unicode"
23+
, "unsafe-coerce"
2324
]
2425
, packages = ./packages.dhall
2526
, sources = [ "src/**/*.purs", "test/**/*.purs" ]

src/Text/Parsing/Parser/Language.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ import Prelude
1212

1313
import Control.Alt ((<|>))
1414
import Text.Parsing.Parser (ParserT)
15-
import Text.Parsing.Parser.String (char, oneOf)
16-
import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser, alphaNum, letter)
15+
import Text.Parsing.Parser.String (char, oneOf, alphaNum, letter)
16+
import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser)
1717

1818
-----------------------------------------------------------
1919
-- Styles: haskellStyle, javaStyle

src/Text/Parsing/Parser/Pos.purs

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
module Text.Parsing.Parser.Pos where
22

33
import Prelude
4+
45
import Data.Generic.Rep (class Generic)
5-
import Data.Foldable (foldl)
6-
import Data.Newtype (wrap)
7-
import Data.String (split)
86

97
-- | `Position` represents the position of the parser in the input.
108
-- |
@@ -27,13 +25,3 @@ derive instance ordPosition :: Ord Position
2725
-- | The `Position` before any input has been parsed.
2826
initialPos :: Position
2927
initialPos = Position { line: 1, column: 1 }
30-
31-
-- | Updates a `Position` by adding the columns and lines in `String`.
32-
updatePosString :: Position -> String -> Position
33-
updatePosString pos' str = foldl updatePosChar pos' (split (wrap "") str)
34-
where
35-
updatePosChar (Position pos) c = case c of
36-
"\n" -> Position { line: pos.line + 1, column: 1 }
37-
"\r" -> Position { line: pos.line + 1, column: 1 }
38-
"\t" -> Position { line: pos.line, column: pos.column + 8 - ((pos.column - 1) `mod` 8) }
39-
_ -> Position { line: pos.line, column: pos.column + 1 }

src/Text/Parsing/Parser/String.purs

Lines changed: 159 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,92 +1,193 @@
11
-- | Primitive parsers for working with an input stream of type `String`.
2-
3-
module Text.Parsing.Parser.String where
2+
-- | In most JavaScript runtime environments, the `String` is encoded
3+
-- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
4+
-- | these primitive parsers should work with any runtime encoding.
5+
-- |
6+
-- | The primitive parsers which return `Char` will only succeed when the character
7+
-- | being parsed is a code point in the
8+
-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
9+
-- | (the “BMP”). These parsers can be convenient because of the good support
10+
-- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
11+
-- |
12+
-- | The other primitive parsers, which return `CodePoint` and `String` types,
13+
-- | can parse the full Unicode character set. All of the primitive parsers
14+
-- | in this module can be used together.
15+
module Text.Parsing.Parser.String
16+
( string
17+
, eof
18+
, anyChar
19+
, anyCodePoint
20+
, satisfy
21+
, satisfyCodePoint
22+
, char
23+
, whiteSpace
24+
, skipSpaces
25+
, oneOf
26+
, noneOf
27+
, match
28+
, digit
29+
, hexDigit
30+
, octDigit
31+
, upper
32+
, space
33+
, letter
34+
, alphaNum
35+
)
36+
where
437

538
import Prelude hiding (between)
639

7-
import Control.Monad.State (gets, modify_)
8-
import Data.Array (many)
9-
import Data.Foldable (elem, notElem)
40+
import Control.Monad.State (get, put)
41+
import Data.Array (notElem)
42+
import Data.Char (fromCharCode)
43+
import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
44+
import Data.Foldable (elem)
1045
import Data.Maybe (Maybe(..))
11-
import Data.Newtype (wrap)
12-
import Data.String (Pattern)
13-
import Data.String as S
46+
import Data.String (CodePoint, Pattern(..), codePointFromChar, null, stripPrefix, uncons)
1447
import Data.String.CodeUnits as SCU
48+
import Data.Tuple (Tuple(..), fst)
1549
import Text.Parsing.Parser (ParseState(..), ParserT, fail)
16-
import Text.Parsing.Parser.Combinators (tryRethrow, (<?>))
17-
import Text.Parsing.Parser.Pos (updatePosString)
18-
19-
-- | This class exists to abstract over streams which support the string-like
20-
-- | operations which this modules needs.
21-
class StringLike s where
22-
drop :: Int -> s -> s
23-
stripPrefix :: Pattern -> s -> Maybe s
24-
null :: s -> Boolean
25-
uncons :: s -> Maybe { head :: Char, tail :: s }
26-
27-
instance stringLikeString :: StringLike String where
28-
uncons = SCU.uncons
29-
drop = S.drop
30-
stripPrefix = S.stripPrefix
31-
null = S.null
50+
import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>))
51+
import Text.Parsing.Parser.Pos (Position(..))
52+
import Unsafe.Coerce (unsafeCoerce)
3253

3354
-- | Match end-of-file.
34-
eof :: forall s m. StringLike s => Monad m => ParserT s m Unit
55+
eof :: forall m. Monad m => ParserT String m Unit
3556
eof = do
36-
input <- gets \(ParseState input _ _) -> input
57+
ParseState input _ _ <- get
3758
unless (null input) (fail "Expected EOF")
3859

3960
-- | Match the specified string.
40-
string :: forall s m. StringLike s => Monad m => String -> ParserT s m String
61+
string :: forall m. Monad m => String -> ParserT String m String
4162
string str = do
42-
input <- gets \(ParseState input _ _) -> input
43-
case stripPrefix (wrap str) input of
63+
ParseState input position _ <- get
64+
case stripPrefix (Pattern str) input of
4465
Just remainder -> do
45-
modify_ \(ParseState _ position _) ->
46-
ParseState remainder
47-
(updatePosString position str)
48-
true
66+
put $ ParseState remainder (updatePosString position str) true
4967
pure str
5068
_ -> fail ("Expected " <> show str)
5169

52-
-- | Match any character.
53-
anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char
54-
anyChar = do
55-
input <- gets \(ParseState input _ _) -> input
70+
-- | Match any BMP `Char`.
71+
-- | Parser will fail if the character is not in the Basic Multilingual Plane.
72+
anyChar :: forall m. Monad m => ParserT String m Char
73+
anyChar = tryRethrow do
74+
cp :: Int <- unsafeCoerce <$> anyCodePoint
75+
-- the `fromCharCode` function doesn't check if this is beyond the
76+
-- BMP, so we check that ourselves.
77+
-- https://github.com/purescript/purescript-strings/issues/153
78+
if cp > 65535 -- BMP
79+
then fail "Not a Char"
80+
else case fromCharCode cp of
81+
Nothing -> fail "Not a Char"
82+
Just c -> pure c
83+
84+
-- | Match any Unicode character.
85+
-- | Always succeeds.
86+
anyCodePoint :: forall m. Monad m => ParserT String m CodePoint
87+
anyCodePoint = do
88+
-- ParseState input _ _ <- get
89+
ParseState input position _ <- get
5690
case uncons input of
5791
Nothing -> fail "Unexpected EOF"
5892
Just { head, tail } -> do
59-
modify_ \(ParseState _ position _) ->
60-
ParseState tail
61-
(updatePosString position (SCU.singleton head))
62-
true
93+
put $ ParseState tail (updatePosSingle position head) true
6394
pure head
6495

65-
-- | Match a character satisfying the specified predicate.
66-
satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char
96+
-- | Match a BMP `Char` satisfying the predicate.
97+
satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char
6798
satisfy f = tryRethrow do
6899
c <- anyChar
69-
if f c then pure c
70-
else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate"
100+
if f c
101+
then pure c
102+
else fail "Predicate unsatisfied"
71103

72-
-- | Match the specified character
73-
char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char
104+
-- | Match a Unicode character satisfying the predicate.
105+
satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint
106+
satisfyCodePoint f = tryRethrow do
107+
c <- anyCodePoint
108+
if f c
109+
then pure c
110+
else fail "Predicate unsatisfied"
111+
112+
-- | Match the specified BMP `Char`.
113+
char :: forall m. Monad m => Char -> ParserT String m Char
74114
char c = satisfy (_ == c) <?> show c
75115

76-
-- | Match zero or more whitespace characters.
77-
whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String
78-
whiteSpace = do
79-
cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t'
80-
pure $ SCU.fromCharArray cs
116+
-- | Match zero or more whitespace characters satisfying
117+
-- | `Data.CodePoint.Unicode.isSpace`.
118+
whiteSpace :: forall m. Monad m => ParserT String m String
119+
whiteSpace = fst <$> match skipSpaces
81120

82121
-- | Skip whitespace characters.
83-
skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit
84-
skipSpaces = void whiteSpace
122+
skipSpaces :: forall m. Monad m => ParserT String m Unit
123+
skipSpaces = skipMany (satisfyCodePoint isSpace)
85124

86-
-- | Match one of the characters in the array.
87-
oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
125+
-- | Match one of the BMP `Char`s in the array.
126+
oneOf :: forall m. Monad m => Array Char -> ParserT String m Char
88127
oneOf ss = satisfy (flip elem ss) <?> ("one of " <> show ss)
89128

90-
-- | Match any character not in the array.
91-
noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
129+
-- | Match any BMP `Char` not in the array.
130+
noneOf :: forall m. Monad m => Array Char -> ParserT String m Char
92131
noneOf ss = satisfy (flip notElem ss) <?> ("none of " <> show ss)
132+
133+
-- | Updates a `Position` by adding the columns and lines in `String`.
134+
updatePosString :: Position -> String -> Position
135+
updatePosString pos str = case uncons str of
136+
Nothing -> pos
137+
Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
138+
139+
-- | Updates a `Position` by adding the columns and lines in a
140+
-- | single `CodePoint`.
141+
updatePosSingle :: Position -> CodePoint -> Position
142+
updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
143+
10 -> Position { line: line + 1, column: 1 } -- "\n"
144+
13 -> Position { line: line + 1, column: 1 } -- "\r"
145+
9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
146+
_ -> Position { line, column: column + 1 }
147+
148+
-- | Combinator which returns both the result of a parse and the portion of
149+
-- | the input that was consumed while it was being parsed.
150+
match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a)
151+
match p = do
152+
ParseState input1 _ _ <- get
153+
x <- p
154+
ParseState input2 _ _ <- get
155+
-- We use the `SCU.length`, which is in units of “code units”
156+
-- instead of `Data.String.length`. which is in units of “code points”.
157+
-- This is more efficient, and it will be correct as long as we can assume
158+
-- the invariant that the `ParseState input` always begins on a code point
159+
-- boundary.
160+
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
161+
162+
-- Helper function.
163+
satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
164+
satisfyCP p = satisfy (p <<< codePointFromChar)
165+
166+
-- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
167+
digit :: forall m . Monad m => ParserT String m Char
168+
digit = satisfyCP isDecDigit <?> "digit"
169+
170+
-- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
171+
hexDigit :: forall m . Monad m => ParserT String m Char
172+
hexDigit = satisfyCP isHexDigit <?> "hex digit"
173+
174+
-- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
175+
octDigit :: forall m . Monad m => ParserT String m Char
176+
octDigit = satisfyCP isOctDigit <?> "oct digit"
177+
178+
-- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
179+
upper :: forall m . Monad m => ParserT String m Char
180+
upper = satisfyCP isUpper <?> "uppercase letter"
181+
182+
-- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
183+
space :: forall m . Monad m => ParserT String m Char
184+
space = satisfyCP isSpace <?> "space"
185+
186+
-- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
187+
letter :: forall m . Monad m => ParserT String m Char
188+
letter = satisfyCP isAlpha <?> "letter"
189+
190+
-- | Parse an alphabetical or numerical character.
191+
-- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
192+
alphaNum :: forall m . Monad m => ParserT String m Char
193+
alphaNum = satisfyCP isAlphaNum <?> "letter or digit"

src/Text/Parsing/Parser/Token.purs

Lines changed: 7 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,17 @@ module Text.Parsing.Parser.Token
1010
, TokenParser
1111
, GenTokenParser
1212
, makeTokenParser
13-
-- should these be exported? Maybe they should go in a different module?
14-
, digit
15-
, hexDigit
16-
, octDigit
17-
, upper
18-
, space
19-
, letter
20-
, alphaNum
2113
)
2214
where
2315

24-
import Prelude hiding (when,between)
16+
import Prelude hiding (when, between)
2517

2618
import Control.Lazy (fix)
2719
import Control.Monad.State (gets, modify_)
2820
import Control.MonadPlus (guard, (<|>))
2921
import Data.Array as Array
30-
import Data.String.CodeUnits (toChar, singleton) as CodeUnits
31-
import Data.String.CodePoints (CodePoint, codePointFromChar)
3222
import Data.Char (fromCharCode, toCharCode)
33-
import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper, hexDigitToInt)
34-
import Data.String.Unicode as Unicode
23+
import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isSpace)
3524
import Data.Either (Either(..))
3625
import Data.Foldable (foldl, foldr)
3726
import Data.Identity (Identity)
@@ -41,13 +30,16 @@ import Data.List as List
4130
import Data.List.NonEmpty (NonEmptyList)
4231
import Data.Maybe (Maybe(..), maybe)
4332
import Data.String (null, toLower)
33+
import Data.String.CodePoints (codePointFromChar)
34+
import Data.String.CodeUnits (toChar, singleton) as CodeUnits
4435
import Data.String.CodeUnits as SCU
36+
import Data.String.Unicode as Unicode
4537
import Data.Tuple (Tuple(..))
4638
import Math (pow)
4739
import Text.Parsing.Parser (ParseState(..), ParserT, fail)
4840
import Text.Parsing.Parser.Combinators (skipMany1, try, tryRethrow, skipMany, notFollowedBy, option, choice, between, sepBy1, sepBy, (<?>), (<??>))
4941
import Text.Parsing.Parser.Pos (Position)
50-
import Text.Parsing.Parser.String (satisfy, oneOf, noneOf, string, char)
42+
import Text.Parsing.Parser.String (char, digit, hexDigit, noneOf, octDigit, oneOf, satisfy, satisfyCodePoint, space, string, upper)
5143

5244
-- | Create a parser which Returns the first token in the stream.
5345
token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a
@@ -746,7 +738,7 @@ whiteSpace' langDef@(LanguageDef languageDef)
746738
skipMany (simpleSpace <|> oneLineComment langDef <|> multiLineComment langDef <?> "")
747739

748740
simpleSpace :: forall m . Monad m => ParserT String m Unit
749-
simpleSpace = skipMany1 (satisfyCP isSpace)
741+
simpleSpace = skipMany1 (satisfyCodePoint isSpace)
750742

751743
oneLineComment :: forall m . Monad m => GenLanguageDef String m -> ParserT String m Unit
752744
oneLineComment (LanguageDef languageDef) =
@@ -780,39 +772,3 @@ inCommentSingle (LanguageDef languageDef) =
780772
where
781773
startEnd :: Array Char
782774
startEnd = SCU.toCharArray languageDef.commentEnd <> SCU.toCharArray languageDef.commentStart
783-
784-
-------------------------------------------------------------------------
785-
-- Helper functions that should maybe go in Text.Parsing.Parser.String --
786-
-------------------------------------------------------------------------
787-
788-
satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
789-
satisfyCP p = satisfy (p <<< codePointFromChar)
790-
791-
-- | Parse a digit. Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
792-
digit :: forall m . Monad m => ParserT String m Char
793-
digit = satisfyCP isDecDigit <?> "digit"
794-
795-
-- | Parse a hex digit. Matches any char that satisfies `Data.CodePoint.Unicode.isHexDigit`.
796-
hexDigit :: forall m . Monad m => ParserT String m Char
797-
hexDigit = satisfyCP isHexDigit <?> "hex digit"
798-
799-
-- | Parse an octal digit. Matches any char that satisfies `Data.CodePoint.Unicode.isOctDigit`.
800-
octDigit :: forall m . Monad m => ParserT String m Char
801-
octDigit = satisfyCP isOctDigit <?> "oct digit"
802-
803-
-- | Parse an uppercase letter. Matches any char that satisfies `Data.CodePoint.Unicode.isUpper`.
804-
upper :: forall m . Monad m => ParserT String m Char
805-
upper = satisfyCP isUpper <?> "uppercase letter"
806-
807-
-- | Parse a space character. Matches any char that satisfies `Data.CodePoint.Unicode.isSpace`.
808-
space :: forall m . Monad m => ParserT String m Char
809-
space = satisfyCP isSpace <?> "space"
810-
811-
-- | Parse an alphabetical character. Matches any char that satisfies `Data.CodePoint.Unicode.isAlpha`.
812-
letter :: forall m . Monad m => ParserT String m Char
813-
letter = satisfyCP isAlpha <?> "letter"
814-
815-
-- | Parse an alphabetical or numerical character.
816-
-- | Matches any char that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
817-
alphaNum :: forall m . Monad m => ParserT String m Char
818-
alphaNum = satisfyCP isAlphaNum <?> "letter or digit"

0 commit comments

Comments
 (0)