Skip to content

Commit b8291d8

Browse files
committed
Unicode correctness
Correctly handle UTF-16 surrogate pairs in `String`s. We are not quite making the default `CodePoint`, as was discussed in #76 (comment) . Rather we are keeping most of the current API and making it work properly with astral Unicode. We keep the `Char` parsers for ergonomic reasons. For example the parser `char :: forall s m. Monad m => Char -> ParserT s m Char`. This parser is usually called with a literal like `char 'a'`. It would be annoying to call this parser with `char (codePointFromChar 'a')`. Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing `CodePoint`s. To make this library handle Unicode correctly, it is necessary to delete the `StringLike` class. `StringLike` has no laws, and during the five years of its life, no-one on Github has ever written another instance of `StringLike`. https://github.com/search?l=&q=StringLike+language%3APureScript&type=code Move `updatePosString` to the `Text.Parsing.Parser.String` module and don't export it. Add the `match` combinator. Change the definition of `whiteSpace` and `skipSpaces` to `Data.CodePoint.Unicode.isSpace`. Move the character class parsers from `Text.Parsing.Parser.Token` module into the `Text.Parsing.Parser.String` module. All prior tests pass with no modifications. Add a few new tests.
1 parent 61db534 commit b8291d8

File tree

6 files changed

+187
-122
lines changed

6 files changed

+187
-122
lines changed

spago.dhall

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
, "transformers"
2121
, "tuples"
2222
, "unicode"
23+
, "unsafe-coerce"
2324
]
2425
, packages = ./packages.dhall
2526
, sources = [ "src/**/*.purs", "test/**/*.purs" ]

src/Text/Parsing/Parser/Language.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@ import Prelude
1212

1313
import Control.Alt ((<|>))
1414
import Text.Parsing.Parser (ParserT)
15-
import Text.Parsing.Parser.String (char, oneOf)
16-
import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser, alphaNum, letter)
15+
import Text.Parsing.Parser.String (char, oneOf, alphaNum, letter)
16+
import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser)
1717

1818
-----------------------------------------------------------
1919
-- Styles: haskellStyle, javaStyle

src/Text/Parsing/Parser/Pos.purs

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
module Text.Parsing.Parser.Pos where
22

33
import Prelude
4+
45
import Data.Generic.Rep (class Generic)
5-
import Data.Foldable (foldl)
6-
import Data.Newtype (wrap)
7-
import Data.String (split)
86

97
-- | `Position` represents the position of the parser in the input.
108
-- |
@@ -27,13 +25,3 @@ derive instance ordPosition :: Ord Position
2725
-- | The `Position` before any input has been parsed.
2826
initialPos :: Position
2927
initialPos = Position { line: 1, column: 1 }
30-
31-
-- | Updates a `Position` by adding the columns and lines in `String`.
32-
updatePosString :: Position -> String -> Position
33-
updatePosString pos' str = foldl updatePosChar pos' (split (wrap "") str)
34-
where
35-
updatePosChar (Position pos) c = case c of
36-
"\n" -> Position { line: pos.line + 1, column: 1 }
37-
"\r" -> Position { line: pos.line + 1, column: 1 }
38-
"\t" -> Position { line: pos.line, column: pos.column + 8 - ((pos.column - 1) `mod` 8) }
39-
_ -> Position { line: pos.line, column: pos.column + 1 }

src/Text/Parsing/Parser/String.purs

Lines changed: 155 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,67 @@
11
-- | Primitive parsers for working with an input stream of type `String`.
2-
3-
module Text.Parsing.Parser.String where
2+
-- |
3+
-- | These primitive parsers all operate on primitive `String` inputs.
4+
-- | In most JavaScript runtime environments, the `String` is encoded
5+
-- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
6+
-- | these primitive parsers should work with any runtime encoding.
7+
-- |
8+
-- | The primitive parsers which return `Char` will only succeed when the character
9+
-- | being parsed is a code point in the
10+
-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
11+
-- | (the “BMP”). These parsers can be convenient because of the good support
12+
-- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
13+
-- |
14+
-- | The other primitive parsers, which return `CodePoint` and `String` types,
15+
-- | can parse the full Unicode character set. All of the primitive parsers
16+
-- | in this module can be used together.
17+
module Text.Parsing.Parser.String
18+
( string
19+
, eof
20+
, anyChar
21+
, anyCodePoint
22+
, satisfy
23+
, satisfyCodePoint
24+
, char
25+
, whiteSpace
26+
, skipSpaces
27+
, oneOf
28+
, noneOf
29+
, match
30+
, digit
31+
, hexDigit
32+
, octDigit
33+
, upper
34+
, space
35+
, letter
36+
, alphaNum
37+
)
38+
where
439

540
import Prelude hiding (between)
641

7-
import Control.Monad.State (gets, modify_)
8-
import Data.Array (many)
9-
import Data.Foldable (elem, notElem)
42+
import Control.Monad.State (get, gets, modify_)
43+
import Data.Array (notElem)
44+
import Data.Char (fromCharCode)
45+
import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
46+
import Data.Foldable (elem)
1047
import Data.Maybe (Maybe(..))
1148
import Data.Newtype (wrap)
12-
import Data.String (Pattern)
13-
import Data.String as S
49+
import Data.String (CodePoint, codePointFromChar, null, stripPrefix, uncons)
1450
import Data.String.CodeUnits as SCU
51+
import Data.Tuple (Tuple(..), fst)
1552
import Text.Parsing.Parser (ParseState(..), ParserT, fail)
16-
import Text.Parsing.Parser.Combinators (tryRethrow, (<?>))
17-
import Text.Parsing.Parser.Pos (updatePosString)
18-
19-
-- | This class exists to abstract over streams which support the string-like
20-
-- | operations which this modules needs.
21-
class StringLike s where
22-
drop :: Int -> s -> s
23-
stripPrefix :: Pattern -> s -> Maybe s
24-
null :: s -> Boolean
25-
uncons :: s -> Maybe { head :: Char, tail :: s }
26-
27-
instance stringLikeString :: StringLike String where
28-
uncons = SCU.uncons
29-
drop = S.drop
30-
stripPrefix = S.stripPrefix
31-
null = S.null
53+
import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>))
54+
import Text.Parsing.Parser.Pos (Position(..))
55+
import Unsafe.Coerce (unsafeCoerce)
3256

3357
-- | Match end-of-file.
34-
eof :: forall s m. StringLike s => Monad m => ParserT s m Unit
58+
eof :: forall m. Monad m => ParserT String m Unit
3559
eof = do
3660
input <- gets \(ParseState input _ _) -> input
3761
unless (null input) (fail "Expected EOF")
3862

3963
-- | Match the specified string.
40-
string :: forall s m. StringLike s => Monad m => String -> ParserT s m String
64+
string :: forall m. Monad m => String -> ParserT String m String
4165
string str = do
4266
input <- gets \(ParseState input _ _) -> input
4367
case stripPrefix (wrap str) input of
@@ -49,44 +73,127 @@ string str = do
4973
pure str
5074
_ -> fail ("Expected " <> show str)
5175

52-
-- | Match any character.
53-
anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char
54-
anyChar = do
76+
-- | Match any BMP `Char`.
77+
-- | Parser will fail if the character is not in the Basic Multilingual Plane.
78+
anyChar :: forall m. Monad m => ParserT String m Char
79+
anyChar = tryRethrow do
80+
cp :: Int <- unsafeCoerce <$> anyCodePoint
81+
-- the `fromCharCode` function doesn't check if this is beyond the
82+
-- BMP, so we check that ourselves.
83+
-- https://github.com/purescript/purescript-strings/issues/153
84+
if cp > 65535 -- BMP
85+
then fail "Not a Char"
86+
else case fromCharCode cp of
87+
Nothing -> fail "Not a Char"
88+
Just c -> pure c
89+
90+
-- | Match any Unicode character.
91+
-- | Always succeeds.
92+
anyCodePoint :: forall m. Monad m => ParserT String m CodePoint
93+
anyCodePoint = do
5594
input <- gets \(ParseState input _ _) -> input
5695
case uncons input of
5796
Nothing -> fail "Unexpected EOF"
5897
Just { head, tail } -> do
5998
modify_ \(ParseState _ position _) ->
60-
ParseState tail
61-
(updatePosString position (SCU.singleton head))
62-
true
99+
ParseState tail (updatePosSingle position head) true
63100
pure head
64101

65-
-- | Match a character satisfying the specified predicate.
66-
satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char
102+
-- | Match a BMP `Char` satisfying the predicate.
103+
satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char
67104
satisfy f = tryRethrow do
68105
c <- anyChar
69-
if f c then pure c
70-
else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate"
106+
if f c
107+
then pure c
108+
else fail "Predicate unsatisfied"
109+
110+
-- | Match a Unicode character satisfying the predicate.
111+
satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint
112+
satisfyCodePoint f = tryRethrow do
113+
c <- anyCodePoint
114+
if f c
115+
then pure c
116+
else fail "Predicate unsatisfied"
71117

72-
-- | Match the specified character
73-
char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char
118+
-- | Match the specified BMP `Char`.
119+
char :: forall m. Monad m => Char -> ParserT String m Char
74120
char c = satisfy (_ == c) <?> show c
75121

76-
-- | Match zero or more whitespace characters.
77-
whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String
78-
whiteSpace = do
79-
cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t'
80-
pure $ SCU.fromCharArray cs
122+
-- | Match zero or more whitespace characters satisfying
123+
-- | `Data.CodePoint.Unicode.isSpace`.
124+
whiteSpace :: forall m. Monad m => ParserT String m String
125+
whiteSpace = fst <$> match skipSpaces
81126

82127
-- | Skip whitespace characters.
83-
skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit
84-
skipSpaces = void whiteSpace
128+
skipSpaces :: forall m. Monad m => ParserT String m Unit
129+
skipSpaces = skipMany (satisfyCodePoint isSpace)
85130

86-
-- | Match one of the characters in the array.
87-
oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
131+
-- | Match one of the BMP `Char`s in the array.
132+
oneOf :: forall m. Monad m => Array Char -> ParserT String m Char
88133
oneOf ss = satisfy (flip elem ss) <?> ("one of " <> show ss)
89134

90-
-- | Match any character not in the array.
91-
noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
135+
-- | Match any BMP `Char` not in the array.
136+
noneOf :: forall m. Monad m => Array Char -> ParserT String m Char
92137
noneOf ss = satisfy (flip notElem ss) <?> ("none of " <> show ss)
138+
139+
-- | Updates a `Position` by adding the columns and lines in `String`.
140+
updatePosString :: Position -> String -> Position
141+
updatePosString pos str = case uncons str of
142+
Nothing -> pos
143+
Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
144+
145+
-- | Updates a `Position` by adding the columns and lines in a
146+
-- | single `CodePoint`.
147+
updatePosSingle :: Position -> CodePoint -> Position
148+
updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
149+
10 -> Position { line: line + 1, column: 1 } -- "\n"
150+
13 -> Position { line: line + 1, column: 1 } -- "\r"
151+
9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
152+
_ -> Position { line, column: column + 1 }
153+
154+
-- | Combinator which returns both the result of a parse and the portion of
155+
-- | the input that was consumed while it was being parsed.
156+
match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a)
157+
match p = do
158+
ParseState input1 _ _ <- get
159+
x <- p
160+
ParseState input2 _ _ <- get
161+
-- We use the `SCU.length`, which is in units of “code units”
162+
-- instead of `Data.String.length`. which is in units of “code points”.
163+
-- This is more efficient, and it will be correct as long as we can assume
164+
-- the invariant that the `ParseState input` always begins on a code point
165+
-- boundary.
166+
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
167+
168+
-- Helper function.
169+
satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
170+
satisfyCP p = satisfy (p <<< codePointFromChar)
171+
172+
-- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
173+
digit :: forall m . Monad m => ParserT String m Char
174+
digit = satisfyCP isDecDigit <?> "digit"
175+
176+
-- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
177+
hexDigit :: forall m . Monad m => ParserT String m Char
178+
hexDigit = satisfyCP isHexDigit <?> "hex digit"
179+
180+
-- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
181+
octDigit :: forall m . Monad m => ParserT String m Char
182+
octDigit = satisfyCP isOctDigit <?> "oct digit"
183+
184+
-- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
185+
upper :: forall m . Monad m => ParserT String m Char
186+
upper = satisfyCP isUpper <?> "uppercase letter"
187+
188+
-- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
189+
space :: forall m . Monad m => ParserT String m Char
190+
space = satisfyCP isSpace <?> "space"
191+
192+
-- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
193+
letter :: forall m . Monad m => ParserT String m Char
194+
letter = satisfyCP isAlpha <?> "letter"
195+
196+
-- | Parse an alphabetical or numerical character.
197+
-- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
198+
alphaNum :: forall m . Monad m => ParserT String m Char
199+
alphaNum = satisfyCP isAlphaNum <?> "letter or digit"

src/Text/Parsing/Parser/Token.purs

Lines changed: 7 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -10,28 +10,17 @@ module Text.Parsing.Parser.Token
1010
, TokenParser
1111
, GenTokenParser
1212
, makeTokenParser
13-
-- should these be exported? Maybe they should go in a different module?
14-
, digit
15-
, hexDigit
16-
, octDigit
17-
, upper
18-
, space
19-
, letter
20-
, alphaNum
2113
)
2214
where
2315

24-
import Prelude hiding (when,between)
16+
import Prelude hiding (when, between)
2517

2618
import Control.Lazy (fix)
2719
import Control.Monad.State (gets, modify_)
2820
import Control.MonadPlus (guard, (<|>))
2921
import Data.Array as Array
30-
import Data.String.CodeUnits (toChar, singleton) as CodeUnits
31-
import Data.String.CodePoints (CodePoint, codePointFromChar)
3222
import Data.Char (fromCharCode, toCharCode)
33-
import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper, hexDigitToInt)
34-
import Data.String.Unicode as Unicode
23+
import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isSpace)
3524
import Data.Either (Either(..))
3625
import Data.Foldable (foldl, foldr)
3726
import Data.Identity (Identity)
@@ -41,13 +30,16 @@ import Data.List as List
4130
import Data.List.NonEmpty (NonEmptyList)
4231
import Data.Maybe (Maybe(..), maybe)
4332
import Data.String (null, toLower)
33+
import Data.String.CodePoints (codePointFromChar)
34+
import Data.String.CodeUnits (toChar, singleton) as CodeUnits
4435
import Data.String.CodeUnits as SCU
36+
import Data.String.Unicode as Unicode
4537
import Data.Tuple (Tuple(..))
4638
import Math (pow)
4739
import Text.Parsing.Parser (ParseState(..), ParserT, fail)
4840
import Text.Parsing.Parser.Combinators (skipMany1, try, tryRethrow, skipMany, notFollowedBy, option, choice, between, sepBy1, sepBy, (<?>), (<??>))
4941
import Text.Parsing.Parser.Pos (Position)
50-
import Text.Parsing.Parser.String (satisfy, oneOf, noneOf, string, char)
42+
import Text.Parsing.Parser.String (char, digit, hexDigit, noneOf, octDigit, oneOf, satisfy, satisfyCodePoint, space, string, upper)
5143

5244
-- | Create a parser which Returns the first token in the stream.
5345
token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a
@@ -746,7 +738,7 @@ whiteSpace' langDef@(LanguageDef languageDef)
746738
skipMany (simpleSpace <|> oneLineComment langDef <|> multiLineComment langDef <?> "")
747739

748740
simpleSpace :: forall m . Monad m => ParserT String m Unit
749-
simpleSpace = skipMany1 (satisfyCP isSpace)
741+
simpleSpace = skipMany1 (satisfyCodePoint isSpace)
750742

751743
oneLineComment :: forall m . Monad m => GenLanguageDef String m -> ParserT String m Unit
752744
oneLineComment (LanguageDef languageDef) =
@@ -780,39 +772,3 @@ inCommentSingle (LanguageDef languageDef) =
780772
where
781773
startEnd :: Array Char
782774
startEnd = SCU.toCharArray languageDef.commentEnd <> SCU.toCharArray languageDef.commentStart
783-
784-
-------------------------------------------------------------------------
785-
-- Helper functions that should maybe go in Text.Parsing.Parser.String --
786-
-------------------------------------------------------------------------
787-
788-
satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
789-
satisfyCP p = satisfy (p <<< codePointFromChar)
790-
791-
-- | Parse a digit. Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
792-
digit :: forall m . Monad m => ParserT String m Char
793-
digit = satisfyCP isDecDigit <?> "digit"
794-
795-
-- | Parse a hex digit. Matches any char that satisfies `Data.CodePoint.Unicode.isHexDigit`.
796-
hexDigit :: forall m . Monad m => ParserT String m Char
797-
hexDigit = satisfyCP isHexDigit <?> "hex digit"
798-
799-
-- | Parse an octal digit. Matches any char that satisfies `Data.CodePoint.Unicode.isOctDigit`.
800-
octDigit :: forall m . Monad m => ParserT String m Char
801-
octDigit = satisfyCP isOctDigit <?> "oct digit"
802-
803-
-- | Parse an uppercase letter. Matches any char that satisfies `Data.CodePoint.Unicode.isUpper`.
804-
upper :: forall m . Monad m => ParserT String m Char
805-
upper = satisfyCP isUpper <?> "uppercase letter"
806-
807-
-- | Parse a space character. Matches any char that satisfies `Data.CodePoint.Unicode.isSpace`.
808-
space :: forall m . Monad m => ParserT String m Char
809-
space = satisfyCP isSpace <?> "space"
810-
811-
-- | Parse an alphabetical character. Matches any char that satisfies `Data.CodePoint.Unicode.isAlpha`.
812-
letter :: forall m . Monad m => ParserT String m Char
813-
letter = satisfyCP isAlpha <?> "letter"
814-
815-
-- | Parse an alphabetical or numerical character.
816-
-- | Matches any char that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
817-
alphaNum :: forall m . Monad m => ParserT String m Char
818-
alphaNum = satisfyCP isAlphaNum <?> "letter or digit"

0 commit comments

Comments
 (0)