@@ -157,6 +157,19 @@ extension Source {
157
157
return . init( start ..< currentPosition)
158
158
}
159
159
160
+ /// Attempt to eat a given prefix that satisfies a given predicate, with the
161
+ /// source location recorded.
162
+ mutating func tryEatLocatedPrefix(
163
+ maxLength: Int ? = nil ,
164
+ _ f: ( Char ) -> Bool
165
+ ) -> Located < String > ? {
166
+ let result = recordLoc { src in
167
+ src. tryEatPrefix ( maxLength: maxLength, f)
168
+ }
169
+ guard let result = result else { return nil }
170
+ return result. map ( \. string)
171
+ }
172
+
160
173
/// Throws an expected ASCII character error if not matched
161
174
mutating func expectASCII( ) throws -> Located < Character > {
162
175
try recordLoc { src in
@@ -217,13 +230,13 @@ extension Source {
217
230
/// return the scalar value, or throw an error if the string is malformed or
218
231
/// would overflow the scalar.
219
232
private static func validateUnicodeScalar(
220
- _ str: String , _ kind: RadixKind
221
- ) throws -> Unicode . Scalar {
222
- let num = try validateNumber ( str, UInt32 . self, kind)
233
+ _ str: Source . Located < String > , _ kind: RadixKind
234
+ ) throws -> AST . Atom . Scalar {
235
+ let num = try validateNumber ( str. value , UInt32 . self, kind)
223
236
guard let scalar = Unicode . Scalar ( num) else {
224
237
throw ParseError . misc ( " Invalid scalar value U+ \( num. hexStr) " )
225
238
}
226
- return scalar
239
+ return . init ( scalar, str . location )
227
240
}
228
241
229
242
/// Try to eat a number of a particular type and radix off the front.
@@ -266,20 +279,65 @@ extension Source {
266
279
/// Eat a scalar value from hexadecimal notation off the front
267
280
private mutating func expectUnicodeScalar(
268
281
numDigits: Int
269
- ) throws -> Located < Unicode . Scalar > {
270
- try recordLoc { src in
282
+ ) throws -> AST . Atom . Scalar {
283
+ let str = try recordLoc { src -> String in
271
284
let str = src. eat ( upToCount: numDigits) . string
272
285
guard str. count == numDigits else {
273
286
throw ParseError . expectedNumDigits ( str, numDigits)
274
287
}
275
- return try Source . validateUnicodeScalar ( str, . hex )
288
+ return str
276
289
}
290
+ return try Source . validateUnicodeScalar ( str, . hex)
291
+ }
292
+
293
+ /// Try to lex a sequence of hex digit unicode scalars.
294
+ ///
295
+ /// UniScalarSequence -> Whitespace? UniScalarSequenceElt+
296
+ /// UniScalarSequenceElt -> HexDigit{1...} Whitespace?
297
+ ///
298
+ mutating func expectUnicodeScalarSequence(
299
+ eating ending: Character
300
+ ) throws -> AST . Atom . Kind {
301
+ try recordLoc { src in
302
+ var scalars = [ AST . Atom. Scalar] ( )
303
+ var trivia = [ AST . Trivia] ( )
304
+
305
+ // Eat up any leading whitespace.
306
+ if let t = src. lexWhitespace ( ) { trivia. append ( t) }
307
+
308
+ while true {
309
+ let str = src. lexUntil { src in
310
+ // Hit the ending, stop lexing.
311
+ if src. isEmpty || src. peek ( ) == ending {
312
+ return true
313
+ }
314
+ // Eat up trailing whitespace, and stop lexing to record the scalar.
315
+ if let t = src. lexWhitespace ( ) {
316
+ trivia. append ( t)
317
+ return true
318
+ }
319
+ // Not the ending or trivia, must be a digit of the scalar.
320
+ return false
321
+ }
322
+ guard !str. value. isEmpty else { break }
323
+ scalars. append ( try Source . validateUnicodeScalar ( str, . hex) )
324
+ }
325
+ guard !scalars. isEmpty else {
326
+ throw ParseError . expectedNumber ( " " , kind: . hex)
327
+ }
328
+ try src. expect ( ending)
329
+
330
+ if scalars. count == 1 {
331
+ return . scalar( scalars [ 0 ] )
332
+ }
333
+ return . scalarSequence( . init( scalars, trivia: trivia) )
334
+ } . value
277
335
}
278
336
279
337
/// Eat a scalar off the front, starting from after the
280
338
/// backslash and base character (e.g. `\u` or `\x`).
281
339
///
282
- /// UniScalar -> 'u{' HexDigit{1...} '}'
340
+ /// UniScalar -> 'u{' UniScalarSequence '}'
283
341
/// | 'u' HexDigit{4}
284
342
/// | 'x{' HexDigit{1...} '}'
285
343
/// | 'x' HexDigit{0...2}
@@ -289,49 +347,60 @@ extension Source {
289
347
///
290
348
mutating func expectUnicodeScalar(
291
349
escapedCharacter base: Character
292
- ) throws -> Located < Unicode . Scalar > {
350
+ ) throws -> AST . Atom . Kind {
293
351
try recordLoc { src in
352
+
353
+ func nullScalar( ) -> AST . Atom . Kind {
354
+ let pos = src. currentPosition
355
+ return . scalar( . init( UnicodeScalar ( 0 ) , SourceLocation ( pos ..< pos) ) )
356
+ }
357
+
294
358
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
295
359
switch base {
296
360
// Hex numbers.
297
- case " u " where src. tryEat ( " { " ) , " x " where src. tryEat ( " { " ) :
298
- let str = try src. lexUntil ( eating: " } " ) . value
299
- return try Source . validateUnicodeScalar ( str, . hex)
361
+ case " u " where src. tryEat ( " { " ) :
362
+ return try src. expectUnicodeScalarSequence ( eating: " } " )
363
+
364
+ case " x " where src. tryEat ( " { " ) :
365
+ let str = try src. lexUntil ( eating: " } " )
366
+ return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
300
367
301
368
case " x " :
302
369
// \x expects *up to* 2 digits.
303
- guard let digits = src. tryEatPrefix ( maxLength: 2 , \. isHexDigit) else {
370
+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 2 , \. isHexDigit)
371
+ else {
304
372
// In PCRE, \x without any valid hex digits is \u{0}.
305
373
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
306
374
// could be changed to throw an error if we had a parsing mode for
307
375
// them.
308
- return Unicode . Scalar ( 0 )
376
+ return nullScalar ( )
309
377
}
310
- return try Source . validateUnicodeScalar ( digits. string , . hex)
378
+ return . scalar ( try Source . validateUnicodeScalar ( digits, . hex) )
311
379
312
380
case " u " :
313
- return try src. expectUnicodeScalar ( numDigits: 4 ) . value
381
+ return . scalar ( try src. expectUnicodeScalar ( numDigits: 4 ) )
314
382
case " U " :
315
- return try src. expectUnicodeScalar ( numDigits: 8 ) . value
383
+ return . scalar ( try src. expectUnicodeScalar ( numDigits: 8 ) )
316
384
317
385
// Octal numbers.
318
386
case " o " where src. tryEat ( " { " ) :
319
- let str = try src. lexUntil ( eating: " } " ) . value
320
- return try Source . validateUnicodeScalar ( str, . octal)
387
+ let str = try src. lexUntil ( eating: " } " )
388
+ return . scalar ( try Source . validateUnicodeScalar ( str, . octal) )
321
389
322
390
case " 0 " :
323
391
// We can read *up to* 3 more octal digits.
324
392
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
325
393
// PCRE mode, we should limit it here.
326
- guard let digits = src. tryEatPrefix ( maxLength: 3 , \. isOctalDigit) else {
327
- return Unicode . Scalar ( 0 )
394
+ guard let digits = src. tryEatLocatedPrefix ( maxLength: 3 , \. isOctalDigit)
395
+ else {
396
+ return nullScalar ( )
328
397
}
329
- return try Source . validateUnicodeScalar ( digits. string , . octal)
398
+ return . scalar ( try Source . validateUnicodeScalar ( digits, . octal) )
330
399
331
400
default :
332
401
fatalError ( " Unexpected scalar start " )
333
402
}
334
- }
403
+ } . value
335
404
}
336
405
337
406
/// Try to consume a quantifier
@@ -434,13 +503,22 @@ extension Source {
434
503
private mutating func lexUntil(
435
504
_ predicate: ( inout Source ) throws -> Bool
436
505
) rethrows -> Located < String > {
506
+ // We track locations outside of recordLoc, as the predicate may advance the
507
+ // input when we hit the end, and we don't want that to affect the location
508
+ // of what was lexed in the `result`. We still want the recordLoc call to
509
+ // attach locations to any thrown errors though.
510
+ // TODO: We should find a better way of doing this, `lexUntil` seems full
511
+ // of footguns.
512
+ let start = currentPosition
513
+ var end = currentPosition
514
+ var result = " "
437
515
try recordLoc { src in
438
- var result = " "
439
516
while try ! predicate( & src) {
440
517
result. append ( src. eat ( ) )
518
+ end = src. currentPosition
441
519
}
442
- return result
443
520
}
521
+ return . init( result, start ..< end)
444
522
}
445
523
446
524
private mutating func lexUntil( eating end: String ) throws -> Located < String > {
@@ -576,6 +654,16 @@ extension Source {
576
654
// inside a custom character class (and only treats whitespace as
577
655
// non-semantic there for the extra-extended `(?xx)` mode). If we get a
578
656
// strict-PCRE mode, we'll need to add a case for that.
657
+ return lexWhitespace ( )
658
+ }
659
+
660
+ /// Try to consume whitespace as trivia
661
+ ///
662
+ /// Whitespace -> WhitespaceChar+
663
+ ///
664
+ /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex
665
+ /// whitespace.
666
+ mutating func lexWhitespace( ) -> AST . Trivia ? {
579
667
let trivia : Located < String > ? = recordLoc { src in
580
668
src. tryEatPrefix ( \. isPatternWhitespace) ? . string
581
669
}
@@ -1153,7 +1241,7 @@ extension Source {
1153
1241
1154
1242
// We should either have a unicode scalar.
1155
1243
if src. tryEat ( sequence: " U+ " ) {
1156
- let str = try src. lexUntil ( eating: " } " ) . value
1244
+ let str = try src. lexUntil ( eating: " } " )
1157
1245
return . scalar( try Source . validateUnicodeScalar ( str, . hex) )
1158
1246
}
1159
1247
@@ -1581,8 +1669,7 @@ extension Source {
1581
1669
switch char {
1582
1670
// Hexadecimal and octal unicode scalars.
1583
1671
case " u " , " x " , " U " , " o " , " 0 " :
1584
- return try . scalar(
1585
- src. expectUnicodeScalar ( escapedCharacter: char) . value)
1672
+ return try src. expectUnicodeScalar ( escapedCharacter: char)
1586
1673
default :
1587
1674
break
1588
1675
}
0 commit comments