Skip to content

Commit 5b30c5b

Browse files
authored
Merge pull request #386 from hamishknight/multiscalar
2 parents c13980f + 0872d16 commit 5b30c5b

12 files changed

+357
-70
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,13 @@ extension AST {
2929
/// A Unicode scalar value written as a literal
3030
///
3131
/// \u{...}, \0dd, \x{...}, ...
32-
case scalar(Unicode.Scalar)
32+
case scalar(Scalar)
33+
34+
/// A whitespace-separated sequence of Unicode scalar values which are
35+
/// implicitly splatted out.
36+
///
37+
/// `\u{A B C}` -> `\u{A}\u{B}\u{C}`
38+
case scalarSequence(ScalarSequence)
3339

3440
/// A Unicode property, category, or script, including those written using
3541
/// POSIX syntax.
@@ -84,6 +90,7 @@ extension AST.Atom {
8490
switch kind {
8591
case .char(let v): return v
8692
case .scalar(let v): return v
93+
case .scalarSequence(let v): return v
8794
case .property(let v): return v
8895
case .escaped(let v): return v
8996
case .keyboardControl(let v): return v
@@ -106,6 +113,30 @@ extension AST.Atom {
106113
}
107114
}
108115

116+
extension AST.Atom {
117+
public struct Scalar: Hashable {
118+
public var value: UnicodeScalar
119+
public var location: SourceLocation
120+
121+
public init(_ value: UnicodeScalar, _ location: SourceLocation) {
122+
self.value = value
123+
self.location = location
124+
}
125+
}
126+
127+
public struct ScalarSequence: Hashable {
128+
public var scalars: [Scalar]
129+
public var trivia: [AST.Trivia]
130+
131+
public init(_ scalars: [Scalar], trivia: [AST.Trivia]) {
132+
precondition(scalars.count > 1, "Expected multiple scalars")
133+
self.scalars = scalars
134+
self.trivia = trivia
135+
}
136+
public var scalarValues: [Unicode.Scalar] { scalars.map(\.value) }
137+
}
138+
}
139+
109140
extension AST.Atom {
110141

111142
// TODO: We might scrap this and break out a few categories so
@@ -697,7 +728,7 @@ extension AST.Atom {
697728
case .char(let c):
698729
return c
699730
case .scalar(let s):
700-
return Character(s)
731+
return Character(s.value)
701732

702733
case .escaped(let c):
703734
return c.scalarValue.map(Character.init)
@@ -713,8 +744,9 @@ extension AST.Atom {
713744
// the AST? Or defer for the matching engine?
714745
return nil
715746

716-
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
717-
.callout, .backtrackingDirective, .changeMatchingOptions:
747+
case .scalarSequence, .property, .any, .startOfLine, .endOfLine,
748+
.backreference, .subpattern, .callout, .backtrackingDirective,
749+
.changeMatchingOptions:
718750
return nil
719751
}
720752
}
@@ -736,13 +768,21 @@ extension AST.Atom {
736768
/// A string literal representation of the atom, if possible.
737769
///
738770
/// Individual characters are returned as-is, and Unicode scalars are
739-
/// presented using "\u{nnnn}" syntax.
771+
/// presented using "\u{nn nn ...}" syntax.
740772
public var literalStringValue: String? {
773+
func scalarLiteral(_ u: [UnicodeScalar]) -> String {
774+
let digits = u.map { String($0.value, radix: 16, uppercase: true) }
775+
.joined(separator: " ")
776+
return "\\u{\(digits)}"
777+
}
741778
switch kind {
742779
case .char(let c):
743780
return String(c)
744781
case .scalar(let s):
745-
return "\\u{\(String(s.value, radix: 16, uppercase: true))}"
782+
return scalarLiteral([s.value])
783+
784+
case .scalarSequence(let s):
785+
return scalarLiteral(s.scalarValues)
746786

747787
case .keyboardControl(let x):
748788
return "\\C-\(x)"

Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ extension Source {
1818
// This follows the rules provided by UAX44-LM3, including trying to drop an
1919
// "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
2020
// consistency with other engines and the Unicode.Scalar.Properties names.
21-
let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" }
21+
let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" }
2222
.lowercased()
2323
if let m = match(str) {
2424
return m

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,17 @@ extension ParseError: CustomStringConvertible {
105105
case let .expectedNumDigits(s, i):
106106
return "expected \(i) digits in '\(s)'"
107107
case let .expectedNumber(s, kind: kind):
108-
let radix: String
109-
if kind == .decimal {
110-
radix = ""
111-
} else {
112-
radix = " of radix \(kind.radix)"
108+
let number: String
109+
switch kind {
110+
case .octal:
111+
number = "octal number"
112+
case .decimal:
113+
number = "number"
114+
case .hex:
115+
number = "hexadecimal number"
113116
}
114-
return "expected a numbers in '\(s)'\(radix)"
117+
let suffix = s.isEmpty ? "" : " in '\(s)'"
118+
return "expected \(number)\(suffix)"
115119
case let .expected(s):
116120
return "expected '\(s)'"
117121
case .unexpectedEndOfInput:

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 115 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,19 @@ extension Source {
157157
return .init(start ..< currentPosition)
158158
}
159159

160+
/// Attempt to eat a given prefix that satisfies a given predicate, with the
161+
/// source location recorded.
162+
mutating func tryEatLocatedPrefix(
163+
maxLength: Int? = nil,
164+
_ f: (Char) -> Bool
165+
) -> Located<String>? {
166+
let result = recordLoc { src in
167+
src.tryEatPrefix(maxLength: maxLength, f)
168+
}
169+
guard let result = result else { return nil }
170+
return result.map(\.string)
171+
}
172+
160173
/// Throws an expected ASCII character error if not matched
161174
mutating func expectASCII() throws -> Located<Character> {
162175
try recordLoc { src in
@@ -217,13 +230,13 @@ extension Source {
217230
/// return the scalar value, or throw an error if the string is malformed or
218231
/// would overflow the scalar.
219232
private static func validateUnicodeScalar(
220-
_ str: String, _ kind: RadixKind
221-
) throws -> Unicode.Scalar {
222-
let num = try validateNumber(str, UInt32.self, kind)
233+
_ str: Source.Located<String>, _ kind: RadixKind
234+
) throws -> AST.Atom.Scalar {
235+
let num = try validateNumber(str.value, UInt32.self, kind)
223236
guard let scalar = Unicode.Scalar(num) else {
224237
throw ParseError.misc("Invalid scalar value U+\(num.hexStr)")
225238
}
226-
return scalar
239+
return .init(scalar, str.location)
227240
}
228241

229242
/// Try to eat a number of a particular type and radix off the front.
@@ -266,20 +279,65 @@ extension Source {
266279
/// Eat a scalar value from hexadecimal notation off the front
267280
private mutating func expectUnicodeScalar(
268281
numDigits: Int
269-
) throws -> Located<Unicode.Scalar> {
270-
try recordLoc { src in
282+
) throws -> AST.Atom.Scalar {
283+
let str = try recordLoc { src -> String in
271284
let str = src.eat(upToCount: numDigits).string
272285
guard str.count == numDigits else {
273286
throw ParseError.expectedNumDigits(str, numDigits)
274287
}
275-
return try Source.validateUnicodeScalar(str, .hex)
288+
return str
276289
}
290+
return try Source.validateUnicodeScalar(str, .hex)
291+
}
292+
293+
/// Try to lex a sequence of hex digit unicode scalars.
294+
///
295+
/// UniScalarSequence -> Whitespace? UniScalarSequenceElt+
296+
/// UniScalarSequenceElt -> HexDigit{1...} Whitespace?
297+
///
298+
mutating func expectUnicodeScalarSequence(
299+
eating ending: Character
300+
) throws -> AST.Atom.Kind {
301+
try recordLoc { src in
302+
var scalars = [AST.Atom.Scalar]()
303+
var trivia = [AST.Trivia]()
304+
305+
// Eat up any leading whitespace.
306+
if let t = src.lexWhitespace() { trivia.append(t) }
307+
308+
while true {
309+
let str = src.lexUntil { src in
310+
// Hit the ending, stop lexing.
311+
if src.isEmpty || src.peek() == ending {
312+
return true
313+
}
314+
// Eat up trailing whitespace, and stop lexing to record the scalar.
315+
if let t = src.lexWhitespace() {
316+
trivia.append(t)
317+
return true
318+
}
319+
// Not the ending or trivia, must be a digit of the scalar.
320+
return false
321+
}
322+
guard !str.value.isEmpty else { break }
323+
scalars.append(try Source.validateUnicodeScalar(str, .hex))
324+
}
325+
guard !scalars.isEmpty else {
326+
throw ParseError.expectedNumber("", kind: .hex)
327+
}
328+
try src.expect(ending)
329+
330+
if scalars.count == 1 {
331+
return .scalar(scalars[0])
332+
}
333+
return .scalarSequence(.init(scalars, trivia: trivia))
334+
}.value
277335
}
278336

279337
/// Eat a scalar off the front, starting from after the
280338
/// backslash and base character (e.g. `\u` or `\x`).
281339
///
282-
/// UniScalar -> 'u{' HexDigit{1...} '}'
340+
/// UniScalar -> 'u{' UniScalarSequence '}'
283341
/// | 'u' HexDigit{4}
284342
/// | 'x{' HexDigit{1...} '}'
285343
/// | 'x' HexDigit{0...2}
@@ -289,49 +347,60 @@ extension Source {
289347
///
290348
mutating func expectUnicodeScalar(
291349
escapedCharacter base: Character
292-
) throws -> Located<Unicode.Scalar> {
350+
) throws -> AST.Atom.Kind {
293351
try recordLoc { src in
352+
353+
func nullScalar() -> AST.Atom.Kind {
354+
let pos = src.currentPosition
355+
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
356+
}
357+
294358
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
295359
switch base {
296360
// Hex numbers.
297-
case "u" where src.tryEat("{"), "x" where src.tryEat("{"):
298-
let str = try src.lexUntil(eating: "}").value
299-
return try Source.validateUnicodeScalar(str, .hex)
361+
case "u" where src.tryEat("{"):
362+
return try src.expectUnicodeScalarSequence(eating: "}")
363+
364+
case "x" where src.tryEat("{"):
365+
let str = try src.lexUntil(eating: "}")
366+
return .scalar(try Source.validateUnicodeScalar(str, .hex))
300367

301368
case "x":
302369
// \x expects *up to* 2 digits.
303-
guard let digits = src.tryEatPrefix(maxLength: 2, \.isHexDigit) else {
370+
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
371+
else {
304372
// In PCRE, \x without any valid hex digits is \u{0}.
305373
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
306374
// could be changed to throw an error if we had a parsing mode for
307375
// them.
308-
return Unicode.Scalar(0)
376+
return nullScalar()
309377
}
310-
return try Source.validateUnicodeScalar(digits.string, .hex)
378+
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
311379

312380
case "u":
313-
return try src.expectUnicodeScalar(numDigits: 4).value
381+
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
314382
case "U":
315-
return try src.expectUnicodeScalar(numDigits: 8).value
383+
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
316384

317385
// Octal numbers.
318386
case "o" where src.tryEat("{"):
319-
let str = try src.lexUntil(eating: "}").value
320-
return try Source.validateUnicodeScalar(str, .octal)
387+
let str = try src.lexUntil(eating: "}")
388+
return .scalar(try Source.validateUnicodeScalar(str, .octal))
321389

322390
case "0":
323391
// We can read *up to* 3 more octal digits.
324392
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
325393
// PCRE mode, we should limit it here.
326-
guard let digits = src.tryEatPrefix(maxLength: 3, \.isOctalDigit) else {
327-
return Unicode.Scalar(0)
394+
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
395+
else {
396+
return nullScalar()
328397
}
329-
return try Source.validateUnicodeScalar(digits.string, .octal)
398+
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
330399

331400
default:
332401
fatalError("Unexpected scalar start")
333402
}
334-
}
403+
}.value
335404
}
336405

337406
/// Try to consume a quantifier
@@ -434,13 +503,22 @@ extension Source {
434503
private mutating func lexUntil(
435504
_ predicate: (inout Source) throws -> Bool
436505
) rethrows -> Located<String> {
506+
// We track locations outside of recordLoc, as the predicate may advance the
507+
// input when we hit the end, and we don't want that to affect the location
508+
// of what was lexed in the `result`. We still want the recordLoc call to
509+
// attach locations to any thrown errors though.
510+
// TODO: We should find a better way of doing this, `lexUntil` seems full
511+
// of footguns.
512+
let start = currentPosition
513+
var end = currentPosition
514+
var result = ""
437515
try recordLoc { src in
438-
var result = ""
439516
while try !predicate(&src) {
440517
result.append(src.eat())
518+
end = src.currentPosition
441519
}
442-
return result
443520
}
521+
return .init(result, start ..< end)
444522
}
445523

446524
private mutating func lexUntil(eating end: String) throws -> Located<String> {
@@ -576,6 +654,16 @@ extension Source {
576654
// inside a custom character class (and only treats whitespace as
577655
// non-semantic there for the extra-extended `(?xx)` mode). If we get a
578656
// strict-PCRE mode, we'll need to add a case for that.
657+
return lexWhitespace()
658+
}
659+
660+
/// Try to consume whitespace as trivia
661+
///
662+
/// Whitespace -> WhitespaceChar+
663+
///
664+
/// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex
665+
/// whitespace.
666+
mutating func lexWhitespace() -> AST.Trivia? {
579667
let trivia: Located<String>? = recordLoc { src in
580668
src.tryEatPrefix(\.isPatternWhitespace)?.string
581669
}
@@ -1153,7 +1241,7 @@ extension Source {
11531241

11541242
// We should either have a unicode scalar.
11551243
if src.tryEat(sequence: "U+") {
1156-
let str = try src.lexUntil(eating: "}").value
1244+
let str = try src.lexUntil(eating: "}")
11571245
return .scalar(try Source.validateUnicodeScalar(str, .hex))
11581246
}
11591247

@@ -1581,8 +1669,7 @@ extension Source {
15811669
switch char {
15821670
// Hexadecimal and octal unicode scalars.
15831671
case "u", "x", "U", "o", "0":
1584-
return try .scalar(
1585-
src.expectUnicodeScalar(escapedCharacter: char).value)
1672+
return try src.expectUnicodeScalar(escapedCharacter: char)
15861673
default:
15871674
break
15881675
}

0 commit comments

Comments
 (0)