Skip to content

Commit 6997fcb

Browse files
committed
Introduce AST.Atom.Number
This stores a source location together with a value that may be `nil`, which is necessary to enable parser recovery in cases where we expect a number but parse something that, e.g., overflows.
1 parent 40f5fdd commit 6997fcb

File tree

15 files changed

+252
-168
lines changed

15 files changed

+252
-168
lines changed

Sources/_RegexParser/Regex/AST/AST.swift

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -265,33 +265,46 @@ extension AST {
265265
public enum Kind: Hashable {
266266
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
267267
// Oniguruma: \k<n>, \k'n'
268-
case absolute(Int)
268+
case absolute(AST.Atom.Number)
269269

270270
// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
271271
// (?(+n)... (?(-n)...
272272
// Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n'
273-
case relative(Int)
273+
case relative(AST.Atom.Number)
274274

275275
// \k<name> \k'name' \g{name} \k{name} (?P=name)
276276
// \g<name> \g'name' (?&name) (?P>name)
277277
// (?(<name>)... (?('name')... (?(name)...
278278
case named(String)
279279

280280
/// (?R), (?(R)..., which are equivalent to (?0), (?(0)...
281-
static var recurseWholePattern: Kind { .absolute(0) }
281+
static func recurseWholePattern(_ loc: SourceLocation) -> Kind {
282+
.absolute(.init(0, at: loc))
283+
}
284+
285+
/// Whether this is a reference that recurses the whole pattern, rather
286+
/// than a group.
287+
public var recursesWholePattern: Bool {
288+
switch self {
289+
case .absolute(let a):
290+
return a.value == 0
291+
default:
292+
return false
293+
}
294+
}
282295
}
283296
public var kind: Kind
284297

285298
/// An additional specifier supported by Oniguruma that specifies what
286299
/// recursion level the group being referenced belongs to.
287-
public var recursionLevel: Located<Int>?
300+
public var recursionLevel: AST.Atom.Number?
288301

289302
/// The location of the inner numeric or textual reference, e.g the location
290303
/// of '-2' in '\g{-2}'. Note this includes the recursion level for e.g
291304
/// '\k<a+2>'.
292305
public var innerLoc: SourceLocation
293306

294-
public init(_ kind: Kind, recursionLevel: Located<Int>? = nil,
307+
public init(_ kind: Kind, recursionLevel: AST.Atom.Number? = nil,
295308
innerLoc: SourceLocation) {
296309
self.kind = kind
297310
self.recursionLevel = recursionLevel
@@ -300,7 +313,7 @@ extension AST {
300313

301314
/// Whether this is a reference that recurses the whole pattern, rather than
302315
/// a group.
303-
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
316+
public var recursesWholePattern: Bool { kind.recursesWholePattern }
304317
}
305318

306319
/// A set of global matching options in a regular expression literal.

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,18 @@ extension AST.Atom {
113113
}
114114

115115
extension AST.Atom {
116+
public struct Number: Hashable {
117+
/// The value, which may be `nil` in an invalid AST, e.g. the parser expected
118+
/// a number at a given location, or the parsed number overflowed.
119+
public var value: Int?
120+
public var location: SourceLocation
121+
122+
public init(_ value: Int?, at location: SourceLocation) {
123+
self.value = value
124+
self.location = location
125+
}
126+
}
127+
116128
public struct Scalar: Hashable {
117129
public var value: UnicodeScalar
118130
public var location: SourceLocation
@@ -558,7 +570,7 @@ extension AST.Atom {
558570
/// A PCRE callout written `(?C...)`
559571
public struct PCRE: Hashable {
560572
public enum Argument: Hashable {
561-
case number(Int)
573+
case number(AST.Atom.Number)
562574
case string(String)
563575
}
564576
public var arg: AST.Located<Argument>

Sources/_RegexParser/Regex/AST/Conditional.swift

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,13 @@ extension AST.Conditional {
6666

6767
extension AST.Conditional.Condition {
6868
public struct PCREVersionNumber: Hashable {
69-
public var major: Int
70-
public var minor: Int
69+
public var major: AST.Atom.Number
70+
public var minor: AST.Atom.Number
7171
public var location: SourceLocation
7272

73-
public init(major: Int, minor: Int, _ location: SourceLocation) {
73+
public init(
74+
major: AST.Atom.Number, minor: AST.Atom.Number, _ location: SourceLocation
75+
) {
7476
self.major = major
7577
self.minor = minor
7678
self.location = location

Sources/_RegexParser/Regex/AST/MatchingOptions.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,13 +175,13 @@ extension AST {
175175
}
176176
public enum Kind: Hashable {
177177
/// (*LIMIT_DEPTH=d)
178-
case limitDepth(Located<Int>)
178+
case limitDepth(AST.Atom.Number)
179179

180180
/// (*LIMIT_HEAP=d)
181-
case limitHeap(Located<Int>)
181+
case limitHeap(AST.Atom.Number)
182182

183183
/// (*LIMIT_MATCH=d)
184-
case limitMatch(Located<Int>)
184+
case limitMatch(AST.Atom.Number)
185185

186186
/// (*NOTEMPTY)
187187
case notEmpty

Sources/_RegexParser/Regex/AST/Quantification.swift

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,13 @@ extension AST {
3737
}
3838

3939
public enum Amount: Hashable {
40-
case zeroOrMore // *
41-
case oneOrMore // +
42-
case zeroOrOne // ?
43-
case exactly(Located<Int>) // {n}
44-
case nOrMore(Located<Int>) // {n,}
45-
case upToN(Located<Int>) // {,n}
46-
case range(Located<Int>, Located<Int>) // {n,m}
40+
case zeroOrMore // *
41+
case oneOrMore // +
42+
case zeroOrOne // ?
43+
case exactly(AST.Atom.Number) // {n}
44+
case nOrMore(AST.Atom.Number) // {n,}
45+
case upToN(AST.Atom.Number) // {,n}
46+
case range(AST.Atom.Number, AST.Atom.Number) // {n,m}
4747
}
4848

4949
public enum Kind: String, Hashable {
@@ -58,7 +58,7 @@ extension AST {
5858

5959
extension AST.Quantification.Amount {
6060
/// The bounds.
61-
public var bounds: (atLeast: Int, atMost: Int?) {
61+
public var bounds: (atLeast: Int?, atMost: Int?) {
6262
switch self {
6363
case .zeroOrMore: return (0, nil)
6464
case .oneOrMore: return (1, nil)

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -253,18 +253,18 @@ extension Source {
253253
///
254254
/// Throws on overflow
255255
///
256-
private mutating func lexNumber<Num: FixedWidthInteger>(
257-
_ ty: Num.Type, _ kind: RadixKind
258-
) throws -> Located<Num>? {
256+
private mutating func lexNumber(
257+
_ kind: RadixKind
258+
) throws -> AST.Atom.Number? {
259259
try recordLoc { src in
260-
guard let str = src.tryEatPrefix(kind.characterFilter)?.string else {
260+
guard let str = src.tryEatLocatedPrefix(kind.characterFilter) else {
261261
return nil
262262
}
263-
guard let i = Num(str, radix: kind.radix) else {
264-
throw ParseError.numberOverflow(str)
263+
guard let i = Int(str.value, radix: kind.radix) else {
264+
throw ParseError.numberOverflow(str.value)
265265
}
266-
return i
267-
}
266+
return .init(i, at: str.location)
267+
}.value
268268
}
269269

270270
/// Try to eat a number off the front.
@@ -273,11 +273,11 @@ extension Source {
273273
///
274274
/// Throws on overflow
275275
///
276-
mutating func lexNumber() throws -> Located<Int>? {
277-
try lexNumber(Int.self, .decimal)
276+
mutating func lexNumber() throws -> AST.Atom.Number? {
277+
try lexNumber(.decimal)
278278
}
279279

280-
mutating func expectNumber() throws -> Located<Int> {
280+
mutating func expectNumber() throws -> AST.Atom.Number {
281281
guard let num = try lexNumber() else {
282282
throw ParseError.expectedNumber("", kind: .decimal)
283283
}
@@ -488,9 +488,10 @@ extension Source {
488488

489489
if let t = src.lexWhitespace() { trivia.append(t) }
490490

491-
let upperOpt = try src.lexNumber()?.map { upper in
491+
var upperOpt = try src.lexNumber()
492+
if closedRange == false {
492493
// If we have an open range, the upper bound should be adjusted down.
493-
closedRange == true ? upper : upper - 1
494+
upperOpt?.value? -= 1
494495
}
495496

496497
if let t = src.lexWhitespace() { trivia.append(t) }
@@ -1066,10 +1067,11 @@ extension Source {
10661067
///
10671068
private mutating func expectPCREVersionNumber(
10681069
) throws -> AST.Conditional.Condition.PCREVersionNumber {
1069-
let nums = try recordLoc { src -> (major: Int, minor: Int) in
1070-
let major = try src.expectNumber().value
1070+
let nums = try recordLoc { src -> (major: AST.Atom.Number,
1071+
minor: AST.Atom.Number) in
1072+
let major = try src.expectNumber()
10711073
try src.expect(".")
1072-
let minor = try src.expectNumber().value
1074+
let minor = try src.expectNumber()
10731075
return (major, minor)
10741076
}
10751077
return .init(major: nums.value.major, minor: nums.value.minor,
@@ -1119,7 +1121,7 @@ extension Source {
11191121
}
11201122
if let num = try src.lexNumber() {
11211123
return .groupRecursionCheck(
1122-
.init(.absolute(num.value), innerLoc: num.location))
1124+
.init(.absolute(num), innerLoc: num.location))
11231125
}
11241126
return .recursionCheck
11251127
}
@@ -1406,20 +1408,21 @@ extension Source {
14061408
let kind = try recordLoc { src -> AST.Reference.Kind? in
14071409
try src.tryEating { src in
14081410
// Note this logic should match canLexNumberedReference.
1409-
if src.tryEat("+"), let num = try src.lexNumber() {
1410-
return .relative(num.value)
1411+
if let plus = src.tryEatWithLoc("+"), let num = try src.lexNumber() {
1412+
return .relative(.init(num.value, at: num.location.union(with: plus)))
14111413
}
1412-
if src.tryEat("-"), let num = try src.lexNumber() {
1413-
return .relative(-num.value)
1414+
if let minus = src.tryEatWithLoc("-"), let num = try src.lexNumber() {
1415+
let val = num.value.map { x in -x }
1416+
return .relative(.init(val, at: num.location.union(with: minus)))
14141417
}
14151418
if let num = try src.lexNumber() {
1416-
return .absolute(num.value)
1419+
return .absolute(num)
14171420
}
14181421
return nil
14191422
}
14201423
}
14211424
guard let kind = kind else { return nil }
1422-
guard allowWholePatternRef || kind.value != .recurseWholePattern else {
1425+
guard allowWholePatternRef || !kind.value.recursesWholePattern else {
14231426
throw ParseError.cannotReferToWholePattern
14241427
}
14251428
let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil
@@ -1432,12 +1435,14 @@ extension Source {
14321435
/// RecursionLevel -> '+' <Int> | '-' <Int>
14331436
///
14341437
private mutating func lexRecursionLevel(
1435-
) throws -> Located<Int>? {
1436-
try recordLoc { src in
1438+
) throws -> AST.Atom.Number? {
1439+
let value = try recordLoc { src -> Int? in
14371440
if src.tryEat("+") { return try src.expectNumber().value }
1438-
if src.tryEat("-") { return try -src.expectNumber().value }
1441+
if src.tryEat("-") { return try src.expectNumber().value.map { x in -x } }
14391442
return nil
14401443
}
1444+
guard let value = value else { return nil }
1445+
return .init(value.value, at: value.location)
14411446
}
14421447

14431448
/// Checks whether a numbered reference can be lexed.
@@ -1579,9 +1584,8 @@ extension Source {
15791584
}
15801585

15811586
// Backslash followed by a non-0 digit character is a backreference.
1582-
if firstChar != "0", let numAndLoc = try src.lexNumber() {
1583-
return .backreference(.init(
1584-
.absolute(numAndLoc.value), innerLoc: numAndLoc.location))
1587+
if firstChar != "0", let num = try src.lexNumber() {
1588+
return .backreference(.init(.absolute(num), innerLoc: num.location))
15851589
}
15861590
return nil
15871591
}
@@ -1621,7 +1625,7 @@ extension Source {
16211625
// Whole-pattern recursion, which is equivalent to (?0).
16221626
if let loc = src.tryEatWithLoc("R") {
16231627
try src.expect(")")
1624-
return .subpattern(.init(.recurseWholePattern, innerLoc: loc))
1628+
return .subpattern(.init(.recurseWholePattern(loc), innerLoc: loc))
16251629
}
16261630

16271631
// Numbered subpattern reference.
@@ -1772,11 +1776,12 @@ extension Source {
17721776
let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in
17731777
// Parse '(?C' followed by a number.
17741778
if let num = try src.lexNumber() {
1775-
return .number(num.value)
1779+
return .number(num)
17761780
}
17771781
// '(?C)' is implicitly '(?C0)'.
17781782
if src.peek() == ")" {
1779-
return .number(0)
1783+
let pos = src.currentPosition
1784+
return .number(.init(0, at: SourceLocation(pos ..< pos)))
17801785
}
17811786
// Parse '(?C' followed by a set of balanced delimiters as defined by
17821787
// http://pcre.org/current/doc/html/pcre2pattern.html#SEC28

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,10 @@ struct ParsingContext {
9696
func isPriorGroupRef(_ ref: AST.Reference.Kind) -> Bool {
9797
switch ref {
9898
case .absolute(let i):
99+
guard let i = i.value else { return false }
99100
return i <= priorGroupCount
100101
case .relative(let i):
102+
guard let i = i.value else { return false }
101103
return i < 0
102104
case .named(let str):
103105
return usedGroupNames.contains(str)

Sources/_RegexParser/Regex/Parse/Sema.swift

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@ extension RegexValidator {
7676
throw error(.unsupported("recursion level"), at: recLevel.location)
7777
}
7878
switch ref.kind {
79-
case .absolute(let i):
79+
case .absolute(let num):
80+
guard let i = num.value else { break }
8081
guard i < captures.captures.count else {
8182
throw error(.invalidReference(i), at: ref.innerLoc)
8283
}
@@ -359,9 +360,9 @@ extension RegexValidator {
359360
}
360361
switch quant.amount.value {
361362
case .range(let lhs, let rhs):
362-
guard lhs.value <= rhs.value else {
363-
throw error(
364-
.invalidQuantifierRange(lhs.value, rhs.value), at: quant.location)
363+
guard let lhs = lhs.value, let rhs = rhs.value else { break }
364+
guard lhs <= rhs else {
365+
throw error(.invalidQuantifierRange(lhs, rhs), at: quant.location)
365366
}
366367
case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN:
367368
break

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ extension AST.Atom {
173173
}
174174
}
175175

176+
extension AST.Atom.Number: _ASTPrintable {
177+
public var _dumpBase: String {
178+
value.map { "\($0)" } ?? "<invalid>"
179+
}
180+
}
181+
176182
extension AST.Atom.Callout: _ASTPrintable {
177183
public var _dumpBase: String {
178184
switch self {
@@ -227,7 +233,7 @@ extension AST.Reference: _ASTPrintable {
227233
public var _dumpBase: String {
228234
var result = "\(kind)"
229235
if let recursionLevel = recursionLevel {
230-
result += "\(recursionLevel.value)"
236+
result += "\(recursionLevel)"
231237
}
232238
return result
233239
}
@@ -270,11 +276,11 @@ extension AST.Quantification.Amount: _ASTPrintable {
270276
case .zeroOrMore: return "zeroOrMore"
271277
case .oneOrMore: return "oneOrMore"
272278
case .zeroOrOne: return "zeroOrOne"
273-
case let .exactly(n): return "exactly<\(n.value)>"
274-
case let .nOrMore(n): return "nOrMore<\(n.value)>"
275-
case let .upToN(n): return "uptoN<\(n.value)>"
279+
case let .exactly(n): return "exactly<\(n)>"
280+
case let .nOrMore(n): return "nOrMore<\(n)>"
281+
case let .upToN(n): return "uptoN<\(n)>"
276282
case let .range(lower, upper):
277-
return ".range<\(lower.value)...\(upper.value)>"
283+
return ".range<\(lower)...\(upper)>"
278284
}
279285
}
280286
}

0 commit comments

Comments
 (0)