Skip to content

Commit e87149a

Browse files
authored
Merge pull request #517 from hamishknight/closed-range
2 parents 8688296 + 941d28a commit e87149a

File tree

5 files changed

+193 -43 lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ extension AST {
6262
self.rhs = rhs
6363
self.trivia = trivia
6464
}
65+
66+
public var location: SourceLocation {
67+
lhs.location.union(with: rhs.location)
68+
}
6569
}
6670
public enum SetOp: String, Hashable {
6771
case subtraction = "--"
@@ -108,6 +112,25 @@ extension CustomCC.Member {
108112
public var isSemantic: Bool {
109113
!isTrivia
110114
}
115+
116+
public var location: SourceLocation {
117+
switch self {
118+
case let .custom(c): return c.location
119+
case let .range(r): return r.location
120+
case let .atom(a): return a.location
121+
case let .quote(q): return q.location
122+
case let .trivia(t): return t.location
123+
case let .setOperation(lhs, dash, rhs):
124+
var loc = dash.location
125+
if let lhs = lhs.first {
126+
loc = loc.union(with: lhs.location)
127+
}
128+
if let rhs = rhs.last {
129+
loc = loc.union(with: rhs.location)
130+
}
131+
return loc
132+
}
133+
}
111134
}
112135

113136
extension AST.CustomCharacterClass {

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ enum ParseError: Error, Hashable {
9494
case invalidNamedReference(String)
9595
case duplicateNamedCapture(String)
9696
case invalidCharacterClassRangeOperand
97+
case unsupportedDotNetSubtraction
9798
case invalidQuantifierRange(Int, Int)
9899
case invalidCharacterRange(from: Character, to: Character)
99100
case notQuantifiable
@@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible {
174175
case .expectedCustomCharacterClassMembers:
175176
return "expected custom character class members"
176177
case .invalidCharacterClassRangeOperand:
177-
return "invalid character class range"
178+
return "invalid bound for character class range"
179+
case .unsupportedDotNetSubtraction:
180+
return "subtraction with '-' is unsupported; use '--' instead"
178181
case .emptyProperty:
179182
return "empty property"
180183
case .unknownProperty(let key, let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,25 @@ extension Source {
12451245
return nil
12461246
}
12471247

1248+
/// Check to see if we can lex a .NET subtraction. Returns the
1249+
/// location of the `-`.
1250+
///
1251+
/// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass
1252+
///
1253+
func canLexDotNetCharClassSubtraction(
1254+
context: ParsingContext
1255+
) -> SourceLocation? {
1256+
lookahead { src in
1257+
// We can lex '-' as a .NET subtraction if it precedes a custom character
1258+
// class.
1259+
while (try? src.lexTrivia(context: context)) != nil {}
1260+
guard let dashLoc = src.tryEatWithLoc("-") else { return nil }
1261+
while (try? src.lexTrivia(context: context)) != nil {}
1262+
guard src.lexCustomCCStart() != nil else { return nil }
1263+
return dashLoc
1264+
}
1265+
}
1266+
12481267
private mutating func lexPOSIXCharacterProperty(
12491268
) throws -> Located<AST.Atom.CharacterProperty>? {
12501269
try recordLoc { src in

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 83 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,12 @@ extension Parser {
502502
var members: Array<Member> = []
503503
try parseCCCMembers(into: &members)
504504

505+
// Make sure we have at least one semantic member.
506+
if members.none(\.isSemantic) {
507+
throw Source.LocatedError(
508+
ParseError.expectedCustomCharacterClassMembers, start.location)
509+
}
510+
505511
// If we have a binary set operator, parse it and the next members. Note
506512
// that this means we left associate for a chain of operators.
507513
// TODO: We may want to diagnose and require users to disambiguate, at least
@@ -511,16 +517,12 @@ extension Parser {
511517
var rhs: Array<Member> = []
512518
try parseCCCMembers(into: &rhs)
513519

514-
if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
520+
if rhs.none(\.isSemantic) {
515521
throw Source.LocatedError(
516522
ParseError.expectedCustomCharacterClassMembers, start.location)
517523
}
518524
members = [.setOperation(members, binOp, rhs)]
519525
}
520-
if members.none(\.isSemantic) {
521-
throw Source.LocatedError(
522-
ParseError.expectedCustomCharacterClassMembers, start.location)
523-
}
524526
try source.expect("]")
525527
return CustomCC(start, members, loc(start.location.start))
526528
}
@@ -550,48 +552,88 @@ extension Parser {
550552
return nil
551553
}
552554

553-
mutating func parseCCCMembers(
554-
into members: inout Array<CustomCC.Member>
555+
/// Attempt to parse a custom character class range into `members`, or regular
556+
/// members if a range cannot be formed.
557+
mutating func parsePotentialCCRange(
558+
into members: inout [CustomCC.Member]
555559
) throws {
556-
// Parse members until we see the end of the custom char class or an
557-
// operator.
558-
while let member = try parseCCCMember() {
559-
members.append(member)
560-
561-
// If we have an atom, we can try to parse a character class range. Each
562-
// time we parse a component of the range, we append to `members` in case
563-
// it ends up not being a range, and we bail. If we succeed in parsing, we
564-
// remove the intermediate members.
565-
if case .atom(let lhs) = member {
566-
let membersBeforeRange = members.count - 1
567-
568-
while let t = try source.lexTrivia(context: context) {
569-
members.append(.trivia(t))
570-
}
560+
guard let lhs = members.last, lhs.isSemantic else { return }
561+
562+
// Try and see if we can parse a character class range. Each time we parse
563+
// a component of the range, we append to `members` in case it ends up not
564+
// being a range, and we bail. If we succeed in parsing, we remove the
565+
// intermediate members.
566+
let membersBeforeRange = members.count - 1
567+
while let t = try source.lexTrivia(context: context) {
568+
members.append(.trivia(t))
569+
}
570+
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
571+
return
572+
}
571573

572-
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
573-
continue
574-
}
575-
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
576-
members.append(.atom(.init(.char("-"), dash)))
574+
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
575+
members.append(.atom(.init(.char("-"), dash)))
577576

578-
while let t = try source.lexTrivia(context: context) {
579-
members.append(.trivia(t))
577+
while let t = try source.lexTrivia(context: context) {
578+
members.append(.trivia(t))
579+
}
580+
guard let rhs = try parseCCCMember() else { return }
581+
members.append(rhs)
582+
583+
func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom {
584+
switch m {
585+
case .atom(let a):
586+
return a
587+
case .custom:
588+
// Not supported. While .NET allows `x-[...]` to spell subtraction, we
589+
// require `x--[...]`. We also ban `[...]-x` for consistency.
590+
if isLHS {
591+
throw Source.LocatedError(
592+
ParseError.invalidCharacterClassRangeOperand, m.location)
593+
} else {
594+
throw Source.LocatedError(
595+
ParseError.unsupportedDotNetSubtraction, m.location)
580596
}
581-
guard let rhs = try parseCCCMember() else { continue }
582-
members.append(rhs)
583-
584-
guard case let .atom(rhs) = rhs else { continue }
585-
586-
// We've successfully parsed an atom LHS and RHS, so form a range,
587-
// collecting the trivia we've parsed, and replacing the members that
588-
// would have otherwise been added to the custom character class.
589-
let rangeMemberCount = members.count - membersBeforeRange
590-
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
591-
members.removeLast(rangeMemberCount)
592-
members.append(.range(.init(lhs, dash, rhs, trivia: trivia)))
597+
case .quote:
598+
// Currently unsupported, we need to figure out what the semantics
599+
// would be for grapheme/scalar modes.
600+
throw Source.LocatedError(
601+
ParseError.unsupported("range with quoted sequence"), m.location)
602+
case .trivia:
603+
throw Unreachable("Should have been lexed separately")
604+
case .range, .setOperation:
605+
throw Unreachable("Parsed later")
593606
}
594607
}
608+
let lhsOp = try makeOperand(lhs, isLHS: true)
609+
let rhsOp = try makeOperand(rhs, isLHS: false)
610+
611+
// We've successfully parsed an atom LHS and RHS, so form a range,
612+
// collecting the trivia we've parsed, and replacing the members that
613+
// would have otherwise been added to the custom character class.
614+
let rangeMemberCount = members.count - membersBeforeRange
615+
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
616+
members.removeLast(rangeMemberCount)
617+
members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia)))
618+
619+
// We need to specially check if we can lex a .NET character class
620+
// subtraction here as e.g `[a-c-[...]]` is allowed in .NET. Otherwise we'd
621+
// treat the second `-` as literal.
622+
if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) {
623+
throw Source.LocatedError(
624+
ParseError.unsupportedDotNetSubtraction, dashLoc)
625+
}
626+
}
627+
628+
mutating func parseCCCMembers(
629+
into members: inout Array<CustomCC.Member>
630+
) throws {
631+
// Parse members and ranges until we see the end of the custom char class
632+
// or an operator.
633+
while let member = try parseCCCMember() {
634+
members.append(member)
635+
try parsePotentialCCRange(into: &members)
636+
}
595637
}
596638
}
597639

Tests/RegexTests/ParseTests.swift

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,10 +517,36 @@ extension RegexTests {
517517

518518
parseTest(
519519
"[a-b-c]", charClass(range_m("a", "b"), "-", "c"))
520+
parseTest(
521+
"[a-b-c-d]", charClass(range_m("a", "b"), "-", range_m("c", "d")))
522+
523+
parseTest("[a-c---]", charClass(
524+
setOp(range_m("a", "c"), op: .subtraction, "-")
525+
))
526+
527+
parseTest("(?x)[a-c -- -]", concat(
528+
changeMatchingOptions(matchingOptions(adding: .extended)),
529+
charClass(setOp(range_m("a", "c"), op: .subtraction, "-"))
530+
))
531+
532+
parseTest("(?x)[a-c - - -]", concat(
533+
changeMatchingOptions(matchingOptions(adding: .extended)),
534+
charClass(range_m("a", "c"), range_m("-", "-"))
535+
))
520536

521537
parseTest("[-a-]", charClass("-", "a", "-"))
522538
parseTest("[[a]-]", charClass(charClass("a"), "-"))
523-
parseTest("[[a]-b]", charClass(charClass("a"), "-", "b"))
539+
parseTest("[-[a]]", charClass("-", charClass("a")))
540+
541+
parseTest(#"(?x)[ -[b]]"#, concat(
542+
changeMatchingOptions(matchingOptions(adding: .extended)),
543+
charClass("-", charClass("b"))
544+
))
545+
546+
parseTest(#"[ - [ ]]"#, charClass(range_m(" ", " "), charClass(" ")))
547+
parseTest(#"[ - [ ] ]"#, charClass(range_m(" ", " "), charClass(" "), " "))
548+
549+
parseTest(#"[a-c-\Qd\E]"#, charClass(range_m("a", "c"), "-", quote_m("d")))
524550

525551
parseTest("[a-z]", charClass(range_m("a", "z")))
526552
parseTest("[a-a]", charClass(range_m("a", "a")))
@@ -2692,6 +2718,32 @@ extension RegexTests {
26922718
diagnosticTest("[[:=:]]", .emptyProperty)
26932719

26942720
diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand)
2721+
diagnosticTest("[[a]-b]", .invalidCharacterClassRangeOperand)
2722+
2723+
// .NET subtraction is banned, we require explicit '--'.
2724+
diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction)
2725+
diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction)
2726+
diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction)
2727+
diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction)
2728+
diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction)
2729+
diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction)
2730+
diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand)
2731+
diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction)
2732+
diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction)
2733+
2734+
diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers)
2735+
diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers)
2736+
diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers)
2737+
diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers)
2738+
diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand)
2739+
2740+
diagnosticTest("[--]", .expectedCustomCharacterClassMembers)
2741+
diagnosticTest("[---]", .expectedCustomCharacterClassMembers)
2742+
diagnosticTest("[----]", .expectedCustomCharacterClassMembers)
2743+
2744+
// Quoted sequences aren't currently supported as range operands.
2745+
diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence"))
2746+
diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence"))
26952747

26962748
diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
26972749
diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
@@ -2878,6 +2930,17 @@ extension RegexTests {
28782930
/#
28792931
"""#, .quoteMayNotSpanMultipleLines)
28802932

2933+
// .NET subtraction
2934+
diagnosticWithDelimitersTest(#"""
2935+
#/
2936+
[
2937+
a # interesting
2938+
- #a
2939+
[ b] # comment
2940+
]
2941+
/#
2942+
"""#, .unsupportedDotNetSubtraction)
2943+
28812944
// MARK: Group specifiers
28822945

28832946
diagnosticTest(#"(*"#, .unknownGroupKind("*"))

0 commit comments

Comments
 (0)