Skip to content

Commit 70ecb27

Browse files
committed
Fix trivia parsing for set operations and initial ] cases
Previously we would check for an empty array of members when deciding whether an initial `]` is literal, or if the operands of a set operation are invalid. Switch to checking whether we have any semantic members instead.
1 parent 33b0b4c commit 70ecb27

File tree

5 files changed

+73
-18
lines changed

5 files changed

+73
-18
lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,18 @@ extension CustomCC.Member {
9797
if case .trivia = self { return true }
9898
return false
9999
}
100+
101+
public var isSemantic: Bool {
102+
!isTrivia
103+
}
100104
}
101105

102106
extension AST.CustomCharacterClass {
103107
/// Strip trivia from the character class members. This does not recurse into
104108
/// nested custom character classes.
105109
public var strippingTriviaShallow: Self {
106110
var copy = self
107-
copy.members = copy.members.filter { !$0.isTrivia }
111+
copy.members = copy.members.filter(\.isSemantic)
108112
return copy
109113
}
110114
}

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -438,16 +438,18 @@ extension Parser {
438438
defer { context.isInCustomCharacterClass = alreadyInCCC }
439439

440440
typealias Member = CustomCC.Member
441-
try source.expectNonEmpty()
442-
443441
var members: Array<Member> = []
442+
try parseCCCMembers(into: &members)
444443

445-
// We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty
446-
// character classes, and assume an initial ']' is literal.
447-
if let loc = source.tryEatWithLoc("]") {
448-
members.append(.atom(.init(.char("]"), loc)))
444+
// If we didn't parse any semantic members, we can eat a ']' character, as
445+
// PCRE, Oniguruma, and ICU forbid empty character classes, and assume an
446+
// initial ']' is literal.
447+
if members.none(\.isSemantic) {
448+
if let loc = source.tryEatWithLoc("]") {
449+
members.append(.atom(.init(.char("]"), loc)))
450+
try parseCCCMembers(into: &members)
451+
}
449452
}
450-
try parseCCCMembers(into: &members)
451453

452454
// If we have a binary set operator, parse it and the next members. Note
453455
// that this means we left associate for a chain of operators.
@@ -458,8 +460,9 @@ extension Parser {
458460
var rhs: Array<Member> = []
459461
try parseCCCMembers(into: &rhs)
460462

461-
if members.isEmpty || rhs.isEmpty {
462-
throw ParseError.expectedCustomCharacterClassMembers
463+
if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
464+
throw Source.LocatedError(
465+
ParseError.expectedCustomCharacterClassMembers, start.location)
463466
}
464467

465468
// If we're done, bail early
@@ -472,8 +475,9 @@ extension Parser {
472475
// Otherwise it's just another member to accumulate
473476
members = [setOp]
474477
}
475-
if members.isEmpty {
476-
throw ParseError.expectedCustomCharacterClassMembers
478+
if members.none(\.isSemantic) {
479+
throw Source.LocatedError(
480+
ParseError.expectedCustomCharacterClassMembers, start.location)
477481
}
478482
try source.expect("]")
479483
return CustomCC(start, members, loc(start.location.start))
@@ -484,7 +488,8 @@ extension Parser {
484488
) throws {
485489
// Parse members until we see the end of the custom char class or an
486490
// operator.
487-
while source.peek() != "]" && source.peekCCBinOp() == nil {
491+
while !source.isEmpty && source.peek() != "]" &&
492+
source.peekCCBinOp() == nil {
488493

489494
// Nested custom character class.
490495
if let cccStart = try source.lexCustomCCStart() {

Sources/_RegexParser/Regex/Printing/DumpAST.swift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable {
312312
case .quote(let q): return "\(q)"
313313
case .trivia(let t): return "\(t)"
314314
case .setOperation(let lhs, let op, let rhs):
315-
return "op \(lhs) \(op.value) \(rhs)"
315+
// TODO: We should eventually have some way of filtering out trivia for
316+
// tests, so that it can appear in regular dumps.
317+
return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))"
316318
}
317319
}
318320
}

Tests/RegexTests/MatchTests.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1217,6 +1217,9 @@ extension RegexTests {
12171217
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t")
12181218
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t")
12191219
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t")
1220+
1221+
firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa")
1222+
firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a")
12201223
}
12211224

12221225
func testASCIIClasses() {

Tests/RegexTests/ParseTests.swift

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -460,9 +460,25 @@ extension RegexTests {
460460

461461
parseTest("[-]", charClass("-"))
462462

463-
// Empty character classes are forbidden, therefore this is a character
464-
// class of literal ']'.
463+
// Empty character classes are forbidden, therefore these are character
464+
// classes containing literal ']'.
465465
parseTest("[]]", charClass("]"))
466+
parseTest("[]a]", charClass("]", "a"))
467+
parseTest(
468+
"(?x)[ ]]", changeMatchingOptions(
469+
matchingOptions(adding: .extended), isIsolated: true,
470+
charClass("]"))
471+
)
472+
parseTest(
473+
"(?x)[ ] ]", changeMatchingOptions(
474+
matchingOptions(adding: .extended), isIsolated: true,
475+
charClass("]"))
476+
)
477+
parseTest(
478+
"(?x)[ ] a ]", changeMatchingOptions(
479+
matchingOptions(adding: .extended), isIsolated: true,
480+
charClass("]", "a"))
481+
)
466482

467483
// These are metacharacters in certain contexts, but normal characters
468484
// otherwise.
@@ -613,6 +629,16 @@ extension RegexTests {
613629
parseTest(
614630
"~~*", concat("~", zeroOrMore(of: "~")))
615631

632+
parseTest(
633+
"[ && ]", charClass(
634+
.setOperation([" "], .init(faking: .intersection), [" ", " "]))
635+
)
636+
parseTest(
637+
"(?x)[ a && b ]", changeMatchingOptions(
638+
matchingOptions(adding: .extended), isIsolated: true, charClass(
639+
.setOperation(["a"], .init(faking: .intersection), ["b"]))
640+
))
641+
616642
// MARK: Quotes
617643

618644
parseTest(
@@ -2205,6 +2231,9 @@ extension RegexTests {
22052231
diagnosticTest(")))", .unbalancedEndOfGroup)
22062232
diagnosticTest("())()", .unbalancedEndOfGroup)
22072233

2234+
diagnosticTest("[", .expectedCustomCharacterClassMembers)
2235+
diagnosticTest("[^", .expectedCustomCharacterClassMembers)
2236+
22082237
diagnosticTest(#"\u{5"#, .expected("}"))
22092238
diagnosticTest(#"\x{5"#, .expected("}"))
22102239
diagnosticTest(#"\N{A"#, .expected("}"))
@@ -2245,9 +2274,21 @@ extension RegexTests {
22452274
diagnosticTest("(?<a-b", .expected(">"))
22462275
diagnosticTest("(?<a-b>", .expected(")"))
22472276

2248-
// The first ']' of a custom character class is literal, so this is missing
2249-
// the closing bracket.
2277+
// MARK: Character classes
2278+
2279+
diagnosticTest("[a", .expected("]"))
2280+
2281+
// The first ']' of a custom character class is literal, so these are
2282+
// missing the closing bracket.
22502283
diagnosticTest("[]", .expected("]"))
2284+
diagnosticTest("(?x)[ ]", .expected("]"))
2285+
2286+
diagnosticTest("[&&]", .expectedCustomCharacterClassMembers)
2287+
diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)
2288+
diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers)
2289+
diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers)
2290+
diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers)
2291+
diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers)
22512292

22522293
diagnosticTest("[:a", .expected("]"))
22532294
diagnosticTest("[:a:", .expected("]"))

0 commit comments

Comments
 (0)