Skip to content

Fix trivia in custom character classes #278

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Sources/_RegexParser/Regex/AST/CustomCharClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,18 @@ extension CustomCC.Member {
if case .trivia = self { return true }
return false
}

public var isSemantic: Bool {
!isTrivia
}
}

extension AST.CustomCharacterClass {
/// Strip trivia from the character class members. This does not recurse into
/// nested custom character classes.
public var strippingTriviaShallow: Self {
var copy = self
copy.members = copy.members.filter { !$0.isTrivia }
copy.members = copy.members.filter(\.isSemantic)
return copy
}
}
29 changes: 17 additions & 12 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,18 @@ extension Parser {
defer { context.isInCustomCharacterClass = alreadyInCCC }

typealias Member = CustomCC.Member
try source.expectNonEmpty()

var members: Array<Member> = []
try parseCCCMembers(into: &members)

// We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty
// character classes, and assume an initial ']' is literal.
if let loc = source.tryEatWithLoc("]") {
members.append(.atom(.init(.char("]"), loc)))
// If we didn't parse any semantic members, we can eat a ']' character, as
// PCRE, Oniguruma, and ICU forbid empty character classes, and assume an
// initial ']' is literal.
if members.none(\.isSemantic) {
if let loc = source.tryEatWithLoc("]") {
members.append(.atom(.init(.char("]"), loc)))
try parseCCCMembers(into: &members)
}
}
try parseCCCMembers(into: &members)

// If we have a binary set operator, parse it and the next members. Note
// that this means we left associate for a chain of operators.
Expand All @@ -458,8 +460,9 @@ extension Parser {
var rhs: Array<Member> = []
try parseCCCMembers(into: &rhs)

if members.isEmpty || rhs.isEmpty {
throw ParseError.expectedCustomCharacterClassMembers
if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
throw Source.LocatedError(
ParseError.expectedCustomCharacterClassMembers, start.location)
}

// If we're done, bail early
Expand All @@ -472,8 +475,9 @@ extension Parser {
// Otherwise it's just another member to accumulate
members = [setOp]
}
if members.isEmpty {
throw ParseError.expectedCustomCharacterClassMembers
if members.none(\.isSemantic) {
throw Source.LocatedError(
ParseError.expectedCustomCharacterClassMembers, start.location)
}
try source.expect("]")
return CustomCC(start, members, loc(start.location.start))
Expand All @@ -484,7 +488,8 @@ extension Parser {
) throws {
// Parse members until we see the end of the custom char class or an
// operator.
while source.peek() != "]" && source.peekCCBinOp() == nil {
while !source.isEmpty && source.peek() != "]" &&
source.peekCCBinOp() == nil {

// Nested custom character class.
if let cccStart = try source.lexCustomCCStart() {
Expand Down
4 changes: 3 additions & 1 deletion Sources/_RegexParser/Regex/Printing/DumpAST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable {
case .quote(let q): return "\(q)"
case .trivia(let t): return "\(t)"
case .setOperation(let lhs, let op, let rhs):
return "op \(lhs) \(op.value) \(rhs)"
// TODO: We should eventually have some way of filtering out trivia for
// tests, so that it can appear in regular dumps.
return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))"
}
}
}
Expand Down
5 changes: 1 addition & 4 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -287,11 +287,8 @@ extension DSLTree.CustomCharacterClass.Member {
}
case .trivia:
// TODO: Should probably strip this earlier...
return { _, bounds in
return bounds.lowerBound
}
return { _, _ in nil }
}

}
}

Expand Down
13 changes: 13 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1208,6 +1208,19 @@ extension RegexTests {
("CaFe", true),
("EfAc", true))
}

func testNonSemanticWhitespace() {
firstMatchTest(#" \t "#, input: " \t ", match: " \t ")
firstMatchTest(#"(?xx) \t "#, input: " \t ", match: "\t")

firstMatchTest(#"[ \t]+"#, input: " \t ", match: " \t ")
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t")
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t")
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t")

firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa")
firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a")
Comment on lines +1213 to +1222
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍🏻

}

func testASCIIClasses() {
// 'D' ASCII-only digits
Expand Down
49 changes: 45 additions & 4 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -460,9 +460,25 @@ extension RegexTests {

parseTest("[-]", charClass("-"))

// Empty character classes are forbidden, therefore this is a character
// class of literal ']'.
// Empty character classes are forbidden, therefore these are character
// classes containing literal ']'.
parseTest("[]]", charClass("]"))
parseTest("[]a]", charClass("]", "a"))
parseTest(
"(?x)[ ]]", changeMatchingOptions(
matchingOptions(adding: .extended), isIsolated: true,
charClass("]"))
)
parseTest(
"(?x)[ ] ]", changeMatchingOptions(
matchingOptions(adding: .extended), isIsolated: true,
charClass("]"))
)
parseTest(
"(?x)[ ] a ]", changeMatchingOptions(
matchingOptions(adding: .extended), isIsolated: true,
charClass("]", "a"))
)

// These are metacharacters in certain contexts, but normal characters
// otherwise.
Expand Down Expand Up @@ -613,6 +629,16 @@ extension RegexTests {
parseTest(
"~~*", concat("~", zeroOrMore(of: "~")))

parseTest(
"[ && ]", charClass(
.setOperation([" "], .init(faking: .intersection), [" ", " "]))
)
parseTest(
"(?x)[ a && b ]", changeMatchingOptions(
matchingOptions(adding: .extended), isIsolated: true, charClass(
.setOperation(["a"], .init(faking: .intersection), ["b"]))
))

// MARK: Quotes

parseTest(
Expand Down Expand Up @@ -2205,6 +2231,9 @@ extension RegexTests {
diagnosticTest(")))", .unbalancedEndOfGroup)
diagnosticTest("())()", .unbalancedEndOfGroup)

diagnosticTest("[", .expectedCustomCharacterClassMembers)
diagnosticTest("[^", .expectedCustomCharacterClassMembers)

diagnosticTest(#"\u{5"#, .expected("}"))
diagnosticTest(#"\x{5"#, .expected("}"))
diagnosticTest(#"\N{A"#, .expected("}"))
Expand Down Expand Up @@ -2245,9 +2274,21 @@ extension RegexTests {
diagnosticTest("(?<a-b", .expected(">"))
diagnosticTest("(?<a-b>", .expected(")"))

// The first ']' of a custom character class is literal, so this is missing
// the closing bracket.
// MARK: Character classes

diagnosticTest("[a", .expected("]"))

// The first ']' of a custom character class is literal, so these are
// missing the closing bracket.
diagnosticTest("[]", .expected("]"))
diagnosticTest("(?x)[ ]", .expected("]"))

diagnosticTest("[&&]", .expectedCustomCharacterClassMembers)
diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)
diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers)
diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers)
diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers)
diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers)

diagnosticTest("[:a", .expected("]"))
diagnosticTest("[:a:", .expected("]"))
Expand Down