Skip to content

Commit 3595070

Browse files
committed
Allow coalescing through trivia
I also noticed that `lexQuantifier` could silently consume trivia when it failed to lex a quantifier, so this commit fixes that as well by wrapping its body in `tryEating`.
1 parent 35c9d9c commit 3595070

File tree

5 files changed

+104
-22
lines changed

5 files changed

+104
-22
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -480,35 +480,37 @@ extension Parser {
480480
///
481481
mutating func lexQuantifier(
482482
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
483-
var trivia: [AST.Trivia] = []
483+
tryEating { p in
484+
var trivia: [AST.Trivia] = []
484485

485-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
486+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
486487

487-
let amt: Located<Quant.Amount>? = recordLoc { p in
488-
if p.tryEat("*") { return .zeroOrMore }
489-
if p.tryEat("+") { return .oneOrMore }
490-
if p.tryEat("?") { return .zeroOrOne }
488+
let amt: Located<Quant.Amount>? = p.recordLoc { p in
489+
if p.tryEat("*") { return .zeroOrMore }
490+
if p.tryEat("+") { return .oneOrMore }
491+
if p.tryEat("?") { return .zeroOrOne }
491492

492-
return p.tryEating { p in
493-
guard p.tryEat("{"),
494-
let range = p.lexRange(trivia: &trivia),
495-
p.tryEat("}")
496-
else { return nil }
497-
return range.value
493+
return p.tryEating { p in
494+
guard p.tryEat("{"),
495+
let range = p.lexRange(trivia: &trivia),
496+
p.tryEat("}")
497+
else { return nil }
498+
return range.value
499+
}
498500
}
499-
}
500-
guard let amt = amt else { return nil }
501+
guard let amt = amt else { return nil }
501502

502-
// PCRE allows non-semantic whitespace here in extended syntax mode.
503-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
503+
// PCRE allows non-semantic whitespace here in extended syntax mode.
504+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
504505

505-
let kind: Located<Quant.Kind> = recordLoc { p in
506-
if p.tryEat("?") { return .reluctant }
507-
if p.tryEat("+") { return .possessive }
508-
return .eager
509-
}
506+
let kind: Located<Quant.Kind> = p.recordLoc { p in
507+
if p.tryEat("?") { return .reluctant }
508+
if p.tryEat("+") { return .possessive }
509+
return .eager
510+
}
510511

511-
return (amt, kind, trivia)
512+
return (amt, kind, trivia)
513+
}
512514
}
513515

514516
/// Try to consume a range, returning `nil` if unsuccessful.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,10 @@ fileprivate extension Compiler.ByteCodeGen {
812812
current.append(lhs)
813813
rangeOperands.append(String(rhs))
814814
return true
815+
case .trivia:
816+
// Trivia can be completely ignored if we've already coalesced
817+
// something.
818+
return !current.isEmpty
815819
default:
816820
return false
817821
}
@@ -927,6 +931,10 @@ fileprivate extension Compiler.ByteCodeGen {
927931
case .quotedLiteral(let q):
928932
str += q
929933
return true
934+
case .trivia:
935+
// Trivia can be completely ignored if we've already coalesced
936+
// something.
937+
return !str.isEmpty
930938
default:
931939
return false
932940
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,10 @@ extension PrettyPrinter {
304304
case .quotedLiteral(let q):
305305
literal.append(q)
306306
return true
307+
case .trivia:
308+
// Trivia can be completely ignored if we've already coalesced
309+
// something.
310+
return !literal.isEmpty
307311
default:
308312
return false
309313
}

Tests/RegexTests/MatchTests.swift

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,23 @@ extension RegexTests {
345345
input: "e\u{301}0e\u{302}",
346346
match: "e\u{301}0e\u{302}"
347347
)
348+
firstMatchTest(
349+
#"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#,
350+
input: "e\u{301}\u{315}\u{35C}",
351+
match: "e\u{301}\u{315}\u{35C}"
352+
)
353+
firstMatchTest(
354+
#"(?x) e \u{35C} \u{315 301}"#,
355+
input: "e\u{301}\u{315}\u{35C}",
356+
match: "e\u{301}\u{315}\u{35C}"
357+
)
358+
359+
// We don't coalesce across groups.
360+
firstMatchTests(
361+
#"e\u{301}(?:\u{315}\u{35C})?"#,
362+
("e\u{301}", "e\u{301}"),
363+
("e\u{301}\u{315}\u{35C}", nil)
364+
)
348365

349366
// Escape sequences that represent scalar values.
350367
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
@@ -824,6 +841,30 @@ extension RegexTests {
824841
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
825842
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
826843
)
844+
firstMatchTests(
845+
#"(?x) [ e \u{315} \u{301} \u{35C} ]"#,
846+
("e", nil),
847+
("e\u{315}", nil),
848+
("e\u{301}", nil),
849+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
850+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
851+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
852+
)
853+
854+
// We don't coalesce across character classes.
855+
firstMatchTests(
856+
#"e[\u{315}\u{301}\u{35C}]"#,
857+
("e", nil),
858+
("e\u{315}", nil),
859+
("e\u{315}\u{301}", nil),
860+
("e\u{301}\u{315}\u{35C}", nil)
861+
)
862+
firstMatchTests(
863+
#"[e[\u{301}]]"#,
864+
("e", "e"),
865+
("\u{301}", "\u{301}"),
866+
("e\u{301}", nil)
867+
)
827868

828869
firstMatchTests(
829870
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
@@ -982,6 +1023,16 @@ extension RegexTests {
9821023
("e\u{302}", nil),
9831024
("e\u{303}", "e\u{303}"))
9841025

1026+
firstMatchTests(
1027+
#"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#,
1028+
("e", nil),
1029+
("\u{301}", nil),
1030+
("\u{302}", nil),
1031+
("\u{303}", nil),
1032+
("e\u{301}", nil),
1033+
("e\u{302}", nil),
1034+
("e\u{303}", "e\u{303}"))
1035+
9851036
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
9861037

9871038
// These are metacharacters in certain contexts, but normal characters
@@ -2152,6 +2203,11 @@ extension RegexTests {
21522203
#"\u{65 301}"#,
21532204
(eComposed, true),
21542205
(eDecomposed, true))
2206+
2207+
matchTest(
2208+
#"(?x) \u{65} \u{301}"#,
2209+
(eComposed, true),
2210+
(eDecomposed, true))
21552211
}
21562212

21572213
func testCanonicalEquivalenceCharacterClass() throws {

Tests/RegexTests/RenderDSLTests.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,18 @@ extension RenderDSLTests {
177177
}
178178
"""#)
179179

180+
try testConversion(#"(?x) a \u{301}"#, #"""
181+
Regex {
182+
"a\u{301}"
183+
}
184+
"""#)
185+
186+
try testConversion(#"(?x) [ a b c \u{301} ] "#, #"""
187+
Regex {
188+
One(.anyOf("abc\u{301}"))
189+
}
190+
"""#)
191+
180192
try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #"""
181193
Regex {
182194
"👨\u{200D}👨\u{200D}👧\u{200D}👦"

0 commit comments

Comments
 (0)