Skip to content

Commit 7dbe453

Browse files
committed
Allow coalescing through trivia
I also noticed that `lexQuantifier` could silently consume trivia when it failed to lex a quantifier, so this commit fixes that as well.
1 parent f64d020 commit 7dbe453

File tree

5 files changed

+104
-22
lines changed

5 files changed

+104
-22
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 24 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -480,35 +480,37 @@ extension Parser {
480480
///
481481
mutating func lexQuantifier(
482482
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
483-
var trivia: [AST.Trivia] = []
483+
tryEating { p in
484+
var trivia: [AST.Trivia] = []
484485

485-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
486+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
486487

487-
let amt: Located<Quant.Amount>? = recordLoc { p in
488-
if p.tryEat("*") { return .zeroOrMore }
489-
if p.tryEat("+") { return .oneOrMore }
490-
if p.tryEat("?") { return .zeroOrOne }
488+
let amt: Located<Quant.Amount>? = p.recordLoc { p in
489+
if p.tryEat("*") { return .zeroOrMore }
490+
if p.tryEat("+") { return .oneOrMore }
491+
if p.tryEat("?") { return .zeroOrOne }
491492

492-
return p.tryEating { p in
493-
guard p.tryEat("{"),
494-
let range = p.lexRange(trivia: &trivia),
495-
p.tryEat("}")
496-
else { return nil }
497-
return range.value
493+
return p.tryEating { p in
494+
guard p.tryEat("{"),
495+
let range = p.lexRange(trivia: &trivia),
496+
p.tryEat("}")
497+
else { return nil }
498+
return range.value
499+
}
498500
}
499-
}
500-
guard let amt = amt else { return nil }
501+
guard let amt = amt else { return nil }
501502

502-
// PCRE allows non-semantic whitespace here in extended syntax mode.
503-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
503+
// PCRE allows non-semantic whitespace here in extended syntax mode.
504+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
504505

505-
let kind: Located<Quant.Kind> = recordLoc { p in
506-
if p.tryEat("?") { return .reluctant }
507-
if p.tryEat("+") { return .possessive }
508-
return .eager
509-
}
506+
let kind: Located<Quant.Kind> = p.recordLoc { p in
507+
if p.tryEat("?") { return .reluctant }
508+
if p.tryEat("+") { return .possessive }
509+
return .eager
510+
}
510511

511-
return (amt, kind, trivia)
512+
return (amt, kind, trivia)
513+
}
512514
}
513515

514516
/// Try to consume a range, returning `nil` if unsuccessful.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -812,6 +812,10 @@ fileprivate extension Compiler.ByteCodeGen {
812812
current.append(lhs)
813813
rangeOperands.append(String(rhs))
814814
return true
815+
case .trivia:
816+
// Trivia can be completely ignored if we've already coalesced
817+
// something.
818+
return !current.isEmpty
815819
default:
816820
return false
817821
}
@@ -935,6 +939,10 @@ fileprivate extension Compiler.ByteCodeGen {
935939
case .quotedLiteral(let q):
936940
str += q
937941
return true
942+
case .trivia:
943+
// Trivia can be completely ignored if we've already coalesced
944+
// something.
945+
return !str.isEmpty
938946
default:
939947
return false
940948
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -304,6 +304,10 @@ extension PrettyPrinter {
304304
case .quotedLiteral(let q):
305305
literal.append(q)
306306
return true
307+
case .trivia:
308+
// Trivia can be completely ignored if we've already coalesced
309+
// something.
310+
return !literal.isEmpty
307311
default:
308312
return false
309313
}

Tests/RegexTests/MatchTests.swift

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -345,6 +345,23 @@ extension RegexTests {
345345
input: "e\u{301}0e\u{302}",
346346
match: "e\u{301}0e\u{302}"
347347
)
348+
firstMatchTest(
349+
#"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#,
350+
input: "e\u{301}\u{315}\u{35C}",
351+
match: "e\u{301}\u{315}\u{35C}"
352+
)
353+
firstMatchTest(
354+
#"(?x) e \u{35C} \u{315 301}"#,
355+
input: "e\u{301}\u{315}\u{35C}",
356+
match: "e\u{301}\u{315}\u{35C}"
357+
)
358+
359+
// We don't coalesce across groups.
360+
firstMatchTests(
361+
#"e\u{301}(?:\u{315}\u{35C})?"#,
362+
("e\u{301}", "e\u{301}"),
363+
("e\u{301}\u{315}\u{35C}", nil)
364+
)
348365

349366
// Escape sequences that represent scalar values.
350367
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
@@ -833,6 +850,30 @@ extension RegexTests {
833850
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
834851
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
835852
)
853+
firstMatchTests(
854+
#"(?x) [ e \u{315} \u{301} \u{35C} ]"#,
855+
("e", nil),
856+
("e\u{315}", nil),
857+
("e\u{301}", nil),
858+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
859+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
860+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
861+
)
862+
863+
// We don't coalesce across character classes.
864+
firstMatchTests(
865+
#"e[\u{315}\u{301}\u{35C}]"#,
866+
("e", nil),
867+
("e\u{315}", nil),
868+
("e\u{315}\u{301}", nil),
869+
("e\u{301}\u{315}\u{35C}", nil)
870+
)
871+
firstMatchTests(
872+
#"[e[\u{301}]]"#,
873+
("e", "e"),
874+
("\u{301}", "\u{301}"),
875+
("e\u{301}", nil)
876+
)
836877

837878
firstMatchTests(
838879
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
@@ -1021,6 +1062,16 @@ extension RegexTests {
10211062
("e\u{302}", nil),
10221063
("e\u{303}", "e\u{303}"))
10231064

1065+
firstMatchTests(
1066+
#"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#,
1067+
("e", nil),
1068+
("\u{301}", nil),
1069+
("\u{302}", nil),
1070+
("\u{303}", nil),
1071+
("e\u{301}", nil),
1072+
("e\u{302}", nil),
1073+
("e\u{303}", "e\u{303}"))
1074+
10241075
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
10251076

10261077
// These are metacharacters in certain contexts, but normal characters
@@ -2191,6 +2242,11 @@ extension RegexTests {
21912242
#"\u{65 301}"#,
21922243
(eComposed, true),
21932244
(eDecomposed, true))
2245+
2246+
matchTest(
2247+
#"(?x) \u{65} \u{301}"#,
2248+
(eComposed, true),
2249+
(eDecomposed, true))
21942250
}
21952251

21962252
func testCanonicalEquivalenceCharacterClass() throws {

Tests/RegexTests/RenderDSLTests.swift

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -177,6 +177,18 @@ extension RenderDSLTests {
177177
}
178178
"""#)
179179

180+
try testConversion(#"(?x) a \u{301}"#, #"""
181+
Regex {
182+
"a\u{301}"
183+
}
184+
"""#)
185+
186+
try testConversion(#"(?x) [ a b c \u{301} ] "#, #"""
187+
Regex {
188+
One(.anyOf("abc\u{301}"))
189+
}
190+
"""#)
191+
180192
try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #"""
181193
Regex {
182194
"👨\u{200D}👨\u{200D}👧\u{200D}👦"

0 commit comments

Comments
 (0)