Skip to content

Commit dc4171f

Browse files
committed
Allow coalescing through trivia
I also noticed that `lexQuantifier` could silently consume trivia when it failed to lex a quantification, so this commit fixes that as well.
1 parent 96adc3c commit dc4171f

File tree

5 files changed

+104
-22
lines changed

5 files changed

+104
-22
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -480,35 +480,37 @@ extension Parser {
480480
///
481481
mutating func lexQuantifier(
482482
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
483-
var trivia: [AST.Trivia] = []
483+
tryEating { p in
484+
var trivia: [AST.Trivia] = []
484485

485-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
486+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
486487

487-
let amt: Located<Quant.Amount>? = recordLoc { p in
488-
if p.tryEat("*") { return .zeroOrMore }
489-
if p.tryEat("+") { return .oneOrMore }
490-
if p.tryEat("?") { return .zeroOrOne }
488+
let amt: Located<Quant.Amount>? = p.recordLoc { p in
489+
if p.tryEat("*") { return .zeroOrMore }
490+
if p.tryEat("+") { return .oneOrMore }
491+
if p.tryEat("?") { return .zeroOrOne }
491492

492-
return p.tryEating { p in
493-
guard p.tryEat("{"),
494-
let range = p.lexRange(trivia: &trivia),
495-
p.tryEat("}")
496-
else { return nil }
497-
return range.value
493+
return p.tryEating { p in
494+
guard p.tryEat("{"),
495+
let range = p.lexRange(trivia: &trivia),
496+
p.tryEat("}")
497+
else { return nil }
498+
return range.value
499+
}
498500
}
499-
}
500-
guard let amt = amt else { return nil }
501+
guard let amt = amt else { return nil }
501502

502-
// PCRE allows non-semantic whitespace here in extended syntax mode.
503-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
503+
// PCRE allows non-semantic whitespace here in extended syntax mode.
504+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
504505

505-
let kind: Located<Quant.Kind> = recordLoc { p in
506-
if p.tryEat("?") { return .reluctant }
507-
if p.tryEat("+") { return .possessive }
508-
return .eager
509-
}
506+
let kind: Located<Quant.Kind> = p.recordLoc { p in
507+
if p.tryEat("?") { return .reluctant }
508+
if p.tryEat("+") { return .possessive }
509+
return .eager
510+
}
510511

511-
return (amt, kind, trivia)
512+
return (amt, kind, trivia)
513+
}
512514
}
513515

514516
/// Try to consume a range, returning `nil` if unsuccessful.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,10 @@ fileprivate extension Compiler.ByteCodeGen {
812812
current.append(lhs)
813813
rangeOperands.append(String(rhs))
814814
return true
815+
case .trivia:
816+
// Trivia can be completely ignored if we've already coalesced
817+
// something.
818+
return !current.isEmpty
815819
default:
816820
return false
817821
}
@@ -935,6 +939,10 @@ fileprivate extension Compiler.ByteCodeGen {
935939
case .quotedLiteral(let q):
936940
str += q
937941
return true
942+
case .trivia:
943+
// Trivia can be completely ignored if we've already coalesced
944+
// something.
945+
return !str.isEmpty
938946
default:
939947
return false
940948
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,10 @@ extension PrettyPrinter {
314314
case .quotedLiteral(let q):
315315
literal.append(q)
316316
return true
317+
case .trivia:
318+
// Trivia can be completely ignored if we've already coalesced
319+
// something.
320+
return !literal.isEmpty
317321
default:
318322
return false
319323
}

Tests/RegexTests/MatchTests.swift

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,23 @@ extension RegexTests {
345345
input: "e\u{301}0e\u{302}",
346346
match: "e\u{301}0e\u{302}"
347347
)
348+
firstMatchTest(
349+
#"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#,
350+
input: "e\u{301}\u{315}\u{35C}",
351+
match: "e\u{301}\u{315}\u{35C}"
352+
)
353+
firstMatchTest(
354+
#"(?x) e \u{35C} \u{315 301}"#,
355+
input: "e\u{301}\u{315}\u{35C}",
356+
match: "e\u{301}\u{315}\u{35C}"
357+
)
358+
359+
// We don't coalesce across groups.
360+
firstMatchTests(
361+
#"e\u{301}(?:\u{315}\u{35C})?"#,
362+
("e\u{301}", "e\u{301}"),
363+
("e\u{301}\u{315}\u{35C}", nil)
364+
)
348365

349366
// Escape sequences that represent scalar values.
350367
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
@@ -824,6 +841,30 @@ extension RegexTests {
824841
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
825842
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
826843
)
844+
firstMatchTests(
845+
#"(?x) [ e \u{315} \u{301} \u{35C} ]"#,
846+
("e", nil),
847+
("e\u{315}", nil),
848+
("e\u{301}", nil),
849+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
850+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
851+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
852+
)
853+
854+
// We don't coalesce across character classes.
855+
firstMatchTests(
856+
#"e[\u{315}\u{301}\u{35C}]"#,
857+
("e", nil),
858+
("e\u{315}", nil),
859+
("e\u{315}\u{301}", nil),
860+
("e\u{301}\u{315}\u{35C}", nil)
861+
)
862+
firstMatchTests(
863+
#"[e[\u{301}]]"#,
864+
("e", "e"),
865+
("\u{301}", "\u{301}"),
866+
("e\u{301}", nil)
867+
)
827868

828869
firstMatchTests(
829870
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
@@ -1012,6 +1053,16 @@ extension RegexTests {
10121053
("e\u{302}", nil),
10131054
("e\u{303}", "e\u{303}"))
10141055

1056+
firstMatchTests(
1057+
#"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#,
1058+
("e", nil),
1059+
("\u{301}", nil),
1060+
("\u{302}", nil),
1061+
("\u{303}", nil),
1062+
("e\u{301}", nil),
1063+
("e\u{302}", nil),
1064+
("e\u{303}", "e\u{303}"))
1065+
10151066
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
10161067

10171068
// These are metacharacters in certain contexts, but normal characters
@@ -2182,6 +2233,11 @@ extension RegexTests {
21822233
#"\u{65 301}"#,
21832234
(eComposed, true),
21842235
(eDecomposed, true))
2236+
2237+
matchTest(
2238+
#"(?x) \u{65} \u{301}"#,
2239+
(eComposed, true),
2240+
(eDecomposed, true))
21852241
}
21862242

21872243
func testCanonicalEquivalenceCharacterClass() throws {

Tests/RegexTests/RenderDSLTests.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,18 @@ extension RenderDSLTests {
203203
}
204204
"""#)
205205

206+
try testConversion(#"(?x) a \u{301}"#, #"""
207+
Regex {
208+
"a\u{301}"
209+
}
210+
"""#)
211+
212+
try testConversion(#"(?x) [ a b c \u{301} ] "#, #"""
213+
Regex {
214+
One(.anyOf("abc\u{301}"))
215+
}
216+
"""#)
217+
206218
try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #"""
207219
Regex {
208220
"👨\u{200D}👨\u{200D}👧\u{200D}👦"

0 commit comments

Comments
 (0)