Skip to content

Commit fad4dd9

Browse files
authored
Merge pull request #272 from hamishknight/posix-quirks
2 parents 0338178 + 657c4a6 commit fad4dd9

File tree

4 files changed

+22
-15
lines changed

4 files changed

+22
-15
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 7 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1064,14 +1064,13 @@ extension Source {
10641064
}
10651065

10661066
mutating func lexCustomCCStart(
1067-
context: ParsingContext
10681067
) throws -> Located<CustomCC.Start>? {
10691068
recordLoc { src in
10701069
// Make sure we don't have a POSIX character property. This may require
10711070
// walking to its ending to make sure we have a closing ':]', as otherwise
10721071
// we have a custom character class.
10731072
// TODO: This behavior seems subtle, could we warn?
1074-
guard !src.canLexPOSIXCharacterProperty(context: context) else {
1073+
guard !src.canLexPOSIXCharacterProperty() else {
10751074
return nil
10761075
}
10771076
if src.tryEat("[") {
@@ -1104,11 +1103,8 @@ extension Source {
11041103
}
11051104

11061105
private mutating func lexPOSIXCharacterProperty(
1107-
context: ParsingContext
11081106
) throws -> Located<AST.Atom.CharacterProperty>? {
1109-
// Only allowed in a custom character class.
1110-
guard context.isInCustomCharacterClass else { return nil }
1111-
return try recordLoc { src in
1107+
try recordLoc { src in
11121108
try src.tryEating { src in
11131109
guard src.tryEat(sequence: "[:") else { return nil }
11141110
let inverted = src.tryEat("^")
@@ -1127,10 +1123,10 @@ extension Source {
11271123
}
11281124
}
11291125

1130-
private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool {
1126+
private func canLexPOSIXCharacterProperty() -> Bool {
11311127
do {
11321128
var src = self
1133-
return try src.lexPOSIXCharacterProperty(context: context) != nil
1129+
return try src.lexPOSIXCharacterProperty() != nil
11341130
} catch {
11351131
// We want to err on the side of lexing a POSIX character property, so
11361132
// even if it is invalid in some way (e.g. invalid property names), still
@@ -1818,8 +1814,9 @@ extension Source {
18181814
if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil }
18191815
// TODO: Store customCC in the atom, if that's useful
18201816

1821-
// POSIX character property.
1822-
if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value {
1817+
// POSIX character property. Like \p{...} this is also allowed outside of
1818+
// a custom character class.
1819+
if let prop = try src.lexPOSIXCharacterProperty()?.value {
18231820
return .property(prop)
18241821
}
18251822

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -403,7 +403,7 @@ extension Parser {
403403
}
404404

405405
// Check if we have the start of a custom character class '['.
406-
if let cccStart = try source.lexCustomCCStart(context: context) {
406+
if let cccStart = try source.lexCustomCCStart() {
407407
return .customCharacterClass(
408408
try parseCustomCharacterClass(cccStart))
409409
}
@@ -487,7 +487,7 @@ extension Parser {
487487
while source.peek() != "]" && source.peekCCBinOp() == nil {
488488

489489
// Nested custom character class.
490-
if let cccStart = try source.lexCustomCCStart(context: context) {
490+
if let cccStart = try source.lexCustomCCStart() {
491491
members.append(.custom(try parseCustomCharacterClass(cccStart)))
492492
continue
493493
}

Sources/_StringProcessing/Utility/ASTBuilder.swift

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -354,6 +354,11 @@ func prop(
354354
) -> AST.Node {
355355
atom(.property(.init(kind, isInverted: inverted, isPOSIX: false)))
356356
}
357+
func posixProp(
358+
_ kind: AST.Atom.CharacterProperty.Kind, inverted: Bool = false
359+
) -> AST.Node {
360+
atom(.property(.init(kind, isInverted: inverted, isPOSIX: true)))
361+
}
357362

358363
// Raw atom constructing variant
359364
func atom_a(

Tests/RegexTests/ParseTests.swift

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -477,12 +477,9 @@ extension RegexTests {
477477
// These are custom character classes, not invalid POSIX character classes.
478478
// TODO: This behavior is subtle, we ought to warn.
479479
parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e")))
480-
parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":"))
481480
parseTest("[:a]", charClass(":", "a"))
482481
parseTest("[a:]", charClass("a", ":"))
483482
parseTest("[:]", charClass(":"))
484-
parseTest("[::]", charClass(":", ":"))
485-
parseTest("[:=:]", charClass(":", "=", ":"))
486483
parseTest("[[:]]", charClass(charClass(":")))
487484
parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":")))
488485

@@ -522,6 +519,12 @@ extension RegexTests {
522519
posixProp_m(.binary(.uppercase), inverted: true),
523520
"c", "d"))
524521

522+
// Like ICU, we allow POSIX character properties outside of custom character
523+
// classes. This also appears to be suggested by UTS#18.
524+
// TODO: We should likely emit a warning.
525+
parseTest("[:space:]", posixProp(.binary(.whitespace)))
526+
parseTest("[:script=Greek:]", posixProp(.script(.greek)))
527+
525528
parseTest("[[[:space:]]]", charClass(charClass(
526529
posixProp_m(.binary(.whitespace))
527530
)))
@@ -2252,6 +2255,8 @@ extension RegexTests {
22522255
diagnosticTest("[[:a:", .expected("]"))
22532256
diagnosticTest("[[:a[:]", .expected("]"))
22542257

2258+
diagnosticTest("[::]", .emptyProperty)
2259+
diagnosticTest("[:=:]", .emptyProperty)
22552260
diagnosticTest("[[::]]", .emptyProperty)
22562261
diagnosticTest("[[:=:]]", .emptyProperty)
22572262

0 commit comments

Comments
 (0)