Skip to content

Commit 4290d8e

Browse files
authored
Merge pull request #592 from hamishknight/main-merge
2 parents 34d057d + 1e76d29 commit 4290d8e

20 files changed

+1413
-253
lines changed

Package.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,17 @@ let package = Package(
7575
name: "RegexBuilder",
7676
dependencies: ["_StringProcessing", "_RegexParser"],
7777
swiftSettings: publicStdlibSettings),
78+
.target(name: "TestSupport",
79+
swiftSettings: [availabilityDefinition]),
7880
.testTarget(
7981
name: "RegexTests",
80-
dependencies: ["_StringProcessing"],
82+
dependencies: ["_StringProcessing", "TestSupport"],
8183
swiftSettings: [
8284
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
8385
]),
8486
.testTarget(
8587
name: "RegexBuilderTests",
86-
dependencies: ["_StringProcessing", "RegexBuilder"],
88+
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
8789
swiftSettings: [
8890
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
8991
]),

Sources/TestSupport/TestSupport.swift

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import XCTest
13+
14+
// We need to split this out of the test files, as it needs to be compiled
15+
// *without* `-disable-availability-checking` to ensure the #available check is
16+
// not compiled into a no-op.
17+
18+
#if os(Linux)
19+
public func XCTExpectFailure(
20+
_ message: String? = nil, body: () throws -> Void
21+
) rethrows {}
22+
#endif
23+
24+
/// Guards certain tests to make sure we have a new stdlib available.
25+
public func ensureNewStdlib(
26+
file: StaticString = #file, line: UInt = #line
27+
) -> Bool {
28+
guard #available(SwiftStdlib 5.7, *) else {
29+
XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) }
30+
return false
31+
}
32+
return true
33+
}

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -755,8 +755,10 @@ extension AST.Atom {
755755
/// Whether this atom is valid as the operand of a custom character class
756756
/// range.
757757
public var isValidCharacterClassRangeBound: Bool {
758-
// If we have a literal character value for this, it can be used as a bound.
759-
if literalCharacterValue != nil { return true }
758+
if let c = literalCharacterValue {
759+
// We only match character range bounds that are a single scalar in NFC.
760+
return c.hasExactlyOneScalar && c.isNFC
761+
}
760762
switch kind {
761763
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
762764
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -480,35 +480,37 @@ extension Parser {
480480
///
481481
mutating func lexQuantifier(
482482
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
483-
var trivia: [AST.Trivia] = []
483+
tryEating { p in
484+
var trivia: [AST.Trivia] = []
484485

485-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
486+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
486487

487-
let amt: Located<Quant.Amount>? = recordLoc { p in
488-
if p.tryEat("*") { return .zeroOrMore }
489-
if p.tryEat("+") { return .oneOrMore }
490-
if p.tryEat("?") { return .zeroOrOne }
488+
let amt: Located<Quant.Amount>? = p.recordLoc { p in
489+
if p.tryEat("*") { return .zeroOrMore }
490+
if p.tryEat("+") { return .oneOrMore }
491+
if p.tryEat("?") { return .zeroOrOne }
491492

492-
return p.tryEating { p in
493-
guard p.tryEat("{"),
494-
let range = p.lexRange(trivia: &trivia),
495-
p.tryEat("}")
496-
else { return nil }
497-
return range.value
493+
return p.tryEating { p in
494+
guard p.tryEat("{"),
495+
let range = p.lexRange(trivia: &trivia),
496+
p.tryEat("}")
497+
else { return nil }
498+
return range.value
499+
}
498500
}
499-
}
500-
guard let amt = amt else { return nil }
501+
guard let amt = amt else { return nil }
501502

502-
// PCRE allows non-semantic whitespace here in extended syntax mode.
503-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
503+
// PCRE allows non-semantic whitespace here in extended syntax mode.
504+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
504505

505-
let kind: Located<Quant.Kind> = recordLoc { p in
506-
if p.tryEat("?") { return .reluctant }
507-
if p.tryEat("+") { return .possessive }
508-
return .eager
509-
}
506+
let kind: Located<Quant.Kind> = p.recordLoc { p in
507+
if p.tryEat("?") { return .reluctant }
508+
if p.tryEat("+") { return .possessive }
509+
return .eager
510+
}
510511

511-
return (amt, kind, trivia)
512+
return (amt, kind, trivia)
513+
}
512514
}
513515

514516
/// Try to consume a range, returning `nil` if unsuccessful.

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
let scalars = unicodeScalars
26+
return scalars.index(after: scalars.startIndex) == scalars.endIndex
27+
}
28+
29+
/// Whether the given character is in NFC form.
30+
internal var isNFC: Bool {
31+
if isASCII { return true }
32+
let str = String(self)
33+
return str._nfcCodeUnits.elementsEqual(str.utf8)
34+
}
35+
}
36+
2237
extension CustomStringConvertible {
2338
@_alwaysEmitIntoClient
2439
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 162 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
775775
builder.label(exit)
776776
}
777777

778+
/// Coalesce any adjacent scalar members in a custom character class together.
779+
/// This is required in order to produce correct grapheme matching behavior.
780+
func coalescingCustomCharacterClassMembers(
781+
_ members: [DSLTree.CustomCharacterClass.Member]
782+
) -> [DSLTree.CustomCharacterClass.Member] {
783+
struct Accumulator {
784+
/// A series of range operands. For example, in `[ab-cde-fg]`, this will
785+
/// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
786+
/// ranges will be created.
787+
private var rangeOperands: [String] = [""]
788+
789+
/// The current range operand.
790+
private var current: String {
791+
_read { yield rangeOperands[rangeOperands.count - 1] }
792+
_modify { yield &rangeOperands[rangeOperands.count - 1] }
793+
}
794+
795+
/// Try to accumulate a character class member, returning `true` if
796+
/// successful, `false` otherwise.
797+
mutating func tryAccumulate(
798+
_ member: DSLTree.CustomCharacterClass.Member
799+
) -> Bool {
800+
switch member {
801+
case .atom(let a):
802+
guard let c = a.literalCharacterValue else { return false }
803+
current.append(c)
804+
return true
805+
case .quotedLiteral(let str):
806+
current += str
807+
return true
808+
case let .range(lhs, rhs):
809+
guard let lhs = lhs.literalCharacterValue,
810+
let rhs = rhs.literalCharacterValue
811+
else { return false }
812+
current.append(lhs)
813+
rangeOperands.append(String(rhs))
814+
return true
815+
case .trivia:
816+
// Trivia can be completely ignored if we've already coalesced
817+
// something.
818+
return !current.isEmpty
819+
default:
820+
return false
821+
}
822+
}
823+
824+
func finish() -> [DSLTree.CustomCharacterClass.Member] {
825+
if rangeOperands.count == 1 {
826+
// If we didn't have any additional range operands, this isn't a
827+
// range, we can just form a standard quoted literal.
828+
return [.quotedLiteral(current)]
829+
}
830+
var members = [DSLTree.CustomCharacterClass.Member]()
831+
832+
// We have other range operands, splice them together. For N operands
833+
// we have N - 1 ranges.
834+
for (i, lhs) in rangeOperands.dropLast().enumerated() {
835+
let rhs = rangeOperands[i + 1]
836+
837+
// If this is the first operand we only need to drop the last
838+
// character for its quoted members, otherwise this is both an LHS
839+
// and RHS of a range, and as such needs both sides trimmed.
840+
let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
841+
if !leading.isEmpty {
842+
members.append(.quotedLiteral(String(leading)))
843+
}
844+
members.append(.range(.char(lhs.last!), .char(rhs.first!)))
845+
}
846+
// We've handled everything except the quoted portion of the last
847+
// operand, add it now.
848+
let trailing = rangeOperands.last!.dropFirst()
849+
if !trailing.isEmpty {
850+
members.append(.quotedLiteral(String(trailing)))
851+
}
852+
return members
853+
}
854+
}
855+
return members
856+
.map { m -> DSLTree.CustomCharacterClass.Member in
857+
// First we need to recursively coalsce any child character classes.
858+
switch m {
859+
case .custom(let ccc):
860+
return .custom(coalescingCustomCharacterClass(ccc))
861+
case .intersection(let lhs, let rhs):
862+
return .intersection(
863+
coalescingCustomCharacterClass(lhs),
864+
coalescingCustomCharacterClass(rhs))
865+
case .subtraction(let lhs, let rhs):
866+
return .subtraction(
867+
coalescingCustomCharacterClass(lhs),
868+
coalescingCustomCharacterClass(rhs))
869+
case .symmetricDifference(let lhs, let rhs):
870+
return .symmetricDifference(
871+
coalescingCustomCharacterClass(lhs),
872+
coalescingCustomCharacterClass(rhs))
873+
case .atom, .range, .quotedLiteral, .trivia:
874+
return m
875+
}
876+
}
877+
.coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
878+
accum.tryAccumulate(member)
879+
}
880+
}
881+
882+
func coalescingCustomCharacterClass(
883+
_ ccc: DSLTree.CustomCharacterClass
884+
) -> DSLTree.CustomCharacterClass {
885+
// This only needs to be done in grapheme semantic mode. In scalar semantic
886+
// mode, we don't want to coalesce any scalars into a grapheme. This
887+
// means that e.g. `[e\u{301}-\u{302}]` remains a range between U+301 and
888+
// U+302.
889+
guard options.semanticLevel == .graphemeCluster else { return ccc }
890+
891+
let members = coalescingCustomCharacterClassMembers(ccc.members)
892+
return .init(members: members, isInverted: ccc.isInverted)
893+
}
894+
778895
mutating func emitCustomCharacterClass(
779896
_ ccc: DSLTree.CustomCharacterClass
780897
) throws {
898+
// Before emitting a custom character class in grapheme semantic mode, we
899+
// need to coalesce together any adjacent characters and scalars, over which
900+
// we can perform grapheme breaking. This includes e.g. range bounds for
901+
// `[e\u{301}-\u{302}]`.
902+
let ccc = coalescingCustomCharacterClass(ccc)
781903
if let asciiBitset = ccc.asAsciiBitset(options),
782904
optimizationsEnabled {
783905
if options.semanticLevel == .unicodeScalar {
@@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
791913
}
792914
}
793915

916+
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
917+
// Before emitting a concatenation, we need to flatten out any nested
918+
// concatenations, and coalesce any adjacent characters and scalars, forming
919+
// quoted literals of their contents, over which we can perform grapheme
920+
// breaking.
921+
func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
922+
switch node {
923+
case .concatenation(let ch):
924+
return ch.flatMap(flatten)
925+
case .convertedRegexLiteral(let n, _):
926+
return flatten(n)
927+
default:
928+
return [node]
929+
}
930+
}
931+
let children = children
932+
.flatMap(flatten)
933+
.coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
934+
switch node {
935+
case .atom(let a):
936+
guard let c = a.literalCharacterValue else { return false }
937+
str.append(c)
938+
return true
939+
case .quotedLiteral(let q):
940+
str += q
941+
return true
942+
case .trivia:
943+
// Trivia can be completely ignored if we've already coalesced
944+
// something.
945+
return !str.isEmpty
946+
default:
947+
return false
948+
}
949+
}
950+
for child in children {
951+
try emitConcatenationComponent(child)
952+
}
953+
}
954+
794955
@discardableResult
795956
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
796957
switch node {
@@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
799960
try emitAlternation(children)
800961

801962
case let .concatenation(children):
802-
for child in children {
803-
try emitConcatenationComponent(child)
804-
}
963+
try emitConcatenation(children)
805964

806965
case let .capture(name, refId, child, transform):
807966
options.beginScope()

Sources/_StringProcessing/Compiler.swift

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,19 +42,43 @@ class Compiler {
4242
}
4343
}
4444

45+
/// Hashable wrapper for `Any.Type`.
46+
struct AnyHashableType: CustomStringConvertible, Hashable {
47+
var ty: Any.Type
48+
init(_ ty: Any.Type) {
49+
self.ty = ty
50+
}
51+
var description: String { "\(ty)" }
52+
53+
static func == (lhs: Self, rhs: Self) -> Bool {
54+
lhs.ty == rhs.ty
55+
}
56+
func hash(into hasher: inout Hasher) {
57+
hasher.combine(ObjectIdentifier(ty))
58+
}
59+
}
60+
4561
// An error produced when compiling a regular expression.
46-
enum RegexCompilationError: Error, CustomStringConvertible {
62+
enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
4763
// TODO: Source location?
4864
case uncapturedReference
65+
case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)
66+
case invalidCharacterClassRangeOperand(Character)
67+
68+
static func incorrectOutputType(
69+
incorrect: Any.Type, correct: Any.Type
70+
) -> Self {
71+
.incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
72+
}
4973

50-
case incorrectOutputType(incorrect: Any.Type, correct: Any.Type)
51-
5274
var description: String {
5375
switch self {
5476
case .uncapturedReference:
5577
return "Found a reference used before it captured any match."
5678
case .incorrectOutputType(let incorrect, let correct):
5779
return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
80+
case .invalidCharacterClassRangeOperand(let c):
81+
return "'\(c)' is an invalid bound for character class range"
5882
}
5983
}
6084
}

0 commit comments

Comments
 (0)