Skip to content

Commit 6f76f36

Browse files
committed
Merge branch 'main' into match-scalar
2 parents eeb38e9 + 711c6e3 commit 6f76f36

20 files changed

+649
-83
lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ extension AST {
6262
self.rhs = rhs
6363
self.trivia = trivia
6464
}
65+
66+
public var location: SourceLocation {
67+
lhs.location.union(with: rhs.location)
68+
}
6569
}
6670
public enum SetOp: String, Hashable {
6771
case subtraction = "--"
@@ -108,6 +112,25 @@ extension CustomCC.Member {
108112
public var isSemantic: Bool {
109113
!isTrivia
110114
}
115+
116+
public var location: SourceLocation {
117+
switch self {
118+
case let .custom(c): return c.location
119+
case let .range(r): return r.location
120+
case let .atom(a): return a.location
121+
case let .quote(q): return q.location
122+
case let .trivia(t): return t.location
123+
case let .setOperation(lhs, dash, rhs):
124+
var loc = dash.location
125+
if let lhs = lhs.first {
126+
loc = loc.union(with: lhs.location)
127+
}
128+
if let rhs = rhs.last {
129+
loc = loc.union(with: rhs.location)
130+
}
131+
return loc
132+
}
133+
}
111134
}
112135

113136
extension AST.CustomCharacterClass {

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ enum ParseError: Error, Hashable {
9494
case invalidNamedReference(String)
9595
case duplicateNamedCapture(String)
9696
case invalidCharacterClassRangeOperand
97+
case unsupportedDotNetSubtraction
9798
case invalidQuantifierRange(Int, Int)
9899
case invalidCharacterRange(from: Character, to: Character)
99100
case notQuantifiable
@@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible {
174175
case .expectedCustomCharacterClassMembers:
175176
return "expected custom character class members"
176177
case .invalidCharacterClassRangeOperand:
177-
return "invalid character class range"
178+
return "invalid bound for character class range"
179+
case .unsupportedDotNetSubtraction:
180+
return "subtraction with '-' is unsupported; use '--' instead"
178181
case .emptyProperty:
179182
return "empty property"
180183
case .unknownProperty(let key, let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,25 @@ extension Source {
12451245
return nil
12461246
}
12471247

1248+
/// Check to see if we can lex a .NET subtraction. Returns the
1249+
/// location of the `-`.
1250+
///
1251+
/// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass
1252+
///
1253+
func canLexDotNetCharClassSubtraction(
1254+
context: ParsingContext
1255+
) -> SourceLocation? {
1256+
lookahead { src in
1257+
// We can lex '-' as a .NET subtraction if it precedes a custom character
1258+
// class.
1259+
while (try? src.lexTrivia(context: context)) != nil {}
1260+
guard let dashLoc = src.tryEatWithLoc("-") else { return nil }
1261+
while (try? src.lexTrivia(context: context)) != nil {}
1262+
guard src.lexCustomCCStart() != nil else { return nil }
1263+
return dashLoc
1264+
}
1265+
}
1266+
12481267
private mutating func lexPOSIXCharacterProperty(
12491268
) throws -> Located<AST.Atom.CharacterProperty>? {
12501269
try recordLoc { src in

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 83 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,12 @@ extension Parser {
502502
var members: Array<Member> = []
503503
try parseCCCMembers(into: &members)
504504

505+
// Make sure we have at least one semantic member.
506+
if members.none(\.isSemantic) {
507+
throw Source.LocatedError(
508+
ParseError.expectedCustomCharacterClassMembers, start.location)
509+
}
510+
505511
// If we have a binary set operator, parse it and the next members. Note
506512
// that this means we left associate for a chain of operators.
507513
// TODO: We may want to diagnose and require users to disambiguate, at least
@@ -511,16 +517,12 @@ extension Parser {
511517
var rhs: Array<Member> = []
512518
try parseCCCMembers(into: &rhs)
513519

514-
if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
520+
if rhs.none(\.isSemantic) {
515521
throw Source.LocatedError(
516522
ParseError.expectedCustomCharacterClassMembers, start.location)
517523
}
518524
members = [.setOperation(members, binOp, rhs)]
519525
}
520-
if members.none(\.isSemantic) {
521-
throw Source.LocatedError(
522-
ParseError.expectedCustomCharacterClassMembers, start.location)
523-
}
524526
try source.expect("]")
525527
return CustomCC(start, members, loc(start.location.start))
526528
}
@@ -550,48 +552,88 @@ extension Parser {
550552
return nil
551553
}
552554

553-
mutating func parseCCCMembers(
554-
into members: inout Array<CustomCC.Member>
555+
/// Attempt to parse a custom character class range into `members`, or regular
556+
/// members if a range cannot be formed.
557+
mutating func parsePotentialCCRange(
558+
into members: inout [CustomCC.Member]
555559
) throws {
556-
// Parse members until we see the end of the custom char class or an
557-
// operator.
558-
while let member = try parseCCCMember() {
559-
members.append(member)
560-
561-
// If we have an atom, we can try to parse a character class range. Each
562-
// time we parse a component of the range, we append to `members` in case
563-
// it ends up not being a range, and we bail. If we succeed in parsing, we
564-
// remove the intermediate members.
565-
if case .atom(let lhs) = member {
566-
let membersBeforeRange = members.count - 1
567-
568-
while let t = try source.lexTrivia(context: context) {
569-
members.append(.trivia(t))
570-
}
560+
guard let lhs = members.last, lhs.isSemantic else { return }
561+
562+
// Try and see if we can parse a character class range. Each time we parse
563+
// a component of the range, we append to `members` in case it ends up not
564+
// being a range, and we bail. If we succeed in parsing, we remove the
565+
// intermediate members.
566+
let membersBeforeRange = members.count - 1
567+
while let t = try source.lexTrivia(context: context) {
568+
members.append(.trivia(t))
569+
}
570+
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
571+
return
572+
}
571573

572-
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
573-
continue
574-
}
575-
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
576-
members.append(.atom(.init(.char("-"), dash)))
574+
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
575+
members.append(.atom(.init(.char("-"), dash)))
577576

578-
while let t = try source.lexTrivia(context: context) {
579-
members.append(.trivia(t))
577+
while let t = try source.lexTrivia(context: context) {
578+
members.append(.trivia(t))
579+
}
580+
guard let rhs = try parseCCCMember() else { return }
581+
members.append(rhs)
582+
583+
func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom {
584+
switch m {
585+
case .atom(let a):
586+
return a
587+
case .custom:
588+
// Not supported. While .NET allows `x-[...]` to spell subtraction, we
589+
// require `x--[...]`. We also ban `[...]-x` for consistency.
590+
if isLHS {
591+
throw Source.LocatedError(
592+
ParseError.invalidCharacterClassRangeOperand, m.location)
593+
} else {
594+
throw Source.LocatedError(
595+
ParseError.unsupportedDotNetSubtraction, m.location)
580596
}
581-
guard let rhs = try parseCCCMember() else { continue }
582-
members.append(rhs)
583-
584-
guard case let .atom(rhs) = rhs else { continue }
585-
586-
// We've successfully parsed an atom LHS and RHS, so form a range,
587-
// collecting the trivia we've parsed, and replacing the members that
588-
// would have otherwise been added to the custom character class.
589-
let rangeMemberCount = members.count - membersBeforeRange
590-
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
591-
members.removeLast(rangeMemberCount)
592-
members.append(.range(.init(lhs, dash, rhs, trivia: trivia)))
597+
case .quote:
598+
// Currently unsupported, we need to figure out what the semantics
599+
// would be for grapheme/scalar modes.
600+
throw Source.LocatedError(
601+
ParseError.unsupported("range with quoted sequence"), m.location)
602+
case .trivia:
603+
throw Unreachable("Should have been lexed separately")
604+
case .range, .setOperation:
605+
throw Unreachable("Parsed later")
593606
}
594607
}
608+
let lhsOp = try makeOperand(lhs, isLHS: true)
609+
let rhsOp = try makeOperand(rhs, isLHS: false)
610+
611+
// We've successfully parsed an atom LHS and RHS, so form a range,
612+
// collecting the trivia we've parsed, and replacing the members that
613+
// would have otherwise been added to the custom character class.
614+
let rangeMemberCount = members.count - membersBeforeRange
615+
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
616+
members.removeLast(rangeMemberCount)
617+
members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia)))
618+
619+
// We need to specially check if we can lex a .NET character class
620+
// subtraction here as e.g `[a-c-[...]]` is allowed in .NET. Otherwise we'd
621+
// treat the second `-` as literal.
622+
if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) {
623+
throw Source.LocatedError(
624+
ParseError.unsupportedDotNetSubtraction, dashLoc)
625+
}
626+
}
627+
628+
mutating func parseCCCMembers(
629+
into members: inout Array<CustomCC.Member>
630+
) throws {
631+
// Parse members and ranges until we see the end of the custom char class
632+
// or an operator.
633+
while let member = try parseCCCMember() {
634+
members.append(member)
635+
try parsePotentialCCRange(into: &members)
636+
}
595637
}
596638
}
597639

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,16 @@ extension Compiler {
88
/// This is used to determine whether to apply initial options.
99
var hasEmittedFirstMatchableAtom = false
1010

11-
init(options: MatchingOptions, captureList: CaptureList) {
11+
private let compileOptions: CompileOptions
12+
fileprivate var optimizationsEnabled: Bool { !compileOptions.contains(.disableOptimizations) }
13+
14+
init(
15+
options: MatchingOptions,
16+
compileOptions: CompileOptions,
17+
captureList: CaptureList
18+
) {
1219
self.options = options
20+
self.compileOptions = compileOptions
1321
self.builder.captureList = captureList
1422
}
1523
}
@@ -670,8 +678,16 @@ fileprivate extension Compiler.ByteCodeGen {
670678
mutating func emitCustomCharacterClass(
671679
_ ccc: DSLTree.CustomCharacterClass
672680
) throws {
673-
let consumer = try ccc.generateConsumer(options)
674-
builder.buildConsume(by: consumer)
681+
if let asciiBitset = ccc.asAsciiBitset(options),
682+
options.semanticLevel == .graphemeCluster,
683+
optimizationsEnabled {
684+
// future work: add a bit to .matchBitset to consume either a character
685+
// or a scalar so we can have this optimization in scalar mode
686+
builder.buildMatchAsciiBitset(asciiBitset)
687+
} else {
688+
let consumer = try ccc.generateConsumer(options)
689+
builder.buildConsume(by: consumer)
690+
}
675691
}
676692

677693
@discardableResult

Sources/_StringProcessing/Compiler.swift

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ class Compiler {
1616

1717
// TODO: Or are these stored on the tree?
1818
var options = MatchingOptions()
19+
private var compileOptions: CompileOptions = .default
1920

2021
init(ast: AST) {
2122
self.tree = ast.dslTree
@@ -25,23 +26,22 @@ class Compiler {
2526
self.tree = tree
2627
}
2728

29+
init(tree: DSLTree, compileOptions: CompileOptions) {
30+
self.tree = tree
31+
self.compileOptions = compileOptions
32+
}
33+
2834
__consuming func emit() throws -> MEProgram {
2935
// TODO: Handle global options
3036
var codegen = ByteCodeGen(
31-
options: options, captureList: tree.captureList
32-
)
37+
options: options,
38+
compileOptions:
39+
compileOptions,
40+
captureList: tree.captureList)
3341
return try codegen.emitRoot(tree.root)
3442
}
3543
}
3644

37-
func _compileRegex(
38-
_ regex: String, _ syntax: SyntaxOptions = .traditional
39-
) throws -> Executor {
40-
let ast = try parse(regex, .semantic, syntax)
41-
let program = try Compiler(ast: ast).emit()
42-
return Executor(program: program)
43-
}
44-
4545
// An error produced when compiling a regular expression.
4646
enum RegexCompilationError: Error, CustomStringConvertible {
4747
// TODO: Source location?
@@ -54,3 +54,35 @@ enum RegexCompilationError: Error, CustomStringConvertible {
5454
}
5555
}
5656
}
57+
58+
// Testing support
59+
@available(SwiftStdlib 5.7, *)
60+
func _compileRegex(
61+
_ regex: String,
62+
_ syntax: SyntaxOptions = .traditional,
63+
_ semanticLevel: RegexSemanticLevel? = nil
64+
) throws -> Executor {
65+
let ast = try parse(regex, .semantic, syntax)
66+
let dsl: DSLTree
67+
68+
switch semanticLevel?.base {
69+
case .graphemeCluster:
70+
let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)])
71+
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root))
72+
case .unicodeScalar:
73+
let sequence = AST.MatchingOptionSequence(adding: [.init(.unicodeScalarSemantics, location: .fake)])
74+
dsl = DSLTree(.nonCapturingGroup(.init(ast: .changeMatchingOptions(sequence)), ast.dslTree.root))
75+
case .none:
76+
dsl = ast.dslTree
77+
}
78+
let program = try Compiler(tree: dsl).emit()
79+
return Executor(program: program)
80+
}
81+
82+
extension Compiler {
83+
struct CompileOptions: OptionSet {
84+
let rawValue: Int
85+
static let disableOptimizations = CompileOptions(rawValue: 1)
86+
static let `default`: CompileOptions = []
87+
}
88+
}

0 commit comments

Comments
 (0)