Skip to content

[5.7] [DNM] Cherry-pick batch test PR #443

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
863aebe
Merge pull request #575 from Azoy/various-tidbits
Azoy Jul 14, 2022
1079533
Merge pull request #576 from Azoy/options-regex
Azoy Jul 14, 2022
5966a5c
Allow matching tests to specify semantic level
hamishknight Jul 15, 2022
fe63fb4
Rip out unused _CharacterClassModel API
hamishknight Jul 15, 2022
b309fa5
Remove _CharacterClassModel conformance to RegexComponent
hamishknight Jul 15, 2022
0ab3079
Internalize `_CharacterClassModel`
hamishknight Jul 15, 2022
b454390
Fix `CharacterClass.newlineSequence`
hamishknight Jul 15, 2022
8e920c9
Rename `any` -> `dot`
hamishknight Jul 15, 2022
d6a03a0
Re-introduce `DSLTree.Atom.any`
hamishknight Jul 15, 2022
da59c30
Fix `CharacterClass.any`
hamishknight Jul 15, 2022
217aef4
Rename `startOfLine`/`endOfLine` -> `caretAnchor`/`dollarAnchor`
hamishknight Jul 15, 2022
dff47ff
Move AssertionKind onto the DSL
hamishknight Jul 15, 2022
1b3ba2c
Fix `Anchor.startOfLine` and `Anchor.endOfLine`
hamishknight Jul 15, 2022
0570133
Add some tests for `CharacterClass.anyGraphemeCluster`
hamishknight Jul 15, 2022
c7b42f8
Add some tests for `CharacterClass.horizontalWhitespace`
hamishknight Jul 15, 2022
47888e6
Implement `CharacterClass.anyNonNewline`
hamishknight Jul 15, 2022
429b699
Break out of quantification loop if there is no forward progress (#560)
rctcwyvrn Jul 11, 2022
92a051a
Optimize matching to match on scalar values when possible (#525)
rctcwyvrn Jul 12, 2022
33a937c
Validate optimizations when a match fails
hamishknight Jul 7, 2022
e343554
Guard against testing with older stdlibs
hamishknight Jul 19, 2022
1acb82a
Add some extra character class newline matching tests
hamishknight Jul 19, 2022
f56ac11
Fix character class range matching
hamishknight Jul 19, 2022
6523a93
Coalesce adjacent scalars and characters in the DSL
hamishknight Jul 20, 2022
b61c770
Fix scalar mode for quoted sequences in character class
hamishknight Jul 20, 2022
bda6fbc
Form ASCII bitsets for quoted sequences in character classes
hamishknight Jul 20, 2022
e9838da
Coalesce character class members
hamishknight Jul 20, 2022
d5cad1c
Throw `RegexCompilationError` for invalid character class bounds
hamishknight Jul 20, 2022
f2d44ff
Allow coalescing through trivia
hamishknight Jul 20, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,17 @@ let package = Package(
name: "RegexBuilder",
dependencies: ["_StringProcessing", "_RegexParser"],
swiftSettings: publicStdlibSettings),
.target(name: "TestSupport",
swiftSettings: [availabilityDefinition]),
.testTarget(
name: "RegexTests",
dependencies: ["_StringProcessing"],
dependencies: ["_StringProcessing", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
]),
.testTarget(
name: "RegexBuilderTests",
dependencies: ["_StringProcessing", "RegexBuilder"],
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
]),
Expand Down
44 changes: 35 additions & 9 deletions Sources/RegexBuilder/Anchor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,30 @@ public struct Anchor {

@available(SwiftStdlib 5.7, *)
extension Anchor: RegexComponent {
var baseAssertion: DSLTree._AST.AssertionKind {
var baseAssertion: DSLTree.Atom.Assertion {
switch kind {
case .startOfSubject: return .startOfSubject(isInverted)
case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted)
case .endOfSubject: return .endOfSubject(isInverted)
case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted)
case .textSegmentBoundary: return .textSegmentBoundary(isInverted)
case .startOfLine: return .startOfLine(isInverted)
case .endOfLine: return .endOfLine(isInverted)
case .wordBoundary: return .wordBoundary(isInverted)
case .startOfSubject:
// FIXME: Inverted?
return .startOfSubject
case .endOfSubjectBeforeNewline:
// FIXME: Inverted?
return .endOfSubjectBeforeNewline
case .endOfSubject:
// FIXME: Inverted?
return .endOfSubject
case .firstMatchingPositionInSubject:
// FIXME: Inverted?
return .firstMatchingPositionInSubject
case .textSegmentBoundary:
return isInverted ? .notTextSegment : .textSegment
case .startOfLine:
// FIXME: Inverted?
return .startOfLine
case .endOfLine:
// FIXME: Inverted?
return .endOfLine
case .wordBoundary:
return isInverted ? .notWordBoundary : .wordBoundary
}
}

Expand Down Expand Up @@ -104,6 +118,12 @@ extension Anchor {
///
/// This anchor is equivalent to `^` in regex syntax when the `m` option
/// has been enabled or `anchorsMatchLineEndings(true)` has been called.
///
/// For example, the following regexes are all equivalent:
///
/// - `Regex { Anchor.startOfLine }`
/// - `/(?m)^/` or `/(?m:^)/`
/// - `/^/.anchorsMatchLineEndings(true)`
public static var startOfLine: Anchor {
Anchor(kind: .startOfLine)
}
Expand All @@ -113,6 +133,12 @@ extension Anchor {
///
/// This anchor is equivalent to `$` in regex syntax when the `m` option
/// has been enabled or `anchorsMatchLineEndings(true)` has been called.
///
/// For example, the following regexes are all equivalent:
///
/// - `Regex { Anchor.endOfLine }`
/// - `/(?m)$/` or `/(?m:$)/`
/// - `/$/.anchorsMatchLineEndings(true)`
public static var endOfLine: Anchor {
Anchor(kind: .endOfLine)
}
Expand Down
25 changes: 13 additions & 12 deletions Sources/RegexBuilder/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,8 @@ public struct CharacterClass {
self.ccc = ccc
}

init(unconverted model: _CharacterClassModel) {
guard let ccc = model.makeDSLTreeCharacterClass() else {
fatalError("Unsupported character class")
}
self.ccc = ccc
init(unconverted atom: DSLTree._AST.Atom) {
self.ccc = .init(members: [.atom(.unconverted(atom))])
}
}

Expand All @@ -48,16 +45,20 @@ extension RegexComponent where Self == CharacterClass {
.init(DSLTree.CustomCharacterClass(members: [.atom(.any)]))
}

public static var anyNonNewline: CharacterClass {
.init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)]))
}

public static var anyGraphemeCluster: CharacterClass {
.init(unconverted: .anyGrapheme)
.init(unconverted: ._anyGrapheme)
}

public static var whitespace: CharacterClass {
.init(unconverted: .whitespace)
.init(unconverted: ._whitespace)
}

public static var digit: CharacterClass {
.init(unconverted: .digit)
.init(unconverted: ._digit)
}

public static var hexDigit: CharacterClass {
Expand All @@ -69,19 +70,19 @@ extension RegexComponent where Self == CharacterClass {
}

public static var horizontalWhitespace: CharacterClass {
.init(unconverted: .horizontalWhitespace)
.init(unconverted: ._horizontalWhitespace)
}

public static var newlineSequence: CharacterClass {
.init(unconverted: .newlineSequence)
.init(unconverted: ._newlineSequence)
}

public static var verticalWhitespace: CharacterClass {
.init(unconverted: .verticalWhitespace)
.init(unconverted: ._verticalWhitespace)
}

public static var word: CharacterClass {
.init(unconverted: .word)
.init(unconverted: ._word)
}
}

Expand Down
33 changes: 33 additions & 0 deletions Sources/TestSupport/TestSupport.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import XCTest

// We need to split this out of the test files, as it needs to be compiled
// *without* `-disable-availability-checking` to ensure the #available check is
// not compiled into a no-op.

#if os(Linux)
// NOTE(review): presumably the XCTest build used on Linux does not provide
// `XCTExpectFailure`, which is why this stand-in exists — confirm.

/// Stand-in for `XCTExpectFailure` that is compiled only on Linux.
///
/// The implementation is empty: `message` is ignored and `body` is never
/// invoked, so nothing inside `body` runs when this version is used.
///
/// - Parameters:
///   - message: Unused.
///   - body: Unused; the closure is accepted but not called.
public func XCTExpectFailure(
_ message: String? = nil, body: () throws -> Void
) rethrows {}
#endif

/// Reports whether the stdlib in use is new enough for the guarded tests.
///
/// - Parameters:
///   - file: Source file attributed to the recorded failure; defaults to the
///     caller's file.
///   - line: Source line attributed to the recorded failure; defaults to the
///     caller's line.
/// - Returns: `true` when the `SwiftStdlib 5.7` availability check passes.
///   Otherwise an "Unsupported stdlib" failure is raised inside
///   `XCTExpectFailure` and `false` is returned.
public func ensureNewStdlib(
  file: StaticString = #file, line: UInt = #line
) -> Bool {
  // Happy path first: a sufficiently new stdlib needs no further work.
  if #available(SwiftStdlib 5.7, *) {
    return true
  }

  // Older stdlib: flag the test as an expected failure rather than a hard one.
  XCTExpectFailure {
    XCTFail("Unsupported stdlib", file: file, line: line)
  }
  return false
}
89 changes: 15 additions & 74 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ extension AST {
case namedCharacter(String)

/// .
case any
case dot

/// ^
case startOfLine
case caretAnchor

/// $
case endOfLine
case dollarAnchor

// References
case backreference(Reference)
Expand Down Expand Up @@ -104,9 +104,9 @@ extension AST.Atom {
case .callout(let v): return v
case .backtrackingDirective(let v): return v
case .changeMatchingOptions(let v): return v
case .any: return nil
case .startOfLine: return nil
case .endOfLine: return nil
case .dot: return nil
case .caretAnchor: return nil
case .dollarAnchor: return nil
case .invalid: return nil
}
}
Expand Down Expand Up @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty {
}
}

extension AST.Atom {
/// Anchors and other built-in zero-width assertions.
public enum AssertionKind: String, Hashable {
/// \A
case startOfSubject = #"\A"#

/// \Z
case endOfSubjectBeforeNewline = #"\Z"#

/// \z
case endOfSubject = #"\z"#

/// \K
case resetStartOfMatch = #"\K"#

/// \G
case firstMatchingPositionInSubject = #"\G"#

/// \y
case textSegment = #"\y"#

/// \Y
case notTextSegment = #"\Y"#

/// ^
case startOfLine = #"^"#

/// $
case endOfLine = #"$"#

/// \b (from outside a custom character class)
case wordBoundary = #"\b"#

/// \B
case notWordBoundary = #"\B"#

}

public var assertionKind: AssertionKind? {
switch kind {
case .startOfLine: return .startOfLine
case .endOfLine: return .endOfLine

case .escaped(.wordBoundary): return .wordBoundary
case .escaped(.notWordBoundary): return .notWordBoundary
case .escaped(.startOfSubject): return .startOfSubject
case .escaped(.endOfSubject): return .endOfSubject
case .escaped(.textSegment): return .textSegment
case .escaped(.notTextSegment): return .notTextSegment
case .escaped(.endOfSubjectBeforeNewline):
return .endOfSubjectBeforeNewline
case .escaped(.firstMatchingPositionInSubject):
return .firstMatchingPositionInSubject

case .escaped(.resetStartOfMatch): return .resetStartOfMatch

default: return nil
}
}
}

extension AST.Atom {
public enum Callout: Hashable {
/// A PCRE callout written `(?C...)`
Expand Down Expand Up @@ -806,18 +745,20 @@ extension AST.Atom {
// the AST? Or defer for the matching engine?
return nil

case .scalarSequence, .property, .any, .startOfLine, .endOfLine,
.backreference, .subpattern, .callout, .backtrackingDirective,
.changeMatchingOptions, .invalid:
case .scalarSequence, .property, .dot, .caretAnchor,
.dollarAnchor, .backreference, .subpattern, .callout,
.backtrackingDirective, .changeMatchingOptions, .invalid:
return nil
}
}

/// Whether this atom is valid as the operand of a custom character class
/// range.
public var isValidCharacterClassRangeBound: Bool {
// If we have a literal character value for this, it can be used as a bound.
if literalCharacterValue != nil { return true }
if let c = literalCharacterValue {
// We only match character range bounds that are single scalar NFC.
return c.hasExactlyOneScalar && c.isNFC
}
switch kind {
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
Expand Down Expand Up @@ -858,7 +799,7 @@ extension AST.Atom {
case .keyboardMetaControl(let x):
return "\\M-\\C-\(x)"

case .property, .escaped, .any, .startOfLine, .endOfLine,
case .property, .escaped, .dot, .caretAnchor, .dollarAnchor,
.backreference, .subpattern, .namedCharacter, .callout,
.backtrackingDirective, .changeMatchingOptions, .invalid:
return nil
Expand All @@ -874,7 +815,7 @@ extension AST.Atom {
// TODO: Are callouts quantifiable?
case .escaped(let esc):
return esc.isQuantifiable
case .startOfLine, .endOfLine:
case .caretAnchor, .dollarAnchor:
return false
default:
return true
Expand Down
52 changes: 27 additions & 25 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -480,35 +480,37 @@ extension Parser {
///
mutating func lexQuantifier(
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
var trivia: [AST.Trivia] = []
tryEating { p in
var trivia: [AST.Trivia] = []

if let t = lexNonSemanticWhitespace() { trivia.append(t) }
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let amt: Located<Quant.Amount>? = recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }
let amt: Located<Quant.Amount>? = p.recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }

return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
}
}
}
guard let amt = amt else { return nil }
guard let amt = amt else { return nil }

// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let kind: Located<Quant.Kind> = recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}
let kind: Located<Quant.Kind> = p.recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}

return (amt, kind, trivia)
return (amt, kind, trivia)
}
}

/// Try to consume a range, returning `nil` if unsuccessful.
Expand Down Expand Up @@ -2073,9 +2075,9 @@ extension Parser {
p.unreachable("Should have lexed a group or group-like atom")

// (sometimes) special metacharacters
case ".": return customCC ? .char(".") : .any
case "^": return customCC ? .char("^") : .startOfLine
case "$": return customCC ? .char("$") : .endOfLine
case ".": return customCC ? .char(".") : .dot
case "^": return customCC ? .char("^") : .caretAnchor
case "$": return customCC ? .char("$") : .dollarAnchor

// Escaped
case "\\": return p.expectEscaped().value
Expand Down
Loading