Skip to content

[5.7] Character class and scalar coalescing fixes #588

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 21, 2022
Merged
6 changes: 4 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -75,15 +75,17 @@ let package = Package(
name: "RegexBuilder",
dependencies: ["_StringProcessing", "_RegexParser"],
swiftSettings: publicStdlibSettings),
.target(name: "TestSupport",
swiftSettings: [availabilityDefinition]),
.testTarget(
name: "RegexTests",
dependencies: ["_StringProcessing"],
dependencies: ["_StringProcessing", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
]),
.testTarget(
name: "RegexBuilderTests",
dependencies: ["_StringProcessing", "RegexBuilder"],
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
swiftSettings: [
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
]),
Expand Down
33 changes: 33 additions & 0 deletions Sources/TestSupport/TestSupport.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
//
//===----------------------------------------------------------------------===//

import XCTest

// We need to split this out of the test files, as it needs to be compiled
// *without* `-disable-availability-checking` to ensure the #available check is
// not compiled into a no-op.

#if os(Linux)
/// Stand-in for `XCTExpectFailure`, compiled only when building for Linux
/// (see the surrounding `#if os(Linux)`).
///
/// The body of this function is empty: `message` is ignored and `body` is
/// never invoked, so any failure the closure would have recorded is dropped.
/// NOTE(review): presumably needed because the XCTest used on Linux does not
/// provide `XCTExpectFailure` — confirm.
///
/// - Parameters:
///   - message: Unused.
///   - body: Unused; not executed.
public func XCTExpectFailure(
_ message: String? = nil, body: () throws -> Void
) rethrows {}
#endif

/// Reports whether the running standard library is at least `SwiftStdlib 5.7`.
///
/// When it is not, an "Unsupported stdlib" failure is recorded inside
/// `XCTExpectFailure`, attributed to the caller's location, and `false` is
/// returned so the calling test can bail out early.
///
/// - Parameters:
///   - file: Source file the failure is attributed to (defaults to the caller).
///   - line: Source line the failure is attributed to (defaults to the caller).
/// - Returns: `true` if the 5.7 standard library is available, else `false`.
public func ensureNewStdlib(
  file: StaticString = #file, line: UInt = #line
) -> Bool {
  if #available(SwiftStdlib 5.7, *) {
    return true
  }
  // Older stdlib: flag the test as an expected failure rather than a hard one.
  XCTExpectFailure {
    XCTFail("Unsupported stdlib", file: file, line: line)
  }
  return false
}
6 changes: 4 additions & 2 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -755,8 +755,10 @@ extension AST.Atom {
/// Whether this atom is valid as the operand of a custom character class
/// range.
public var isValidCharacterClassRangeBound: Bool {
// If we have a literal character value for this, it can be used as a bound.
if literalCharacterValue != nil { return true }
if let c = literalCharacterValue {
// We only match character range bounds that are single scalar NFC.
return c.hasExactlyOneScalar && c.isNFC
}
switch kind {
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
Expand Down
46 changes: 24 additions & 22 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -480,35 +480,37 @@ extension Parser {
///
mutating func lexQuantifier(
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
var trivia: [AST.Trivia] = []
tryEating { p in
var trivia: [AST.Trivia] = []

if let t = lexNonSemanticWhitespace() { trivia.append(t) }
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let amt: Located<Quant.Amount>? = recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }
let amt: Located<Quant.Amount>? = p.recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }

return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
}
}
}
guard let amt = amt else { return nil }
guard let amt = amt else { return nil }

// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let kind: Located<Quant.Kind> = recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}
let kind: Located<Quant.Kind> = p.recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}

return (amt, kind, trivia)
return (amt, kind, trivia)
}
}

/// Try to consume a range, returning `nil` if unsuccessful.
Expand Down
15 changes: 15 additions & 0 deletions Sources/_RegexParser/Utility/Misc.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ extension Substring {
var string: String { String(self) }
}

extension Character {
  /// `true` when this character consists of a single Unicode scalar value.
  public var hasExactlyOneScalar: Bool {
    // A `Character` always holds at least one scalar, so it has exactly one
    // precisely when nothing remains after dropping the first.
    unicodeScalars.dropFirst().isEmpty
  }

  /// `true` when this character is already in Normalization Form C, i.e. its
  /// UTF-8 encoding matches its NFC-normalized code units.
  internal var isNFC: Bool {
    // ASCII is reported as NFC without doing the normalization comparison.
    guard !isASCII else { return true }
    let text = String(self)
    return text.utf8.elementsEqual(text._nfcCodeUnits)
  }
}

extension CustomStringConvertible {
@_alwaysEmitIntoClient
public var halfWidthCornerQuoted: String {
Expand Down
165 changes: 162 additions & 3 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
builder.label(exit)
}

/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested custom character classes and the operands of set operations are
/// coalesced recursively first (via `coalescingCustomCharacterClass`). The
/// resulting members are then handed to the `coalescing(with:into:_:)` helper
/// together with a local `Accumulator`.
/// NOTE(review): `coalescing(with:into:_:)` is defined outside this view;
/// presumably it replaces each run of members the closure accepts with the
/// output of `finish()` — confirm.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The members after recursive coalescing of children and
///   accumulation of adjacent atoms, quoted literals, ranges and trivia.
func coalescingCustomCharacterClassMembers(
_ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
// Collects a run of coalescable members as strings. Every accepted range
// closes the current string (its lower bound becomes the last character) and
// opens a new one (its upper bound becomes the first character).
struct Accumulator {
/// A series of range operands. For example, in `[ab-cde-fg]`, this will
/// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
/// ranges will be created.
private var rangeOperands: [String] = [""]

/// The current range operand, i.e. the last element of `rangeOperands`,
/// yielded in place for both reading and modification.
private var current: String {
_read { yield rangeOperands[rangeOperands.count - 1] }
_modify { yield &rangeOperands[rangeOperands.count - 1] }
}

/// Try to accumulate a character class member, returning `true` if
/// successful, `false` otherwise.
mutating func tryAccumulate(
_ member: DSLTree.CustomCharacterClass.Member
) -> Bool {
switch member {
case .atom(let a):
// Only atoms with a literal character value can be appended.
guard let c = a.literalCharacterValue else { return false }
current.append(c)
return true
case .quotedLiteral(let str):
current += str
return true
case let .range(lhs, rhs):
// Both bounds must be literal characters for the range to be accepted.
guard let lhs = lhs.literalCharacterValue,
let rhs = rhs.literalCharacterValue
else { return false }
// The lower bound ends the current operand; the upper bound starts a
// new operand that subsequent members are appended to.
current.append(lhs)
rangeOperands.append(String(rhs))
return true
case .trivia:
// Trivia can be completely ignored if we've already coalesced
// something.
return !current.isEmpty
default:
return false
}
}

/// Converts the accumulated operands back into character class members:
/// a single quoted literal if no range was seen, otherwise quoted literals
/// interleaved with the ranges between neighbouring operands.
func finish() -> [DSLTree.CustomCharacterClass.Member] {
if rangeOperands.count == 1 {
// If we didn't have any additional range operands, this isn't a
// range, we can just form a standard quoted literal.
return [.quotedLiteral(current)]
}
var members = [DSLTree.CustomCharacterClass.Member]()

// We have other range operands, splice them together. For N operands
// we have N - 1 ranges.
for (i, lhs) in rangeOperands.dropLast().enumerated() {
let rhs = rangeOperands[i + 1]

// If this is the first operand we only need to drop the last
// character for its quoted members, otherwise this is both an LHS
// and RHS of a range, and as such needs both sides trimmed.
let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
if !leading.isEmpty {
members.append(.quotedLiteral(String(leading)))
}
// The force-unwraps rely on `tryAccumulate`'s `.range` case: it appends
// a character to the operand being closed and seeds the next operand
// with a character, so neither `lhs` nor `rhs` is empty here.
members.append(.range(.char(lhs.last!), .char(rhs.first!)))
}
// We've handled everything except the quoted portion of the last
// operand, add it now.
let trailing = rangeOperands.last!.dropFirst()
if !trailing.isEmpty {
members.append(.quotedLiteral(String(trailing)))
}
return members
}
}
return members
.map { m -> DSLTree.CustomCharacterClass.Member in
// First we need to recursively coalesce any child character classes.
switch m {
case .custom(let ccc):
return .custom(coalescingCustomCharacterClass(ccc))
case .intersection(let lhs, let rhs):
return .intersection(
coalescingCustomCharacterClass(lhs),
coalescingCustomCharacterClass(rhs))
case .subtraction(let lhs, let rhs):
return .subtraction(
coalescingCustomCharacterClass(lhs),
coalescingCustomCharacterClass(rhs))
case .symmetricDifference(let lhs, let rhs):
return .symmetricDifference(
coalescingCustomCharacterClass(lhs),
coalescingCustomCharacterClass(rhs))
case .atom, .range, .quotedLiteral, .trivia:
// Leaf members have no children; they are left for the accumulation step.
return m
}
}
// Then feed the members through the accumulator defined above.
.coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
accum.tryAccumulate(member)
}
}

/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise returns `ccc` unchanged.
///
/// - Parameter ccc: The custom character class to process.
/// - Returns: A class with coalesced members and the same inversion flag, or
///   the input itself outside grapheme-cluster mode.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only wanted for grapheme semantics. Under scalar semantics
  // scalars must stay separate, so e.g `[e\u{301}-\u{302}]` remains a range
  // between U+301 and U+302.
  let usesGraphemeSemantics = options.semanticLevel == .graphemeCluster
  if usesGraphemeSemantics {
    return DSLTree.CustomCharacterClass(
      members: coalescingCustomCharacterClassMembers(ccc.members),
      isInverted: ccc.isInverted)
  }
  return ccc
}

mutating func emitCustomCharacterClass(
_ ccc: DSLTree.CustomCharacterClass
) throws {
// Before emitting a custom character class in grapheme semantic mode, we
// need to coalesce together any adjacent characters and scalars, over which
// we can perform grapheme breaking. This includes e.g range bounds for
// `[e\u{301}-\u{302}]`.
let ccc = coalescingCustomCharacterClass(ccc)
if let asciiBitset = ccc.asAsciiBitset(options),
optimizationsEnabled {
if options.semanticLevel == .unicodeScalar {
Expand All @@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

/// Emits the components of a concatenation.
///
/// Nested concatenations (including those wrapped in a converted regex
/// literal) are flattened first, and adjacent literal characters and quoted
/// literals are merged into quoted literals so that grapheme breaking can be
/// performed over their combined contents.
///
/// - Parameter children: The child nodes of the concatenation.
/// - Throws: Rethrows any error from `emitConcatenationComponent`.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  // Expands a node into the flat list of non-concatenation nodes it holds.
  func flattened(_ node: DSLTree.Node) -> [DSLTree.Node] {
    switch node {
    case .concatenation(let nested):
      return nested.flatMap(flattened)
    case .convertedRegexLiteral(let inner, _):
      return flattened(inner)
    default:
      return [node]
    }
  }

  let components = children
    .flatMap(flattened)
    .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { literal, node in
      switch node {
      case .atom(let atom):
        // Only atoms carrying a literal character can join the literal.
        if let char = atom.literalCharacterValue {
          literal.append(char)
          return true
        }
        return false
      case .quotedLiteral(let quoted):
        literal += quoted
        return true
      case .trivia:
        // Trivia is skipped only once something has already been coalesced.
        return !literal.isEmpty
      default:
        return false
      }
    }

  for component in components {
    try emitConcatenationComponent(component)
  }
}

@discardableResult
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
switch node {
Expand All @@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
try emitAlternation(children)

case let .concatenation(children):
for child in children {
try emitConcatenationComponent(child)
}
try emitConcatenation(children)

case let .capture(name, refId, child, transform):
options.beginScope()
Expand Down
30 changes: 27 additions & 3 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,43 @@ class Compiler {
}
}

/// Wraps a metatype value (`Any.Type`) so it can be compared, hashed and
/// printed.
struct AnyHashableType: CustomStringConvertible, Hashable {
  /// The wrapped metatype.
  var ty: Any.Type

  init(_ ty: Any.Type) {
    self.ty = ty
  }

  /// The textual description of the wrapped type.
  var description: String {
    String(describing: ty)
  }

  /// Two wrappers are equal when they wrap the same metatype.
  static func == (lhs: Self, rhs: Self) -> Bool {
    ObjectIdentifier(lhs.ty) == ObjectIdentifier(rhs.ty)
  }

  /// Hashes the identity of the wrapped metatype.
  func hash(into hasher: inout Hasher) {
    ObjectIdentifier(ty).hash(into: &hasher)
  }
}

// An error produced when compiling a regular expression.
enum RegexCompilationError: Error, CustomStringConvertible {
enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
// TODO: Source location?
case uncapturedReference
case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)
case invalidCharacterClassRangeOperand(Character)

static func incorrectOutputType(
incorrect: Any.Type, correct: Any.Type
) -> Self {
.incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
}

case incorrectOutputType(incorrect: Any.Type, correct: Any.Type)

var description: String {
switch self {
case .uncapturedReference:
return "Found a reference used before it captured any match."
case .incorrectOutputType(let incorrect, let correct):
return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
case .invalidCharacterClassRangeOperand(let c):
return "'\(c)' is an invalid bound for character class range"
}
}
}
Expand Down
Loading