Skip to content

Better coalesce adjacent scalars #574

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -480,35 +480,37 @@ extension Parser {
///
mutating func lexQuantifier(
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
var trivia: [AST.Trivia] = []
tryEating { p in
var trivia: [AST.Trivia] = []

if let t = lexNonSemanticWhitespace() { trivia.append(t) }
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let amt: Located<Quant.Amount>? = recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }
let amt: Located<Quant.Amount>? = p.recordLoc { p in
if p.tryEat("*") { return .zeroOrMore }
if p.tryEat("+") { return .oneOrMore }
if p.tryEat("?") { return .zeroOrOne }

return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
return p.tryEating { p in
guard p.tryEat("{"),
let range = p.lexRange(trivia: &trivia),
p.tryEat("}")
else { return nil }
return range.value
}
}
}
guard let amt = amt else { return nil }
guard let amt = amt else { return nil }

// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }

let kind: Located<Quant.Kind> = recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}
let kind: Located<Quant.Kind> = p.recordLoc { p in
if p.tryEat("?") { return .reluctant }
if p.tryEat("+") { return .possessive }
return .eager
}

return (amt, kind, trivia)
return (amt, kind, trivia)
}
}

/// Try to consume a range, returning `nil` if unsuccessful.
Expand Down
165 changes: 162 additions & 3 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -775,9 +775,131 @@ fileprivate extension Compiler.ByteCodeGen {
builder.label(exit)
}

/// Merges runs of adjacent literal members of a custom character class
/// (atoms with a literal character value, quoted literals, and ranges whose
/// bounds both have literal character values) so that neighbouring scalars are
/// gathered into shared strings. This is required in order to produce correct
/// grapheme matching behavior.
///
/// Nested custom character classes, including both operands of set
/// operations, are first passed through `coalescingCustomCharacterClass`.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  /// Gathers the text of consecutive literal members, split at every range.
  struct Accumulator {
    /// The text on either side of each range. For example, in `[ab-cde-fg]`
    /// this holds `["ab", "cde", "fg"]`; the ranges `b-c` and `e-f` are
    /// rebuilt from the neighbouring ends of those strings.
    private var rangeOperands: [String] = [""]

    /// The operand currently being extended, i.e. the last entry of
    /// `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Attempts to fold `member` into the accumulated operands.
    ///
    /// - Returns: `true` if the member was absorbed, `false` if it cannot be
    ///   coalesced.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let atom):
        if let character = atom.literalCharacterValue {
          current.append(character)
          return true
        }
        return false

      case .quotedLiteral(let literal):
        current += literal
        return true

      case .range(let lower, let upper):
        guard let lowerBound = lower.literalCharacterValue,
              let upperBound = upper.literalCharacterValue
        else { return false }
        // The lower bound ends the current operand; the upper bound starts
        // a new one.
        current.append(lowerBound)
        rangeOperands.append(String(upperBound))
        return true

      case .trivia:
        // Trivia can be completely ignored if we've already coalesced
        // something.
        return !current.isEmpty

      default:
        return false
      }
    }

    /// Converts the accumulated operands back into character class members.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      // With a single operand no range was seen, so everything collapses
      // into one quoted literal.
      guard rangeOperands.count > 1 else {
        return [.quotedLiteral(current)]
      }

      var result = [DSLTree.CustomCharacterClass.Member]()

      // Each pair of neighbouring operands yields one range: N operands
      // produce N - 1 ranges.
      let neighbours = zip(rangeOperands, rangeOperands.dropFirst())
      for (index, (lhs, rhs)) in neighbours.enumerated() {
        // The very first operand only gives up its last character (to the
        // range on its right). Every later operand is both the RHS of the
        // previous range and the LHS of this one, so it is trimmed on both
        // ends.
        let leading: Substring
        if index == 0 {
          leading = lhs.dropLast()
        } else {
          leading = lhs.dropFirst().dropLast()
        }
        if !leading.isEmpty {
          result.append(.quotedLiteral(String(leading)))
        }
        result.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }

      // Only the literal tail of the final operand remains; its first
      // character was already consumed as the last range's upper bound.
      let trailing = current.dropFirst()
      if !trailing.isEmpty {
        result.append(.quotedLiteral(String(trailing)))
      }
      return result
    }
  }

  /// Recursively coalesces any child character classes of `member`, leaving
  /// leaf members untouched.
  func coalescingNested(
    _ member: DSLTree.CustomCharacterClass.Member
  ) -> DSLTree.CustomCharacterClass.Member {
    switch member {
    case .custom(let nested):
      return .custom(coalescingCustomCharacterClass(nested))
    case .intersection(let lhs, let rhs):
      return .intersection(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    case .subtraction(let lhs, let rhs):
      return .subtraction(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    case .symmetricDifference(let lhs, let rhs):
      return .symmetricDifference(
        coalescingCustomCharacterClass(lhs),
        coalescingCustomCharacterClass(rhs))
    case .atom, .range, .quotedLiteral, .trivia:
      return member
    }
  }

  let recursed = members.map(coalescingNested)
  return recursed.coalescing(
    with: Accumulator(), into: { $0.finish() }
  ) { accumulator, member in
    accumulator.tryAccumulate(member)
  }
}

/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; otherwise returns `ccc` unchanged.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing is only wanted in grapheme semantic mode. In scalar semantic
  // mode scalars must not be merged into a grapheme, so that e.g
  // `[e\u{301}-\u{302}]` remains a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}

mutating func emitCustomCharacterClass(
_ ccc: DSLTree.CustomCharacterClass
) throws {
// Before emitting a custom character class in grapheme semantic mode, we
// need to coalesce together any adjacent characters and scalars, over which
// we can perform grapheme breaking. This includes e.g range bounds for
// `[e\u{301}-\u{302}]`.
let ccc = coalescingCustomCharacterClass(ccc)
if let asciiBitset = ccc.asAsciiBitset(options),
optimizationsEnabled {
if options.semanticLevel == .unicodeScalar {
Expand All @@ -791,6 +913,45 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

/// Emits a concatenation of `children`.
///
/// Nested concatenations (and converted regex literals wrapping them) are
/// flattened first, then adjacent characters and scalars are coalesced into
/// quoted literals of their contents, over which grapheme breaking can be
/// performed. Each resulting node is emitted via
/// `emitConcatenationComponent`, whose errors are propagated.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
  /// Expands `node` into the flat list of nodes it concatenates.
  func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
    switch node {
    case .concatenation(let nested):
      return nested.flatMap(flatten)
    case .convertedRegexLiteral(let inner, _):
      return flatten(inner)
    default:
      return [node]
    }
  }

  let flattened = children.flatMap(flatten)
  let components = flattened.coalescing(
    with: "", into: DSLTree.Node.quotedLiteral
  ) { literal, node in
    switch node {
    case .atom(let atom):
      if let character = atom.literalCharacterValue {
        literal.append(character)
        return true
      }
      return false
    case .quotedLiteral(let quoted):
      literal += quoted
      return true
    case .trivia:
      // Trivia can be completely ignored if we've already coalesced
      // something.
      return !literal.isEmpty
    default:
      return false
    }
  }

  for component in components {
    try emitConcatenationComponent(component)
  }
}

@discardableResult
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
switch node {
Expand All @@ -799,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen {
try emitAlternation(children)

case let .concatenation(children):
for child in children {
try emitConcatenationComponent(child)
}
try emitConcatenation(children)

case let .capture(name, refId, child, transform):
options.beginScope()
Expand Down
30 changes: 27 additions & 3 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,43 @@ class Compiler {
}
}

/// A box that makes an `Any.Type` metatype value usable where `Hashable` is
/// required. Two boxes are equal when they wrap the same type, and they hash
/// by the wrapped type's `ObjectIdentifier`.
struct AnyHashableType: CustomStringConvertible, Hashable {
  /// The wrapped metatype.
  var ty: Any.Type

  init(_ ty: Any.Type) {
    self.ty = ty
  }

  /// The textual description of the wrapped type.
  var description: String {
    String(describing: ty)
  }

  static func == (lhs: Self, rhs: Self) -> Bool {
    ObjectIdentifier(lhs.ty) == ObjectIdentifier(rhs.ty)
  }

  func hash(into hasher: inout Hasher) {
    ObjectIdentifier(ty).hash(into: &hasher)
  }
}

// An error produced when compiling a regular expression.
enum RegexCompilationError: Error, CustomStringConvertible {
enum RegexCompilationError: Error, Hashable, CustomStringConvertible {
// TODO: Source location?
case uncapturedReference
case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType)
case invalidCharacterClassRangeOperand(Character)

static func incorrectOutputType(
incorrect: Any.Type, correct: Any.Type
) -> Self {
.incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct))
}

case incorrectOutputType(incorrect: Any.Type, correct: Any.Type)

var description: String {
switch self {
case .uncapturedReference:
return "Found a reference used before it captured any match."
case .incorrectOutputType(let incorrect, let correct):
return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'"
case .invalidCharacterClassRangeOperand(let c):
return "'\(c)' is an invalid bound for character class range"
}
}
}
Expand Down
61 changes: 33 additions & 28 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ extension DSLTree._AST.Atom {
extension Character {
func generateConsumer(
_ opts: MatchingOptions
) throws -> MEProgram.ConsumeFunction? {
) throws -> MEProgram.ConsumeFunction {
let isCaseInsensitive = opts.isCaseInsensitive
switch opts.semanticLevel {
case .graphemeCluster:
Expand Down Expand Up @@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member {
_ opts: MatchingOptions,
_ isInverted: Bool
) -> DSLTree.CustomCharacterClass.AsciiBitset? {
typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset
switch self {
case let .atom(a):
if let val = a.singleScalarASCIIValue {
return DSLTree.CustomCharacterClass.AsciiBitset(
val,
isInverted,
opts.isCaseInsensitive
)
return Bitset(val, isInverted, opts.isCaseInsensitive)
}
case let .range(low, high):
if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue {
return DSLTree.CustomCharacterClass.AsciiBitset(
low: lowVal,
high: highVal,
isInverted: isInverted,
isCaseInsensitive: opts.isCaseInsensitive
)
if let lowVal = low.singleScalarASCIIValue,
let highVal = high.singleScalarASCIIValue {
return Bitset(low: lowVal, high: highVal, isInverted: isInverted,
isCaseInsensitive: opts.isCaseInsensitive)
}
case .quotedLiteral(let str):
var bitset = Bitset(isInverted: isInverted)
for c in str {
guard let ascii = c._singleScalarAsciiValue else { return nil }
bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive))
}
return bitset
default:
return nil
}
Expand All @@ -361,12 +362,20 @@ extension DSLTree.CustomCharacterClass.Member {
}
return c
case let .range(low, high):
guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
guard let lhsChar = low.literalCharacterValue else {
throw Unsupported("\(low) in range")
}
guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
guard let rhsChar = high.literalCharacterValue else {
throw Unsupported("\(high) in range")
}

// We must have NFC single scalar bounds.
guard let lhs = lhsChar.singleScalar, lhs.isNFC else {
throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar)
}
guard let rhs = rhsChar.singleScalar, rhs.isNFC else {
throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar)
}
guard lhs <= rhs else {
throw Unsupported("Invalid range \(low)-\(high)")
}
Expand Down Expand Up @@ -456,21 +465,17 @@ extension DSLTree.CustomCharacterClass.Member {
}
return rhs(input, bounds)
}
case .quotedLiteral(let s):
if opts.isCaseInsensitive {
return { input, bounds in
guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else {
return nil
}
return input.index(after: bounds.lowerBound)
}
} else {
return { input, bounds in
guard s.contains(input[bounds.lowerBound]) else {
return nil
case .quotedLiteral(let str):
let consumers = try str.map {
try $0.generateConsumer(opts)
}
return { input, bounds in
for fn in consumers {
if let idx = fn(input, bounds) {
return idx
}
return input.index(after: bounds.lowerBound)
}
return nil
}
case .trivia:
// TODO: Should probably strip this earlier...
Expand Down
Loading