Skip to content

[5.7] Fix infinite loop and scalar matching in grapheme mode + scalar matching optimizations #569

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 133 additions & 67 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen {
emitDot()

case let .char(c):
try emitCharacter(c)
emitCharacter(c)

case let .scalar(s):
try emitScalar(s)
if options.semanticLevel == .graphemeCluster {
emitCharacter(Character(s))
} else {
emitMatchScalar(s)
}

case let .assertion(kind):
try emitAssertion(kind)
Expand All @@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitQuotedLiteral(_ s: String) {
guard options.semanticLevel == .graphemeCluster else {
for char in s {
for scalar in char.unicodeScalars {
emitMatchScalar(scalar)
}
}
return
}

// Fast path for eliding boundary checks for an all ascii quoted literal
if optimizationsEnabled && s.allSatisfy(\.isASCII) {
let lastIdx = s.unicodeScalars.indices.last!
for idx in s.unicodeScalars.indices {
let boundaryCheck = idx == lastIdx
let scalar = s.unicodeScalars[idx]
if options.isCaseInsensitive && scalar.properties.isCased {
builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck)
} else {
builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck)
}
}
return
}

for c in s { emitCharacter(c) }
}

mutating func emitBackreference(
_ ref: AST.Reference
) throws {
Expand Down Expand Up @@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen {
}
}

mutating func emitScalar(_ s: UnicodeScalar) throws {
// TODO: Native instruction buildMatchScalar(s)
if options.isCaseInsensitive {
// TODO: e.g. buildCaseInsensitiveMatchScalar(s)
builder.buildConsume(by: consumeScalar {
$0.properties.lowercaseMapping == s.properties.lowercaseMapping
})
mutating func emitMatchScalar(_ s: UnicodeScalar) {
assert(options.semanticLevel == .unicodeScalar)
if options.isCaseInsensitive && s.properties.isCased {
builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false)
} else {
builder.buildConsume(by: consumeScalar {
$0 == s
})
builder.buildMatchScalar(s, boundaryCheck: false)
}
}

mutating func emitCharacter(_ c: Character) throws {
// Unicode scalar matches the specific scalars that comprise a character
mutating func emitCharacter(_ c: Character) {
// Unicode scalar mode matches the specific scalars that comprise a character
if options.semanticLevel == .unicodeScalar {
for scalar in c.unicodeScalars {
try emitScalar(scalar)
emitMatchScalar(scalar)
}
return
}

if options.isCaseInsensitive && c.isCased {
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
builder.buildConsume { input, bounds in
let inputChar = input[bounds.lowerBound].lowercased()
let matchChar = c.lowercased()
return inputChar == matchChar
? input.index(after: bounds.lowerBound)
: nil
if optimizationsEnabled && c.isASCII {
// c.isCased ensures that c is not CR-LF,
// so we know that c is a single scalar
assert(c.unicodeScalars.count == 1)
builder.buildMatchScalarCaseInsensitive(
c.unicodeScalars.last!,
boundaryCheck: true)
} else {
builder.buildMatch(c, isCaseInsensitive: true)
}
} else {
builder.buildMatch(c)
return
}

if optimizationsEnabled && c.isASCII {
let lastIdx = c.unicodeScalars.indices.last!
for idx in c.unicodeScalars.indices {
builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx)
}
return
}

builder.buildMatch(c, isCaseInsensitive: false)
}

mutating func emitAny() {
Expand Down Expand Up @@ -567,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen {
decrement %minTrips and fallthrough

loop-body:
<if can't guarantee forward progress && extraTrips = nil>:
mov currentPosition %pos
evaluate the subexpression
<if can't guarantee forward progress && extraTrips = nil>:
if %pos is currentPosition:
goto exit
goto min-trip-count control block

exit-policy control block:
Expand Down Expand Up @@ -670,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen {
// <subexpression>
// branch min-trip-count
builder.label(loopBody)

// if we aren't sure if the child node will have forward progress and
// we have an unbounded quantification
let startPosition: PositionRegister?
let emitPositionChecking =
(!optimizationsEnabled || !child.guaranteesForwardProgress) &&
extraTrips == nil

if emitPositionChecking {
startPosition = builder.makePositionRegister()
builder.buildMoveCurrentPosition(into: startPosition!)
} else {
startPosition = nil
}
try emitNode(child)
if emitPositionChecking {
// in all quantifier cases, no matter what minTrips or extraTrips is,
// if we have a successful non-advancing match, branch to exit because it
// can match an arbitrary number of times
builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!)
}

if minTrips <= 1 {
// fallthrough
} else {
Expand Down Expand Up @@ -715,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen {
_ ccc: DSLTree.CustomCharacterClass
) throws {
if let asciiBitset = ccc.asAsciiBitset(options),
options.semanticLevel == .graphemeCluster,
optimizationsEnabled {
// future work: add a bit to .matchBitset to consume either a character
// or a scalar so we can have this optimization in scalar mode
builder.buildMatchAsciiBitset(asciiBitset)
if options.semanticLevel == .unicodeScalar {
builder.buildScalarMatchAsciiBitset(asciiBitset)
} else {
builder.buildMatchAsciiBitset(asciiBitset)
}
} else {
let consumer = try ccc.generateConsumer(options)
builder.buildConsume(by: consumer)
Expand Down Expand Up @@ -796,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen {
try emitAtom(a)

case let .quotedLiteral(s):
if options.semanticLevel == .graphemeCluster {
if options.isCaseInsensitive {
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
builder.buildConsume { input, bounds in
var iterator = s.makeIterator()
var currentIndex = bounds.lowerBound
while let ch = iterator.next() {
guard currentIndex < bounds.upperBound,
ch.lowercased() == input[currentIndex].lowercased()
else { return nil }
input.formIndex(after: &currentIndex)
}
return currentIndex
}
} else {
builder.buildMatchSequence(s)
}
} else {
builder.buildConsume {
[caseInsensitive = options.isCaseInsensitive] input, bounds in
// TODO: Case folding
var iterator = s.unicodeScalars.makeIterator()
var currentIndex = bounds.lowerBound
while let scalar = iterator.next() {
guard currentIndex < bounds.upperBound else { return nil }
if caseInsensitive {
if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
return nil
}
} else {
if scalar != input.unicodeScalars[currentIndex] {
return nil
}
}
input.unicodeScalars.formIndex(after: &currentIndex)
}
return currentIndex
}
}
emitQuotedLiteral(s)

case let .convertedRegexLiteral(n, _):
return try emitNode(n)
Expand All @@ -856,3 +883,42 @@ fileprivate extension Compiler.ByteCodeGen {
return nil
}
}

extension DSLTree.Node {
var guaranteesForwardProgress: Bool {
switch self {
case .orderedChoice(let children):
return children.allSatisfy { $0.guaranteesForwardProgress }
case .concatenation(let children):
return children.contains(where: { $0.guaranteesForwardProgress })
case .capture(_, _, let node, _):
return node.guaranteesForwardProgress
case .nonCapturingGroup(let kind, let child):
switch kind.ast {
case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
return false
default: return child.guaranteesForwardProgress
}
case .atom(let atom):
switch atom {
case .changeMatchingOptions, .assertion: return false
default: return true
}
case .trivia, .empty:
return false
case .quotedLiteral(let string):
return !string.isEmpty
case .convertedRegexLiteral(let node, _):
return node.guaranteesForwardProgress
case .consumer, .matcher:
// Allow zero width consumers and matchers
return false
case .customCharacterClass:
return true
case .quantification(let amount, _, let child):
let (atLeast, _) = amount.ast.bounds
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
default: return false
}
}
}
Loading