Skip to content

Commit 9efa856

Browse files
rctcwyvrnhamishknight
authored andcommitted
Break out of quantification loop if there is no forward progress (swiftlang#560)
This fixes infinite loops when we loop over an internal node that does not have any forward progress. Also included is an optimization to only emit the check/break instructions if we have a case that might result in an infinite loop (possibly non-progressing inner node + unlimited quantification)
1 parent e94ecea commit 9efa856

File tree

9 files changed

+205
-11
lines changed

9 files changed

+205
-11
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,12 @@ fileprivate extension Compiler.ByteCodeGen {
567567
decrement %minTrips and fallthrough
568568
569569
loop-body:
570+
<if can't guarantee forward progress && extraTrips = nil>:
571+
mov currentPosition %pos
570572
evaluate the subexpression
573+
<if can't guarantee forward progress && extraTrips = nil>:
574+
if %pos is currentPosition:
575+
goto exit
571576
goto min-trip-count control block
572577
573578
exit-policy control block:
@@ -670,7 +675,28 @@ fileprivate extension Compiler.ByteCodeGen {
670675
// <subexpression>
671676
// branch min-trip-count
672677
builder.label(loopBody)
678+
679+
// if we aren't sure if the child node will have forward progress and
680+
// we have an unbounded quantification
681+
let startPosition: PositionRegister?
682+
let emitPositionChecking =
683+
(!optimizationsEnabled || !child.guaranteesForwardProgress) &&
684+
extraTrips == nil
685+
686+
if emitPositionChecking {
687+
startPosition = builder.makePositionRegister()
688+
builder.buildMoveCurrentPosition(into: startPosition!)
689+
} else {
690+
startPosition = nil
691+
}
673692
try emitNode(child)
693+
if emitPositionChecking {
694+
// in all quantifier cases, no matter what minTrips or extraTrips is,
695+
// if we have a successful non-advancing match, branch to exit because it
696+
// can match an arbitrary number of times
697+
builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!)
698+
}
699+
674700
if minTrips <= 1 {
675701
// fallthrough
676702
} else {
@@ -856,3 +882,42 @@ fileprivate extension Compiler.ByteCodeGen {
856882
return nil
857883
}
858884
}
885+
886+
extension DSLTree.Node {
887+
var guaranteesForwardProgress: Bool {
888+
switch self {
889+
case .orderedChoice(let children):
890+
return children.allSatisfy { $0.guaranteesForwardProgress }
891+
case .concatenation(let children):
892+
return children.contains(where: { $0.guaranteesForwardProgress })
893+
case .capture(_, _, let node, _):
894+
return node.guaranteesForwardProgress
895+
case .nonCapturingGroup(let kind, let child):
896+
switch kind.ast {
897+
case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind:
898+
return false
899+
default: return child.guaranteesForwardProgress
900+
}
901+
case .atom(let atom):
902+
switch atom {
903+
case .changeMatchingOptions, .assertion: return false
904+
default: return true
905+
}
906+
case .trivia, .empty:
907+
return false
908+
case .quotedLiteral(let string):
909+
return !string.isEmpty
910+
case .convertedRegexLiteral(let node, _):
911+
return node.guaranteesForwardProgress
912+
case .consumer, .matcher:
913+
// Allow zero width consumers and matchers
914+
return false
915+
case .customCharacterClass:
916+
return true
917+
case .quantification(let amount, _, let child):
918+
let (atLeast, _) = amount.ast.bounds
919+
return atLeast ?? 0 > 0 && child.guaranteesForwardProgress
920+
default: return false
921+
}
922+
}
923+
}

Sources/_StringProcessing/Engine/Backtracking.swift

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,18 @@ extension Processor {
3232
// The int registers store values that can be relevant to
3333
// backtracking, such as the number of trips in a quantification.
3434
var intRegisters: [Int]
35+
// Same with position registers
36+
var posRegisters: [Input.Index]
3537

3638
var destructure: (
3739
pc: InstructionAddress,
3840
pos: Position?,
3941
stackEnd: CallStackAddress,
4042
captureEnds: [_StoredCapture],
41-
intRegisters: [Int]
43+
intRegisters: [Int],
44+
PositionRegister: [Input.Index]
4245
) {
43-
(pc, pos, stackEnd, captureEnds, intRegisters)
46+
(pc, pos, stackEnd, captureEnds, intRegisters, posRegisters)
4447
}
4548
}
4649

@@ -53,7 +56,8 @@ extension Processor {
5356
pos: addressOnly ? nil : currentPosition,
5457
stackEnd: .init(callStack.count),
5558
captureEnds: storedCaptures,
56-
intRegisters: registers.ints)
59+
intRegisters: registers.ints,
60+
posRegisters: registers.positions)
5761
}
5862
}
5963

Sources/_StringProcessing/Engine/InstPayload.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -284,10 +284,10 @@ extension Instruction.Payload {
284284
interpretPair()
285285
}
286286

287-
init(pos: PositionRegister, pos2: PositionRegister) {
288-
self.init(pos, pos2)
287+
init(addr: InstructionAddress, position: PositionRegister) {
288+
self.init(addr, position)
289289
}
290-
var pairedPosPos: (PositionRegister, PositionRegister) {
290+
var pairedAddrPos: (InstructionAddress, PositionRegister) {
291291
interpretPair()
292292
}
293293

Sources/_StringProcessing/Engine/Instruction.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ extension Instruction {
3737
///
3838
case moveImmediate
3939

40+
/// Move the current position into a register
41+
///
42+
/// moveCurrentPosition(into: PositionRegister)
43+
///
44+
/// Operands:
45+
/// - Position register to move into
46+
case moveCurrentPosition
47+
4048
// MARK: General Purpose: Control flow
4149

4250
/// Branch to a new instruction
@@ -57,6 +65,16 @@ extension Instruction {
5765
///
5866
case condBranchZeroElseDecrement
5967

68+
/// Conditionally branch if the current position is the same as the register
69+
///
70+
/// condBranch(
71+
/// to: InstAddr, ifSamePositionAs: PositionRegister)
72+
///
73+
/// Operands:
74+
/// - Instruction address to branch to, if the position in the register is the same as currentPosition
75+
/// - Position register to check against
76+
case condBranchSamePosition
77+
6078
// TODO: Function calls
6179

6280
// MARK: - Matching

Sources/_StringProcessing/Engine/MEBuilder.swift

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ extension MEProgram {
3232
var nextIntRegister = IntRegister(0)
3333
var nextCaptureRegister = CaptureRegister(0)
3434
var nextValueRegister = ValueRegister(0)
35+
var nextPositionRegister = PositionRegister(0)
3536

3637
// Special addresses or instructions
3738
var failAddressToken: AddressToken? = nil
@@ -105,6 +106,14 @@ extension MEProgram.Builder {
105106
fixup(to: t)
106107
}
107108

109+
mutating func buildCondBranch(
110+
to t: AddressToken,
111+
ifSamePositionAs r: PositionRegister
112+
) {
113+
instructions.append(.init(.condBranchSamePosition, .init(position: r)))
114+
fixup(to: t)
115+
}
116+
108117
mutating func buildSave(_ t: AddressToken) {
109118
instructions.append(.init(.save))
110119
fixup(to: t)
@@ -211,6 +220,10 @@ extension MEProgram.Builder {
211220
.init(value: value, capture: capture)))
212221
}
213222

223+
mutating func buildMoveCurrentPosition(into r: PositionRegister) {
224+
instructions.append(.init(.moveCurrentPosition, .init(position: r)))
225+
}
226+
214227
mutating func buildBackreference(
215228
_ cap: CaptureRegister
216229
) {
@@ -257,7 +270,8 @@ extension MEProgram.Builder {
257270
switch inst.opcode {
258271
case .condBranchZeroElseDecrement:
259272
payload = .init(addr: addr, int: inst.payload.int)
260-
273+
case .condBranchSamePosition:
274+
payload = .init(addr: addr, position: inst.payload.position)
261275
case .branch, .save, .saveAddress, .clearThrough:
262276
payload = .init(addr: addr)
263277

@@ -281,6 +295,7 @@ extension MEProgram.Builder {
281295
regInfo.sequences = sequences.count
282296
regInfo.ints = nextIntRegister.rawValue
283297
regInfo.values = nextValueRegister.rawValue
298+
regInfo.positions = nextPositionRegister.rawValue
284299
regInfo.bitsets = asciiBitsets.count
285300
regInfo.consumeFunctions = consumeFunctions.count
286301
regInfo.assertionFunctions = assertionFunctions.count
@@ -421,6 +436,12 @@ extension MEProgram.Builder {
421436
return r
422437
}
423438

439+
mutating func makePositionRegister() -> PositionRegister {
440+
let r = nextPositionRegister
441+
defer { nextPositionRegister.rawValue += 1 }
442+
return r
443+
}
444+
424445
// TODO: A register-mapping helper struct, which could release
425446
// registers without monotonicity required
426447

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ extension Processor {
245245
}
246246

247247
mutating func signalFailure() {
248-
guard let (pc, pos, stackEnd, capEnds, intRegisters) =
248+
guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) =
249249
savePoints.popLast()?.destructure
250250
else {
251251
state = .fail
@@ -259,6 +259,7 @@ extension Processor {
259259
callStack.removeLast(callStack.count - stackEnd.rawValue)
260260
storedCaptures = capEnds
261261
registers.ints = intRegisters
262+
registers.positions = posRegisters
262263
}
263264

264265
mutating func abort(_ e: Error? = nil) {
@@ -315,7 +316,10 @@ extension Processor {
315316

316317
registers[reg] = int
317318
controller.step()
318-
319+
case .moveCurrentPosition:
320+
let reg = payload.position
321+
registers[reg] = currentPosition
322+
controller.step()
319323
case .branch:
320324
controller.pc = payload.addr
321325

@@ -327,7 +331,13 @@ extension Processor {
327331
registers[int] -= 1
328332
controller.step()
329333
}
330-
334+
case .condBranchSamePosition:
335+
let (addr, pos) = payload.pairedAddrPos
336+
if registers[pos] == currentPosition {
337+
controller.pc = addr
338+
} else {
339+
controller.step()
340+
}
331341
case .save:
332342
let resumeAddr = payload.addr
333343
let sp = makeSavePoint(resumeAddr)

Sources/_StringProcessing/Engine/Registers.swift

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ extension Processor {
4747
var ints: [Int]
4848

4949
var values: [Any]
50+
51+
var positions: [Input.Index]
5052
}
5153
}
5254

@@ -66,6 +68,12 @@ extension Processor.Registers {
6668
values[i.rawValue] = newValue
6769
}
6870
}
71+
subscript(_ i: PositionRegister) -> Input.Index {
72+
get { positions[i.rawValue] }
73+
set {
74+
positions[i.rawValue] = newValue
75+
}
76+
}
6977
subscript(_ i: ElementRegister) -> Input.Element {
7078
elements[i.rawValue]
7179
}
@@ -89,6 +97,8 @@ extension Processor.Registers {
8997
}
9098

9199
extension Processor.Registers {
100+
static let sentinelIndex = "".startIndex
101+
92102
init(
93103
_ program: MEProgram,
94104
_ sentinel: String.Index
@@ -120,11 +130,15 @@ extension Processor.Registers {
120130

121131
self.values = Array(
122132
repeating: SentinelValue(), count: info.values)
133+
self.positions = Array(
134+
repeating: Processor.Registers.sentinelIndex,
135+
count: info.positions)
123136
}
124137

125138
mutating func reset(sentinel: Input.Index) {
126139
self.ints._setAll(to: 0)
127140
self.values._setAll(to: SentinelValue())
141+
self.positions._setAll(to: Processor.Registers.sentinelIndex)
128142
}
129143
}
130144

Tests/RegexTests/CompileTests.swift

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,40 @@ extension RegexTests {
208208
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset])
209209
expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy])
210210
}
211+
212+
func testQuantificationForwardProgressCompile() {
213+
// Unbounded quantification + non forward progressing inner nodes
214+
// Expect to emit the position checking instructions
215+
expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
216+
expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
217+
expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
218+
expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
219+
expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
220+
expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
221+
expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
222+
expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
223+
expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
224+
225+
// Bounded quantification, don't emit position checking
226+
expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
227+
expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
228+
expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
229+
expectProgram(for: #"(?:|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
230+
expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
231+
expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
232+
expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
233+
expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
234+
expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
235+
236+
// Inner node is a quantification that does not guarantee forward progress
237+
expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
238+
expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
239+
expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
240+
expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
241+
expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
242+
expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition])
243+
// Inner node is a quantification that guarantees forward progress
244+
expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
245+
expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition])
246+
}
211247
}

Tests/RegexTests/MatchTests.swift

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1943,5 +1943,31 @@ extension RegexTests {
19431943
XCTAssertEqual(matches.count, 3)
19441944
}
19451945
}
1946-
}
19471946

1947+
func expectCompletion(regex: String, in target: String) {
1948+
let expectation = XCTestExpectation(description: "Run the given regex to completion")
1949+
Task.init {
1950+
let r = try! Regex(regex)
1951+
let val = target.matches(of: r).isEmpty
1952+
expectation.fulfill()
1953+
return val
1954+
}
1955+
wait(for: [expectation], timeout: 3.0)
1956+
}
1957+
1958+
func testQuantificationForwardProgress() {
1959+
expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa")
1960+
expectCompletion(regex: #"(?:\b)+"#, in: "aa")
1961+
expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa")
1962+
expectCompletion(regex: #"(?:|)+"#, in: "aa")
1963+
expectCompletion(regex: #"(?:\w|)+"#, in: "aa")
1964+
expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa")
1965+
expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa")
1966+
expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa")
1967+
expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa")
1968+
expectCompletion(regex: #"(a*)*"#, in: "aa")
1969+
expectCompletion(regex: #"(a?)*"#, in: "aa")
1970+
expectCompletion(regex: #"(a{,4})*"#, in: "aa")
1971+
expectCompletion(regex: #"((|)+)*"#, in: "aa")
1972+
}
1973+
}

0 commit comments

Comments
 (0)