Skip to content

Commit e719e63

Browse files
committed
Handle advancing with possibile sub-character end.
When searching in a substring with an endIndex that isn't on a character boundary, advancing from the penultimate character position to the substring's endIndex can end up failing, since advancing by a character finds the character-aligned index that is _past_ the substring end. This change handles that case.
1 parent 16ba9d0 commit e719e63

File tree

2 files changed

+17
-140
lines changed

2 files changed

+17
-140
lines changed

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 0 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -24,47 +24,6 @@ extension DSLTree._AST.Atom {
2424
}
2525
}
2626

27-
extension Character {
28-
// func generateConsumer(
29-
// _ opts: MatchingOptions
30-
// ) throws -> MEProgram.ConsumeFunction {
31-
// let isCaseInsensitive = opts.isCaseInsensitive
32-
// switch opts.semanticLevel {
33-
// case .graphemeCluster:
34-
// return { input, bounds in
35-
// guard let (char, next) = input.characterAndEnd(
36-
// at: bounds.lowerBound, limitedBy: bounds.upperBound)
37-
// else { return nil }
38-
// if isCaseInsensitive && isCased {
39-
// return char.lowercased() == self.lowercased()
40-
// ? next
41-
// : nil
42-
// } else {
43-
// return char == self
44-
// ? next
45-
// : nil
46-
// }
47-
// }
48-
// case .unicodeScalar:
49-
// // TODO: This should only be reachable from character class emission, can
50-
// // we guarantee that? Otherwise we'd want a different matching behavior.
51-
// let consumers = unicodeScalars.map { s in consumeScalar {
52-
// isCaseInsensitive
53-
// ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping
54-
// : $0 == s
55-
// }}
56-
// return { input, bounds in
57-
// for fn in consumers {
58-
// if let idx = fn(input, bounds) {
59-
// return idx
60-
// }
61-
// }
62-
// return nil
63-
// }
64-
// }
65-
// }
66-
}
67-
6827
extension DSLTree.Atom {
6928
var singleScalarASCIIValue: UInt8? {
7029
switch self {
@@ -78,76 +37,6 @@ extension DSLTree.Atom {
7837
return nil
7938
}
8039
}
81-
82-
// TODO: If ByteCodeGen switches first, then this is unnecessary for
83-
// top-level nodes, but it's also invoked for `.atom` members of a custom CC
84-
// func generateConsumer(
85-
// _ opts: MatchingOptions
86-
// ) throws -> MEProgram.ConsumeFunction? {
87-
// switch self {
88-
// case let .char(c):
89-
// return try c.generateConsumer(opts)
90-
//
91-
// case let .scalar(s):
92-
// // A scalar always matches the same as a single scalar character. This
93-
// // means it must match a whole grapheme in grapheme semantic mode, but
94-
// // can match a single scalar in scalar semantic mode.
95-
// return try Character(s).generateConsumer(opts)
96-
//
97-
// case .any:
98-
// // FIXME: Should this be a total ordering?
99-
// if opts.semanticLevel == .graphemeCluster {
100-
// return { input, bounds in
101-
// input.index(after: bounds.lowerBound)
102-
// }
103-
// } else {
104-
// return consumeScalar { _ in
105-
// true
106-
// }
107-
// }
108-
//
109-
// case .anyNonNewline:
110-
// switch opts.semanticLevel {
111-
// case .graphemeCluster:
112-
// return { input, bounds in
113-
// input[bounds.lowerBound].isNewline
114-
// ? nil
115-
// : input.index(after: bounds.lowerBound)
116-
// }
117-
// case .unicodeScalar:
118-
// return { input, bounds in
119-
// input[bounds.lowerBound].isNewline
120-
// ? nil
121-
// : input.unicodeScalars.index(after: bounds.lowerBound)
122-
// }
123-
// }
124-
//
125-
// case .dot:
126-
// throw Unreachable(".atom(.dot) should be handled by emitDot")
127-
//
128-
// case .assertion:
129-
// // TODO: We could handle, should this be total?
130-
// return nil
131-
// case .characterClass(let cc):
132-
// return cc.generateConsumer(opts)
133-
//
134-
// case .backreference:
135-
// // TODO: Should we handle?
136-
// return nil
137-
//
138-
// case .symbolicReference:
139-
// // TODO: Should we handle?
140-
// return nil
141-
//
142-
// case .changeMatchingOptions:
143-
// // TODO: Should we handle?
144-
// return nil
145-
//
146-
// case let .unconverted(a):
147-
// return try a.ast.generateConsumer(opts)
148-
// }
149-
//
150-
// }
15140
}
15241

15342
extension DSLTree.Atom.CharacterClass {
@@ -389,28 +278,6 @@ extension DSLTree.CustomCharacterClass {
389278
}
390279
)
391280
}
392-
393-
// func generateConsumer(
394-
// _ opts: MatchingOptions
395-
// ) throws -> MEProgram.ConsumeFunction {
396-
// // NOTE: Easy way to implement, obviously not performant
397-
// let consumers = try members.map {
398-
// try $0.generateConsumer(opts)
399-
// }
400-
// return { input, bounds in
401-
// for consumer in consumers {
402-
// if let idx = consumer(input, bounds) {
403-
// return isInverted ? nil : idx
404-
// }
405-
// }
406-
// if isInverted {
407-
// return opts.semanticLevel == .graphemeCluster
408-
// ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
409-
// : input.unicodeScalars.index(after: bounds.lowerBound)
410-
// }
411-
// return nil
412-
// }
413-
// }
414281
}
415282

416283
// NOTE: Conveniences, though not most performant
@@ -519,7 +386,6 @@ extension AST.Atom.CharacterProperty {
519386

520387
case .generalCategory(let p):
521388
return try p.generateConsumer(opts)
522-
// fatalError("TODO: Map categories: \(p)")
523389

524390
case .binary(let prop, value: let value):
525391
let cons = try prop.generateConsumer(opts)

Sources/_StringProcessing/Engine/Processor.swift

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,14 +175,25 @@ extension Processor {
175175
// save point was restored
176176
mutating func consume(_ n: Distance) -> Bool {
177177
// TODO: needs benchmark coverage
178-
guard let idx = input.index(
178+
if let idx = input.index(
179179
currentPosition, offsetBy: n.rawValue, limitedBy: end
180-
) else {
181-
signalFailure()
182-
return false
180+
) {
181+
currentPosition = idx
182+
return true
183183
}
184-
currentPosition = idx
185-
return true
184+
185+
// If `end` falls in the middle of a character, and we are trying to advance
186+
// by one "character", then we should max out at `end` even though the above
187+
// advancement will result in `nil`.
188+
if n == 1, let idx = input.unicodeScalars.index(
189+
currentPosition, offsetBy: n.rawValue, limitedBy: end
190+
) {
191+
currentPosition = idx
192+
return true
193+
}
194+
195+
signalFailure()
196+
return false
186197
}
187198

188199
// Advances in unicode scalar view

0 commit comments

Comments
 (0)