Skip to content

Expanded options #246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -111,30 +111,41 @@ extension Compiler.ByteCodeGen {
}

case .startOfLine:
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex ||
input[input.index(before: pos)].isNewline
if options.anchorsMatchNewlines {
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex || input[input.index(before: pos)].isNewline
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.startIndex
}
}

case .endOfLine:
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex || input[pos].isNewline
if options.anchorsMatchNewlines {
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex || input[pos].isNewline
}
} else {
builder.buildAssert { (input, pos, bounds) in
pos == input.endIndex
}
}

case .wordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { (input, pos, bounds) in
builder.buildAssert { [options] (input, pos, bounds) in
// TODO: How should we handle bounds?
CharacterClass.word.isBoundary(
input, at: pos, bounds: bounds)
input, at: pos, bounds: bounds, with: options)
}

case .notWordBoundary:
// TODO: May want to consider Unicode level
builder.buildAssert { (input, pos, bounds) in
builder.buildAssert { [options] (input, pos, bounds) in
// TODO: How should we handle bounds?
!CharacterClass.word.isBoundary(
input, at: pos, bounds: bounds)
input, at: pos, bounds: bounds, with: options)
}
}
}
Expand Down
74 changes: 49 additions & 25 deletions Sources/_StringProcessing/CharacterClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ public struct CharacterClass: Hashable {
var op: SetOperator
var rhs: CharacterSetComponent

public func matches(_ c: Character) -> Bool {
public func matches(_ c: Character, with options: MatchingOptions) -> Bool {
switch op {
case .intersection:
return lhs.matches(c) && rhs.matches(c)
return lhs.matches(c, with: options) && rhs.matches(c, with: options)
case .subtraction:
return lhs.matches(c) && !rhs.matches(c)
return lhs.matches(c, with: options) && !rhs.matches(c, with: options)
case .symmetricDifference:
return lhs.matches(c) != rhs.matches(c)
return lhs.matches(c, with: options) != rhs.matches(c, with: options)
}
}
}
Expand All @@ -87,14 +87,28 @@ public struct CharacterClass: Hashable {
.setOperation(.init(lhs: lhs, op: op, rhs: rhs))
}

public func matches(_ character: Character) -> Bool {
public func matches(_ character: Character, with options: MatchingOptions) -> Bool {
switch self {
case .character(let c): return c == character
case .range(let range): return range.contains(character)
case .character(let c):
if options.isCaseInsensitive {
return c.lowercased() == character.lowercased()
} else {
return c == character
}
case .range(let range):
if options.isCaseInsensitive {
let newLower = range.lowerBound.lowercased()
let newUpper = range.upperBound.lowercased()
// FIXME: Is failing this possible? Is this the right behavior if so?
guard newLower <= newUpper else { return false }
return (newLower...newUpper).contains(character.lowercased())
} else {
return range.contains(character)
}
case .characterClass(let custom):
let str = String(character)
return custom.matches(in: str, at: str.startIndex) != nil
case .setOperation(let op): return op.matches(character)
return custom.matches(in: str, at: str.startIndex, with: options) != nil
case .setOperation(let op): return op.matches(character, with: options)
}
}
}
Expand Down Expand Up @@ -135,21 +149,26 @@ public struct CharacterClass: Hashable {

/// Returns the end of the match of this character class in `str`, if
/// it matches.
public func matches(in str: String, at i: String.Index) -> String.Index? {
public func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? {
switch matchLevel {
case .graphemeCluster:
let c = str[i]
var matched: Bool
switch cc {
case .any, .anyGrapheme: matched = true
case .digit: matched = c.isNumber
case .hexDigit: matched = c.isHexDigit
case .digit:
matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits)
case .hexDigit:
matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits)
case .horizontalWhitespace: fatalError("Not implemented")
case .newlineSequence: matched = c.isNewline
case .newlineSequence:
matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
case .verticalWhitespace: fatalError("Not implemented")
case .whitespace: matched = c.isWhitespace
case .word: matched = c.isWordCharacter
case .custom(let set): matched = set.any { $0.matches(c) }
case .whitespace:
matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
case .word:
matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord)
case .custom(let set): matched = set.any { $0.matches(c, with: options) }
}
if isInverted {
matched.toggle()
Expand All @@ -161,13 +180,17 @@ public struct CharacterClass: Hashable {
switch cc {
case .any: matched = true
case .anyGrapheme: fatalError("Not matched in this mode")
case .digit: matched = c.properties.numericType != nil
case .hexDigit: matched = Character(c).isHexDigit
case .digit:
matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits)
case .hexDigit:
matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits)
case .horizontalWhitespace: fatalError("Not implemented")
case .newlineSequence: fatalError("Not implemented")
case .verticalWhitespace: fatalError("Not implemented")
case .whitespace: matched = c.properties.isWhitespace
case .word: matched = c.properties.isAlphabetic || c == "_"
case .whitespace:
matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
case .word:
matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord)
case .custom: fatalError("Not supported")
}
if isInverted {
Expand Down Expand Up @@ -495,21 +518,22 @@ extension CharacterClass {
func isBoundary(
_ input: String,
at pos: String.Index,
bounds: Range<String.Index>
bounds: Range<String.Index>,
with options: MatchingOptions
) -> Bool {
// FIXME: How should we handle bounds?
// We probably need two concepts
if input.isEmpty { return false }
if pos == input.startIndex {
return self.matches(in: input, at: pos) != nil
return self.matches(in: input, at: pos, with: options) != nil
}
let priorIdx = input.index(before: pos)
if pos == input.endIndex {
return self.matches(in: input, at: priorIdx) != nil
return self.matches(in: input, at: priorIdx, with: options) != nil
}

let prior = self.matches(in: input, at: priorIdx) != nil
let current = self.matches(in: input, at: pos) != nil
let prior = self.matches(in: input, at: priorIdx, with: options) != nil
let current = self.matches(in: input, at: pos, with: options) != nil
return prior != current
}

Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ extension AST.Atom {
) {
return { input, bounds in
// FIXME: should we worry about out of bounds?
cc.matches(in: input, at: bounds.lowerBound)
cc.matches(in: input, at: bounds.lowerBound, with: opts)
}
}

Expand Down
27 changes: 24 additions & 3 deletions Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import _RegexParser

/// A type that represents the current state of regex matching options, with
/// stack-based scoping.
struct MatchingOptions {
public struct MatchingOptions {
fileprivate var stack: [Representation]

fileprivate func _invariantCheck() {
Expand Down Expand Up @@ -67,11 +67,32 @@ extension MatchingOptions {
stack.last!.contains(.singleLine)
}

/// Whether the topmost entry of the option stack contains `.multiline`.
///
/// Traps if the option stack is empty.
var anchorsMatchNewlines: Bool {
  let current = stack.last!
  return current.contains(.multiline)
}

/// Whether the topmost entry of the option stack contains either
/// `.asciiOnlyWord` or `.asciiOnlyPOSIXProps`.
///
/// Traps if the option stack is empty.
var usesASCIIWord: Bool {
  let current = stack.last!
  if current.contains(.asciiOnlyWord) {
    return true
  }
  return current.contains(.asciiOnlyPOSIXProps)
}

/// Whether the topmost entry of the option stack contains either
/// `.asciiOnlyDigit` or `.asciiOnlyPOSIXProps`.
///
/// Traps if the option stack is empty.
var usesASCIIDigits: Bool {
  let current = stack.last!
  if current.contains(.asciiOnlyDigit) {
    return true
  }
  return current.contains(.asciiOnlyPOSIXProps)
}

/// Whether the topmost entry of the option stack contains either
/// `.asciiOnlySpace` or `.asciiOnlyPOSIXProps`.
///
/// Traps if the option stack is empty.
var usesASCIISpaces: Bool {
  let current = stack.last!
  if current.contains(.asciiOnlySpace) {
    return true
  }
  return current.contains(.asciiOnlyPOSIXProps)
}

/// Whether the topmost entry of the option stack does *not* contain
/// `.unicodeWordBoundaries`.
///
/// Traps if the option stack is empty.
var usesSimpleUnicodeBoundaries: Bool {
  let current = stack.last!
  let hasUnicodeBoundaries = current.contains(.unicodeWordBoundaries)
  return !hasUnicodeBoundaries
}

enum SemanticLevel {
case graphemeCluster
case unicodeScalar
// TODO: include?
// case byte
}

var semanticLevel: SemanticLevel {
Expand Down
Loading