Skip to content

Implement semantic diagnostics #379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Sources/PatternConverter/PatternConverter.swift
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ struct PatternConverter: ParsableCommand {
print("Converting '\(delim)\(regex)\(delim)'")

let ast = try _RegexParser.parse(
regex,
regex, .semantic,
experimentalSyntax ? .experimental : .traditional)

// Show rendered source ranges
Expand Down
19 changes: 19 additions & 0 deletions Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,23 @@ extension AST.Atom.EscapedBuiltin {
return nil
}
}

public var isQuantifiable: Bool {
switch self {
case .alarm, .escape, .formfeed, .newline, .carriageReturn, .tab,
.singleDataUnit, .decimalDigit, .notDecimalDigit, .horizontalWhitespace,
.notHorizontalWhitespace, .notNewline, .newlineSequence, .whitespace,
.notWhitespace, .verticalTab, .notVerticalTab, .wordCharacter,
.notWordCharacter, .backspace, .graphemeCluster, .trueAnychar:
return true

case .wordBoundary, .notWordBoundary, .startOfSubject,
.endOfSubjectBeforeNewline, .endOfSubject,
.firstMatchingPositionInSubject, .resetStartOfMatch, .textSegment,
.notTextSegment:
return false
}
}
}

extension AST.Atom {
Expand Down Expand Up @@ -749,6 +766,8 @@ extension AST.Atom {
case .changeMatchingOptions:
return false
// TODO: Are callouts quantifiable?
case .escaped(let esc):
return esc.isQuantifiable
default:
return true
}
Expand Down
15 changes: 10 additions & 5 deletions Sources/_RegexParser/Regex/Parse/CaptureList.swift
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,18 @@ extension CaptureList {
public var name: String?
public var type: Any.Type?
public var optionalDepth: Int
public var location: SourceLocation

public init(
name: String? = nil,
type: Any.Type? = nil,
optionalDepth: Int
optionalDepth: Int,
_ location: SourceLocation
) {
self.name = name
self.type = type
self.optionalDepth = optionalDepth
self.location = location
}
}
}
Expand All @@ -61,13 +64,14 @@ extension AST.Node {
case let .group(g):
switch g.kind.value {
case .capture:
list.append(.init(optionalDepth: nesting))
list.append(.init(optionalDepth: nesting, g.location))

case .namedCapture(let name):
list.append(.init(name: name.value, optionalDepth: nesting))
list.append(.init(name: name.value, optionalDepth: nesting, g.location))

case .balancedCapture(let b):
list.append(.init(name: b.name?.value, optionalDepth: nesting))
list.append(.init(name: b.name?.value, optionalDepth: nesting,
g.location))

default: break
}
Expand Down Expand Up @@ -124,7 +128,8 @@ extension CaptureList.Capture: Equatable {
public static func == (lhs: Self, rhs: Self) -> Bool {
lhs.name == rhs.name &&
lhs.optionalDepth == rhs.optionalDepth &&
lhs.type == rhs.type
lhs.type == rhs.type &&
lhs.location == rhs.location
}
}
extension CaptureList: Equatable {}
Expand Down
2 changes: 1 addition & 1 deletion Sources/_RegexParser/Regex/Parse/CompilerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral(
_ input: String, captureBufferOut: UnsafeMutableRawBufferPointer
) throws -> (regexToEmit: String, version: Int) {
do {
let ast = try parseWithDelimiters(input)
let ast = try parseWithDelimiters(input, .semantic)
// Serialize the capture structure for later type inference.
assert(captureBufferOut.count >= input.utf8.count)
ast.captureStructure.encode(to: captureBufferOut)
Expand Down
35 changes: 31 additions & 4 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ enum ParseError: Error, Hashable {
// TODO: I wonder if it makes sense to store the string.
// This can make equality weird.

// MARK: Syntactic Errors

case numberOverflow(String)
case expectedNumDigits(String, Int)
case expectedNumber(String, kind: RadixKind)
Expand Down Expand Up @@ -43,7 +45,6 @@ enum ParseError: Error, Hashable {

case cannotReferToWholePattern

case notQuantifiable
case quantifierRequiresOperand(String)

case backtrackingDirectiveMustHaveName(String)
Expand All @@ -55,7 +56,6 @@ enum ParseError: Error, Hashable {
case cannotRemoveMatchingOptionsAfterCaret

case expectedCustomCharacterClassMembers
case invalidCharacterClassRangeOperand

case emptyProperty
case unknownProperty(key: String?, value: String)
Expand All @@ -73,6 +73,17 @@ enum ParseError: Error, Hashable {
case cannotRemoveExtendedSyntaxInMultilineMode

case expectedCalloutArgument

// MARK: Semantic Errors

case unsupported(String)
case deprecatedUnicode(String)
case invalidReference(Int)
case duplicateNamedCapture(String)
case invalidCharacterClassRangeOperand
case invalidQuantifierRange(Int, Int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same enum or separate enum? (I haven't thought about it)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially added them as a separate enum, but it seemed cleaner to do it this way as they share all the same logic as other parser errors for e.g printing and catch block handling. This for example means they can use the same testing logic as other parsing tests. We could split them out in the future, but for now at least I think this is the simplest way to go.

case invalidCharacterRange(from: Character, to: Character)
case notQuantifiable
}

extension IdentifierKind {
Expand All @@ -88,6 +99,7 @@ extension IdentifierKind {
extension ParseError: CustomStringConvertible {
var description: String {
switch self {
// MARK: Syntactic Errors
case let .numberOverflow(s):
return "number overflow: \(s)"
case let .expectedNumDigits(s, i):
Expand All @@ -114,8 +126,6 @@ extension ParseError: CustomStringConvertible {
return "invalid escape sequence '\\\(c)'"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .notQuantifiable:
return "expression is not quantifiable"
case .quantifierRequiresOperand(let q):
return "quantifier '\(q)' must appear after expression"
case .backtrackingDirectiveMustHaveName(let b):
Expand Down Expand Up @@ -167,6 +177,23 @@ extension ParseError: CustomStringConvertible {
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"

// MARK: Semantic Errors

case let .unsupported(kind):
return "\(kind) is not currently supported"
case let .deprecatedUnicode(kind):
return "\(kind) is a deprecated Unicode property, and is not supported"
case let .invalidReference(i):
return "no capture numbered \(i)"
case let .duplicateNamedCapture(str):
return "group named '\(str)' already exists"
case let .invalidQuantifierRange(lhs, rhs):
return "range lower bound '\(lhs)' must be less than or equal to upper bound '\(rhs)'"
case let .invalidCharacterRange(from: lhs, to: rhs):
return "character '\(lhs)' must compare less than or equal to '\(rhs)'"
case .notQuantifiable:
return "expression is not quantifiable"
}
}
}
Expand Down
35 changes: 23 additions & 12 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,6 @@ extension Parser {
if let (amt, kind, trivia) =
try source.lexQuantifier(context: context) {
let location = loc(_start)
guard operand.isQuantifiable else {
throw Source.LocatedError(ParseError.notQuantifiable, location)
}
result.append(.quantification(
.init(amt, kind, operand, location, trivia: trivia)))
} else {
Expand Down Expand Up @@ -543,11 +540,6 @@ extension Parser {
// Range between atoms.
if let (dashLoc, rhs) =
try source.lexCustomCharClassRangeEnd(context: context) {
guard atom.isValidCharacterClassRangeBound &&
rhs.isValidCharacterClassRangeBound else {
throw ParseError.invalidCharacterClassRangeOperand
}
// TODO: Validate lower <= upper?
members.append(.range(.init(atom, dashLoc, rhs)))
continue
}
Expand All @@ -558,13 +550,31 @@ extension Parser {
}
}

public enum ASTStage {
/// The regex is parsed, and a syntactically valid AST is returned. Otherwise
/// an error is thrown. This is useful for e.g syntax coloring.
case syntactic

/// The regex is parsed, and a syntactically and semantically valid AST is
/// returned. Otherwise an error is thrown. A semantically valid AST has been
/// checked for e.g unsupported constructs and invalid backreferences.
case semantic
}

public func parse<S: StringProtocol>(
_ regex: S, _ syntax: SyntaxOptions
_ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions
) throws -> AST where S.SubSequence == Substring
{
let source = Source(String(regex))
var parser = Parser(source, syntax: syntax)
return try parser.parse()
let ast = try parser.parse()
switch stage {
case .syntactic:
break
case .semantic:
try validate(ast)
}
return ast
}

/// Retrieve the default set of syntax options that a delimiter and literal
Expand All @@ -591,11 +601,12 @@ fileprivate func defaultSyntaxOptions(
/// Parses a given regex string with delimiters, inferring the syntax options
/// from the delimiters used.
public func parseWithDelimiters<S: StringProtocol>(
_ regex: S
_ regex: S, _ stage: ASTStage
) throws -> AST where S.SubSequence == Substring {
let (contents, delim) = droppingRegexDelimiters(String(regex))
do {
return try parse(contents, defaultSyntaxOptions(delim, contents: contents))
let syntax = defaultSyntaxOptions(delim, contents: contents)
return try parse(contents, stage, syntax)
} catch let error as LocatedErrorProtocol {
// Convert the range in 'contents' to the range in 'regex'.
let delimCount = delim.opening.count
Expand Down
Loading