Skip to content

Commit d6c073d

Browse files
committed
Recover from parser errors
Currently we use Swift error handling for parser errors. While this is convenient, it has a number of drawbacks: - Any AST parsed gets thrown away as soon as we encounter an error. This prevents clients from being able to get any useful information from invalid AST (rdar://93677069). - Multiple diagnostics cannot be issued, meaning that e.g a basic syntactic error could obscure a more useful semantic error. - It doesn't extend nicely to e.g warning diagnostics, meaning that we'd eventually end up with 2 ways of emitting diagnostics. - The thrown errors relied on `recordLoc` blocks to annotate them with source location info, which could lead to errors without location info if we forgot to add the appropriate `recordLoc` calls. Additionally, in some cases we want a more fine grained location info than the block would give us. Therefore this commit removes the use of Swift error handling throughout the parser. The parser is now a total function that _always_ returns an AST. If errors are encountered while parsing, they are recorded, and are attached to the resulting AST by the parser. The parser attempts to recover as much of the AST it can when encountering an error. As such, there is now are now `.invalid` atom and character property kinds. Sema then runs and can attach more diagnostics onto the AST. For now, the compiler interface remains the same, and we pick a single error to `throw`, but this will be changed in a later PR to allow multiple errors and warnings, as well as AST recovery. This also means we can better preserve the capture type in the presence of parser errors. Fortunately, in most cases, this is quite a mechanical transformation. It entails: - Moving the lexical analysis methods onto the `Parser`. We were already passing `ParsingContext` parameters for most of them, so it's not clear they were benefitting from the isolation that `Source` offered. Effectively this means that all parsing has access to the context and diagnostics. - Converting error throwing statements into calls to the parser's `error` method (or `unreachable` method for unreachables). This commit also updates the parser tests to be able to be able to match against multiple diagnostics.
1 parent fcbd10c commit d6c073d

File tree

12 files changed

+1942
-1684
lines changed

12 files changed

+1942
-1684
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ extension AST {
8080

8181
// (?i), (?i-m), ...
8282
case changeMatchingOptions(MatchingOptionSequence)
83+
84+
// An invalid atom created by a parse error.
85+
case invalid
8386
}
8487
}
8588
}
@@ -104,6 +107,7 @@ extension AST.Atom {
104107
case .any: return nil
105108
case .startOfLine: return nil
106109
case .endOfLine: return nil
110+
case .invalid: return nil
107111
}
108112
}
109113

@@ -465,6 +469,9 @@ extension AST.Atom.CharacterProperty {
465469
/// Some special properties implemented by Java.
466470
case javaSpecial(JavaSpecial)
467471

472+
/// An invalid property that has been diagnosed by the parser.
473+
case invalid(key: String?, value: String)
474+
468475
public enum MapKind: Hashable {
469476
case lowercase
470477
case uppercase
@@ -801,7 +808,7 @@ extension AST.Atom {
801808

802809
case .scalarSequence, .property, .any, .startOfLine, .endOfLine,
803810
.backreference, .subpattern, .callout, .backtrackingDirective,
804-
.changeMatchingOptions:
811+
.changeMatchingOptions, .invalid:
805812
return nil
806813
}
807814
}
@@ -815,6 +822,10 @@ extension AST.Atom {
815822
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
816823
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
817824
return true
825+
case .scalarSequence:
826+
// Unsupported for now (and we will diagnose as such), but treat it as a
827+
// valid range operand for better recovery.
828+
return true
818829
default:
819830
return false
820831
}
@@ -849,7 +860,7 @@ extension AST.Atom {
849860

850861
case .property, .escaped, .any, .startOfLine, .endOfLine,
851862
.backreference, .subpattern, .namedCharacter, .callout,
852-
.backtrackingDirective, .changeMatchingOptions:
863+
.backtrackingDirective, .changeMatchingOptions, .invalid:
853864
return nil
854865
}
855866
}

Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift

Lines changed: 64 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -9,25 +9,25 @@
99
//
1010
//===----------------------------------------------------------------------===//
1111

12-
extension Source {
12+
extension Parser {
1313
typealias PropertyKind = AST.Atom.CharacterProperty.Kind
1414

1515
static private func withNormalizedForms<T>(
16-
_ str: String, requireInPrefix: Bool = false, match: (String) throws -> T?
17-
) rethrows -> T? {
16+
_ str: String, requireInPrefix: Bool = false, match: (String) -> T?
17+
) -> T? {
1818
// This follows the rules provided by UAX44-LM3, including trying to drop an
1919
// "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for
2020
// consistency with other engines and the Unicode.Scalar.Properties names.
2121
let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" }
2222
.lowercased()
2323
if requireInPrefix {
2424
guard str.hasPrefix("in") else { return nil }
25-
return try match(String(str.dropFirst(2)))
25+
return match(String(str.dropFirst(2)))
2626
}
27-
if let m = try match(str) {
27+
if let m = match(str) {
2828
return m
2929
}
30-
if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) {
30+
if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) {
3131
return m
3232
}
3333
return nil
@@ -736,31 +736,40 @@ extension Source {
736736
return (major, minor)
737737
}
738738

739-
static func classifyCharacterPropertyValueOnly(
740-
_ value: String
741-
) throws -> PropertyKind {
742-
guard !value.isEmpty else { throw ParseError.emptyProperty }
739+
mutating func classifyCharacterPropertyValueOnly(
740+
_ valueLoc: Located<String>
741+
) -> PropertyKind {
742+
let value = valueLoc.value
743+
744+
func error(_ err: ParseError) -> PropertyKind {
745+
self.error(err, at: valueLoc.location)
746+
return .invalid(key: nil, value: value)
747+
}
748+
749+
guard !value.isEmpty else {
750+
return error(.emptyProperty)
751+
}
743752

744753
// Some special cases defined by UTS#18 (and Oniguruma for 'ANY' and
745754
// 'Assigned').
746-
if let specialProp = classifySpecialPropValue(value) {
755+
if let specialProp = Self.classifySpecialPropValue(value) {
747756
return specialProp
748757
}
749758

750759
// The following properties we can infer keys/values for.
751-
if let prop = classifyBoolProperty(value) {
760+
if let prop = Self.classifyBoolProperty(value) {
752761
return .binary(prop, value: true)
753762
}
754-
if let cat = classifyGeneralCategory(value) {
763+
if let cat = Self.classifyGeneralCategory(value) {
755764
return .generalCategory(cat)
756765
}
757-
if let script = classifyScriptProperty(value) {
766+
if let script = Self.classifyScriptProperty(value) {
758767
return .scriptExtension(script)
759768
}
760-
if let posix = classifyPOSIX(value) {
769+
if let posix = Self.classifyPOSIX(value) {
761770
return .posix(posix)
762771
}
763-
if let block = classifyBlockProperty(value, valueOnly: true) {
772+
if let block = Self.classifyBlockProperty(value, valueOnly: true) {
764773
return .block(block)
765774
}
766775

@@ -776,53 +785,67 @@ extension Source {
776785

777786
// TODO: This should be versioned, and do we want a more lax behavior for
778787
// the runtime?
779-
throw ParseError.unknownProperty(key: nil, value: value)
788+
return error(.unknownProperty(key: nil, value: value))
780789
}
781790

782-
static func classifyCharacterProperty(
783-
key: String, value: String
784-
) throws -> PropertyKind {
785-
guard !key.isEmpty && !value.isEmpty else { throw ParseError.emptyProperty }
791+
mutating func classifyCharacterProperty(
792+
key keyLoc: Located<String>, value valueLoc: Located<String>
793+
) -> PropertyKind {
794+
let key = keyLoc.value
795+
let value = valueLoc.value
796+
797+
func valueError(_ err: ParseError) -> PropertyKind {
798+
error(err, at: valueLoc.location)
799+
return .invalid(key: key, value: value)
800+
}
801+
802+
guard !key.isEmpty else {
803+
error(.emptyProperty, at: keyLoc.location)
804+
return .invalid(key: key, value: value)
805+
}
806+
guard !value.isEmpty else {
807+
return valueError(.emptyProperty)
808+
}
786809

787-
if let prop = classifyBoolProperty(key),
788-
let isTrue = classifyCharacterPropertyBoolValue(value) {
810+
if let prop = Self.classifyBoolProperty(key),
811+
let isTrue = Self.classifyCharacterPropertyBoolValue(value) {
789812
return .binary(prop, value: isTrue)
790813
}
791814

792815
// This uses the aliases defined in
793816
// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt.
794-
let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in
817+
let match = Self.withNormalizedForms(key) { normalizedKey -> PropertyKind? in
795818
switch normalizedKey {
796819
case "script", "sc":
797-
guard let script = classifyScriptProperty(value) else {
798-
throw ParseError.unrecognizedScript(value)
820+
guard let script = Self.classifyScriptProperty(value) else {
821+
return valueError(.unrecognizedScript(value))
799822
}
800823
return .script(script)
801824
case "scriptextensions", "scx":
802-
guard let script = classifyScriptProperty(value) else {
803-
throw ParseError.unrecognizedScript(value)
825+
guard let script = Self.classifyScriptProperty(value) else {
826+
return valueError(.unrecognizedScript(value))
804827
}
805828
return .scriptExtension(script)
806829
case "gc", "generalcategory":
807-
guard let cat = classifyGeneralCategory(value) else {
808-
throw ParseError.unrecognizedCategory(value)
830+
guard let cat = Self.classifyGeneralCategory(value) else {
831+
return valueError(.unrecognizedCategory(value))
809832
}
810833
return .generalCategory(cat)
811834
case "age":
812-
guard let (major, minor) = parseAge(value) else {
813-
throw ParseError.invalidAge(value)
835+
guard let (major, minor) = Self.parseAge(value) else {
836+
return valueError(.invalidAge(value))
814837
}
815838
return .age(major: major, minor: minor)
816839
case "name", "na":
817840
return .named(value)
818841
case "numericvalue", "nv":
819842
guard let numericValue = Double(value) else {
820-
throw ParseError.invalidNumericValue(value)
843+
return valueError(.invalidNumericValue(value))
821844
}
822845
return .numericValue(numericValue)
823846
case "numerictype", "nt":
824-
guard let type = classifyNumericType(value) else {
825-
throw ParseError.unrecognizedNumericType(value)
847+
guard let type = Self.classifyNumericType(value) else {
848+
return valueError(.unrecognizedNumericType(value))
826849
}
827850
return .numericType(type)
828851
case "slc", "simplelowercasemapping":
@@ -833,13 +856,13 @@ extension Source {
833856
return .mapping(.titlecase, value)
834857
case "ccc", "canonicalcombiningclass":
835858
guard let cccValue = UInt8(value), cccValue <= 254 else {
836-
throw ParseError.invalidCCC(value)
859+
return valueError(.invalidCCC(value))
837860
}
838861
return .ccc(.init(rawValue: cccValue))
839862

840863
case "blk", "block":
841-
guard let block = classifyBlockProperty(value, valueOnly: false) else {
842-
throw ParseError.unrecognizedBlock(value)
864+
guard let block = Self.classifyBlockProperty(value, valueOnly: false) else {
865+
return valueError(.unrecognizedBlock(value))
843866
}
844867
return .block(block)
845868
default:
@@ -852,6 +875,8 @@ extension Source {
852875
}
853876
// TODO: This should be versioned, and do we want a more lax behavior for
854877
// the runtime?
855-
throw ParseError.unknownProperty(key: key, value: value)
878+
error(.unknownProperty(key: key, value: value),
879+
at: keyLoc.location.union(with: valueLoc.location))
880+
return .invalid(key: key, value: value)
856881
}
857882
}

0 commit comments

Comments
 (0)