Skip to content

Tighten up some syntax rules #393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion Sources/_RegexParser/Regex/AST/AST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ extension AST {
/// Comments, non-semantic whitespace, etc
case trivia(Trivia)

/// Intepolation `<{...}>`, currently reserved for future use.
case interpolation(Interpolation)

case atom(Atom)

case customCharacterClass(CustomCharacterClass)
Expand All @@ -78,6 +81,7 @@ extension AST.Node {
case let .quantification(v): return v
case let .quote(v): return v
case let .trivia(v): return v
case let .interpolation(v): return v
case let .atom(v): return v
case let .customCharacterClass(v): return v
case let .empty(v): return v
Expand Down Expand Up @@ -128,7 +132,7 @@ extension AST.Node {
case .group, .conditional, .customCharacterClass, .absentFunction:
return true
case .alternation, .concatenation, .quantification, .quote, .trivia,
.empty:
.empty, .interpolation:
return false
}
}
Expand Down Expand Up @@ -192,6 +196,16 @@ extension AST {
}
}

public struct Interpolation: Hashable, _ASTNode {
public let contents: String
public let location: SourceLocation

public init(_ contents: String, _ location: SourceLocation) {
self.contents = contents
self.location = location
}
}

public struct Empty: Hashable, _ASTNode {
public let location: SourceLocation

Expand Down
2 changes: 1 addition & 1 deletion Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ extension AST.Node {
case .alternation, .concatenation, .group,
.conditional, .quantification, .quote,
.trivia, .customCharacterClass, .empty,
.absentFunction:
.absentFunction, .interpolation:
return nil
}
}
Expand Down
2 changes: 1 addition & 1 deletion Sources/_RegexParser/Regex/Parse/CaptureList.swift
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ extension AST.Node {
break
}

case .quote, .trivia, .atom, .customCharacterClass, .empty:
case .quote, .trivia, .atom, .customCharacterClass, .empty, .interpolation:
break
}
}
Expand Down
3 changes: 3 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ enum ParseError: Error, Hashable {
case expectedNonEmptyContents
case expectedEscape
case invalidEscape(Character)
case confusableCharacter(Character)

case cannotReferToWholePattern

Expand Down Expand Up @@ -128,6 +129,8 @@ extension ParseError: CustomStringConvertible {
return "expected escape sequence"
case .invalidEscape(let c):
return "invalid escape sequence '\\\(c)'"
case .confusableCharacter(let c):
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
case .cannotReferToWholePattern:
return "cannot refer to whole pattern here"
case .quantifierRequiresOperand(let q):
Expand Down
42 changes: 37 additions & 5 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,26 @@ extension Source {
return AST.Quote(str.value, str.location)
}

/// Try to consume an interpolation sequence.
///
/// Interpolation -> '<{' String '}>'
///
mutating func lexInterpolation() throws -> AST.Interpolation? {
let contents = try recordLoc { src -> String? in
try src.tryEating { src in
guard src.tryEat(sequence: "<{") else { return nil }
_ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") }
guard src.tryEat(sequence: "}>") else { return nil }

// Not currently supported. We error here instead of during Sema to
// get a better error for something like `(<{)}>`.
throw ParseError.unsupported("interpolation")
}
}
guard let contents = contents else { return nil }
return .init(contents.value, contents.location)
}

/// Try to consume a comment
///
/// Comment -> '(?#' [^')']* ')'
Expand Down Expand Up @@ -1674,9 +1694,10 @@ extension Source {
break
}

// We only allow unknown escape sequences for non-letter ASCII, and
// non-ASCII whitespace.
guard (char.isASCII && !char.isLetter) ||
// We only allow unknown escape sequences for non-letter non-number ASCII,
// and non-ASCII whitespace.
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
guard (char.isASCII && !char.isLetter && !char.isNumber) ||
(!char.isASCII && char.isWhitespace)
else {
throw ParseError.invalidEscape(char)
Expand Down Expand Up @@ -1981,10 +2002,21 @@ extension Source {

case "]":
assert(!customCC, "parser should have prevented this")
fallthrough
break

default: return .char(char)
default:
// Reject non-letter non-number non-`\r\n` ASCII characters that have
// multiple scalars. These may be confusable for metacharacters, e.g
// `[\u{301}]` wouldn't be interpreted as a custom character class due
// to the combining accent (assuming it is literal, not `\u{...}`).
let scalars = char.unicodeScalars
if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" &&
!char.isLetter && !char.isNumber {
throw ParseError.confusableCharacter(char)
}
break
}
return .char(char)
}
guard let kind = kind else { return nil }
return AST.Atom(kind.value, kind.location)
Expand Down
7 changes: 7 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,13 @@ extension Parser {
result.append(.quote(quote))
continue
}

// Interpolation -> `lexInterpolation`
if let interpolation = try source.lexInterpolation() {
result.append(.interpolation(interpolation))
continue
}

// Quantification -> QuantOperand Quantifier?
if let operand = try parseQuantifierOperand() {
if let (amt, kind, trivia) =
Expand Down
5 changes: 5 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ extension RegexValidator {
// These are Oniguruma specific.
throw error(.unsupported("absent function"), at: a.location)

case .interpolation(let i):
// This is currently rejected in the parser for better diagnostics, but
// reject here too until we get runtime support.
throw error(.unsupported("interpolation"), at: i.location)

case .quote, .trivia, .empty:
break
}
Expand Down
4 changes: 4 additions & 0 deletions Sources/_RegexParser/Regex/Printing/DumpAST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ extension AST.Trivia {
}
}

extension AST.Interpolation {
public var _dumpBase: String { "interpolation <\(contents)>" }
}

extension AST.Empty {
public var _dumpBase: String { "" }
}
Expand Down
9 changes: 9 additions & 0 deletions Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ extension PrettyPrinter {
case let .trivia(t):
output(t._canonicalBase)

case let .interpolation(i):
output(i._canonicalBase)

case let .atom(a):
output(a._canonicalBase)

Expand Down Expand Up @@ -178,6 +181,12 @@ extension AST.Quote {
}
}

extension AST.Interpolation {
var _canonicalBase: String {
"<{\(contents)}>"
}
}

extension AST.Group.Kind {
var _canonicalBase: String {
switch self {
Expand Down
3 changes: 3 additions & 0 deletions Sources/_StringProcessing/Regex/ASTConversion.swift
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ extension AST.Node {
case let .trivia(v):
return .trivia(v.contents)

case .interpolation:
throw Unsupported("TODO: interpolation")

case let .atom(v):
switch v.kind {
case .scalarSequence(let seq):
Expand Down
6 changes: 6 additions & 0 deletions Tests/RegexTests/MatchTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,12 @@ extension RegexTests {
firstMatchTest(
#"abc\d"#, input: "xyzabc123", match: "abc1")

// MARK: Allowed combining characters

firstMatchTest("e\u{301}", input: "e\u{301}", match: "e\u{301}")
firstMatchTest("1\u{358}", input: "1\u{358}", match: "1\u{358}")
firstMatchTest(#"\ \#u{361}"#, input: " \u{361}", match: " \u{361}")

// MARK: Alternations

firstMatchTest(
Expand Down
66 changes: 54 additions & 12 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,12 @@ extension RegexTests {
#"abc\d"#,
concat("a", "b", "c", escaped(.decimalDigit)))

// MARK: Allowed combining characters

parseTest("e\u{301}", "e\u{301}")
parseTest("1\u{358}", "1\u{358}")
parseTest(#"\ \#u{361}"#, " \u{361}")

// MARK: Alternations

parseTest(
Expand Down Expand Up @@ -466,14 +472,6 @@ extension RegexTests {
parseTest(#"[\08]"#, charClass(scalar_m("\u{0}"), "8"))
parseTest(#"[\0707]"#, charClass(scalar_m("\u{1C7}")))

// TODO: These are treated as octal sequences by PCRE, we should warn and
// suggest user prefix with 0.
parseTest(#"[\1]"#, charClass("1"))
parseTest(#"[\123]"#, charClass("1", "2", "3"))
parseTest(#"[\101]"#, charClass("1", "0", "1"))
parseTest(#"[\7777]"#, charClass("7", "7", "7", "7"))
parseTest(#"[\181]"#, charClass("1", "8", "1"))

// We take *up to* the first two valid digits for \x. No valid digits is 0.
parseTest(#"\x"#, scalar("\u{0}"))
parseTest(#"\x5"#, scalar("\u{5}"))
Expand All @@ -484,6 +482,8 @@ extension RegexTests {
parseTest(#"\u{ a }"#, scalar("\u{A}"))
parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}")))

parseTest(#"[\u{301}]"#, charClass(scalar_m("\u{301}")))

// MARK: Scalar sequences

parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}"))
Expand Down Expand Up @@ -804,6 +804,20 @@ extension RegexTests {
#"a(?#. comment)b"#,
concat("a", "b"))

// MARK: Interpolation

// These are literal as there's no closing '}>'
parseTest("<{", concat("<", "{"))
parseTest("<{a", concat("<", "{", "a"))
parseTest("<{a}", concat("<", "{", "a", "}"))
parseTest("<{<{}", concat("<", "{", "<", "{", "}"))

// Literal as escaped
parseTest(#"\<{}>"#, concat("<", "{", "}", ">"))

// A quantification
parseTest(#"<{2}"#, exactly(2, of: "<"))

// MARK: Quantification

parseTest("a*", zeroOrMore(of: "a"))
Expand Down Expand Up @@ -1267,10 +1281,6 @@ extension RegexTests {
parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported)
parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported)

// Backreferences are not valid in custom character classes.
parseTest(#"[\8]"#, charClass("8"))
parseTest(#"[\9]"#, charClass("9"))

// These are valid references.
parseTest(#"()\1"#, concat(
capture(empty()), backreference(.absolute(1))
Expand Down Expand Up @@ -2547,6 +2557,17 @@ extension RegexTests {
// TODO: Custom diagnostic for missing '\Q'
diagnosticTest(#"\E"#, .invalidEscape("E"))

// PCRE treats these as octal, but we require a `0` prefix.
diagnosticTest(#"[\1]"#, .invalidEscape("1"))
diagnosticTest(#"[\123]"#, .invalidEscape("1"))
diagnosticTest(#"[\101]"#, .invalidEscape("1"))
diagnosticTest(#"[\7777]"#, .invalidEscape("7"))
diagnosticTest(#"[\181]"#, .invalidEscape("1"))

// Backreferences are not valid in custom character classes.
diagnosticTest(#"[\8]"#, .invalidEscape("8"))
diagnosticTest(#"[\9]"#, .invalidEscape("9"))

// Non-ASCII non-whitespace cases.
diagnosticTest(#"\🔥"#, .invalidEscape("🔥"))
diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰"))
Expand All @@ -2555,6 +2576,27 @@ extension RegexTests {
diagnosticTest(#"\˂"#, .invalidEscape("˂"))
diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}"))

// MARK: Confusable characters

diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}"))
diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}"))
diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}"))
diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#))
diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}"))
diagnosticTest("$\u{35E}", .confusableCharacter("$\u{35E}"))
diagnosticTest(".\u{35F}", .confusableCharacter(".\u{35F}"))
diagnosticTest("|\u{360}", .confusableCharacter("|\u{360}"))
diagnosticTest(" \u{361}", .confusableCharacter(" \u{361}"))

// MARK: Interpolation (currently unsupported)

diagnosticTest("<{}>", .unsupported("interpolation"))
diagnosticTest("<{...}>", .unsupported("interpolation"))
diagnosticTest("<{)}>", .unsupported("interpolation"))
diagnosticTest("<{}}>", .unsupported("interpolation"))
diagnosticTest("<{<{}>", .unsupported("interpolation"))
diagnosticTest("(<{)}>", .unsupported("interpolation"))

// MARK: Character properties

diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))
Expand Down