-
Notifications
You must be signed in to change notification settings - Fork 49
Fix scalar matching in grapheme semantic mode #565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,45 @@ extension DSLTree._AST.Atom { | |
} | ||
} | ||
|
||
extension Character { | ||
func generateConsumer( | ||
_ opts: MatchingOptions | ||
) throws -> MEProgram.ConsumeFunction? { | ||
let isCaseInsensitive = opts.isCaseInsensitive | ||
switch opts.semanticLevel { | ||
case .graphemeCluster: | ||
return { input, bounds in | ||
let low = bounds.lowerBound | ||
if isCaseInsensitive && isCased { | ||
return input[low].lowercased() == lowercased() | ||
? input.index(after: low) | ||
: nil | ||
} else { | ||
return input[low] == self | ||
? input.index(after: low) | ||
: nil | ||
} | ||
} | ||
case .unicodeScalar: | ||
// TODO: This should only be reachable from character class emission, can | ||
// we guarantee that? Otherwise we'd want a different matching behavior. | ||
let consumers = unicodeScalars.map { s in consumeScalar { | ||
isCaseInsensitive | ||
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping | ||
: $0 == s | ||
}} | ||
return { input, bounds in | ||
for fn in consumers { | ||
if let idx = fn(input, bounds) { | ||
return idx | ||
} | ||
} | ||
return nil | ||
} | ||
} | ||
} | ||
} | ||
|
||
extension DSLTree.Atom { | ||
var singleScalarASCIIValue: UInt8? { | ||
switch self { | ||
|
@@ -72,44 +111,15 @@ extension DSLTree.Atom { | |
func generateConsumer( | ||
_ opts: MatchingOptions | ||
) throws -> MEProgram.ConsumeFunction? { | ||
let isCaseInsensitive = opts.isCaseInsensitive | ||
|
||
switch self { | ||
case let .char(c): | ||
if opts.semanticLevel == .graphemeCluster { | ||
return { input, bounds in | ||
let low = bounds.lowerBound | ||
if isCaseInsensitive && c.isCased { | ||
return input[low].lowercased() == c.lowercased() | ||
? input.index(after: low) | ||
: nil | ||
} else { | ||
return input[low] == c | ||
? input.index(after: low) | ||
: nil | ||
} | ||
} | ||
} else { | ||
let consumers = c.unicodeScalars.map { s in consumeScalar { | ||
isCaseInsensitive | ||
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping | ||
: $0 == s | ||
}} | ||
return { input, bounds in | ||
for fn in consumers { | ||
if let idx = fn(input, bounds) { | ||
return idx | ||
} | ||
} | ||
return nil | ||
} | ||
} | ||
return try c.generateConsumer(opts) | ||
|
||
case let .scalar(s): | ||
return consumeScalar { | ||
isCaseInsensitive | ||
? $0.properties.lowercaseMapping == s.properties.lowercaseMapping | ||
: $0 == s | ||
} | ||
// A scalar always matches the same as a single scalar character. This | ||
// means it must match a whole grapheme in grapheme semantic mode, but | ||
// can match a single scalar in scalar semantic mode. | ||
return try Character(s).generateConsumer(opts) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems pretty convoluted. Why do we not just emit a scalar consuming instruction in scalar semantic mode and set the grapheme boundary bit? CC @rctcwyvrn to see if we need to get some of her instructions in first. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we do that within a consumer matching function? I know we eventually want to migrate off consumer functions, but for now this will generate a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This path should only be reached for non-ascii custom character classes so I don't think we can emit an instruction instead of a consume fn in that case. Maybe we could if we made custom character classes emit in the same way we do alternations? |
||
|
||
case .any: | ||
// FIXME: Should this be a total ordering? | ||
|
@@ -211,16 +221,20 @@ extension AST.Atom { | |
var singleScalar: UnicodeScalar? { | ||
switch kind { | ||
case .scalar(let s): return s.value | ||
case .escaped(let e): | ||
guard let s = e.scalarValue else { return nil } | ||
return s | ||
default: return nil | ||
} | ||
} | ||
|
||
var singleScalarASCIIValue: UInt8? { | ||
if let s = singleScalar, s.isASCII { | ||
return UInt8(ascii: s) | ||
} | ||
switch kind { | ||
case let .char(c) where c != "\r\n": | ||
return c.asciiValue | ||
case let .scalar(s) where s.value.isASCII: | ||
return UInt8(ascii: s.value) | ||
default: | ||
return nil | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -216,7 +216,7 @@ extension AST.Atom { | |
|
||
switch self.kind { | ||
case let .char(c): return .char(c) | ||
case let .scalar(s): return .char(Character(s.value)) | ||
case let .scalar(s): return .scalar(s.value) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want the behaviour of regex to be different depending on if we input a unicode value as its scalar code or as a literal character? ie There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think they should be semantically equivalent (i.e they go through the exact same code paths in byte code emission), but this distinction allows us to have more accurate printing of the DSL tree when performing a literal -> DSL conversion There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (if we didn't need the distinction for printing, IMO it would make sense to outright remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If they behave the same then why would we need to print them differently? Printing them differently would imply that they behave differently I think. Also yea, seeing as we're moving towards making a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
For example, if the user wrote Regex {
"e\u{301}"
} instead of: Regex {
"é"
} Semantically they're the same, but IMO it's much nicer to preserve the original syntax the user wrote. |
||
case .any: return .any | ||
case let .backreference(r): return .backreference(.init(ast: r)) | ||
case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1120,6 +1120,45 @@ class RegexDSLTests: XCTestCase { | |
} | ||
} | ||
|
||
func testScalarMatching() throws { | ||
// RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In | ||
// grapheme cluster mode, it should only match entire graphemes. It may | ||
// match a single scalar of a grapheme cluster in scalar semantic mode. | ||
XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) | ||
XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) | ||
XCTAssertNotNil("a\u{301}".firstMatch( | ||
of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) | ||
|
||
let r1 = Regex { | ||
"a" as UnicodeScalar | ||
} | ||
XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) | ||
XCTAssertNotNil( | ||
try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") | ||
) | ||
|
||
let r2 = Regex { | ||
CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) | ||
} | ||
XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) | ||
XCTAssertNotNil( | ||
try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") | ||
) | ||
|
||
let r3 = Regex { | ||
"👨" as UnicodeScalar | ||
"\u{200D}" as UnicodeScalar | ||
"👨" as UnicodeScalar | ||
"\u{200D}" as UnicodeScalar | ||
"👧" as UnicodeScalar | ||
"\u{200D}" as UnicodeScalar | ||
"👦" as UnicodeScalar | ||
} | ||
XCTAssertNil(try r3.firstMatch(in: "👨👨👧👦")) | ||
Comment on lines
+1148
to
+1157
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, actually I'm not sure whether this is correct, as we allow matching for e.g: print("👨👨👧👦".firstMatch(of: /👨\u{200D}👨\u{200D}👧\u{200D}👦/) should we be combining the scalars in a DSL concatenation the same way we combine them in a literal? And if so, would it also apply to regex scalar children e.g Or is a DSL concatenation semantically more like the following? print("👨👨👧👦".firstMatch(of: /(?:👨)(?:\u{200D})(?:👨)(?:\u{200D})(?:👧)(?:\u{200D})(?:👦)/)) // nil There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The scalars should get globbed together into a stream of scalars, over which grapheme breaking can occur. This is similar to scalar escaped inside string literals. For the DSL, I'd treat them like a stream of scalars ala There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah makes sense, though we'll need to define exactly when the combining happens, e.g do we combine the following? Regex {
"a" as UnicodeScalar
/\u{301}/
} and would that still happen if there's further contents in the regex? (because this is now a concat within a concat) e.g: Regex {
"a" as UnicodeScalar
/\u{301}b/
} Should Regex {
"a" as UnicodeScalar
Regex {
"\u{301}" as UnicodeScalar
"b"
}
} |
||
XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨👨👧👦")) | ||
XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨👨👧👦")) | ||
} | ||
|
||
struct SemanticVersion: Equatable { | ||
var major: Int | ||
var minor: Int | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't this be conditional on whether we're in grapheme semantics or scalar semantics?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
emitCharacter
will callemitConsumeScalar
for scalar mode