Implement \R, \v, \h for character/scalar modes (#384)

natecook1000 · web-flow · commit c16e389b3845 · 2022-05-09T18:14:59.000-05:00
Implement \R, \v, \h for character/scalar modes and 
audit assertions and anchors for semantic level.
diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift
@@ -182,15 +182,16 @@ extension RegexValidator {
     _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation
   ) throws {
     switch esc {
-    case .resetStartOfMatch, .singleDataUnit, .verticalTab, .notVerticalTab,
+    case .resetStartOfMatch, .singleDataUnit,
         // '\N' needs to be emitted using 'emitAny'.
         .notNewline:
       throw error(.unsupported("'\\\(esc.character)'"), at: loc)
 
     // Character classes.
     case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace,
         .wordCharacter, .notWordCharacter, .graphemeCluster, .trueAnychar,
-        .horizontalWhitespace, .notHorizontalWhitespace:
+        .horizontalWhitespace, .notHorizontalWhitespace,
+        .verticalTab, .notVerticalTab:
       break
 
     case .newlineSequence:
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -80,10 +80,16 @@ extension Compiler.ByteCodeGen {
       }
 
     case .endOfSubjectBeforeNewline:
-      builder.buildAssert { (input, pos, bounds) in
+      builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
         if pos == input.endIndex { return true }
-        return input.index(after: pos) == input.endIndex
-         && input[pos].isNewline
+        switch semanticLevel {
+        case .graphemeCluster:
+          return input.index(after: pos) == input.endIndex
+           && input[pos].isNewline
+        case .unicodeScalar:
+          return input.unicodeScalars.index(after: pos) == input.endIndex
+           && input.unicodeScalars[pos].isNewline
+        }
       }
 
     case .endOfSubject:
@@ -115,8 +121,14 @@ extension Compiler.ByteCodeGen {
 
     case .startOfLine:
       if options.anchorsMatchNewlines {
-        builder.buildAssert { (input, pos, bounds) in
-          pos == input.startIndex || input[input.index(before: pos)].isNewline
+        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
+          if pos == input.startIndex { return true }
+          switch semanticLevel {
+          case .graphemeCluster:
+            return input[input.index(before: pos)].isNewline
+          case .unicodeScalar:
+            return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
+          }
         }
       } else {
         builder.buildAssert { (input, pos, bounds) in
@@ -126,8 +138,14 @@ extension Compiler.ByteCodeGen {
       
     case .endOfLine:
       if options.anchorsMatchNewlines {
-        builder.buildAssert { (input, pos, bounds) in
-          pos == input.endIndex || input[pos].isNewline
+        builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in
+          if pos == input.endIndex { return true }
+          switch semanticLevel {
+          case .graphemeCluster:
+            return input[pos].isNewline
+          case .unicodeScalar:
+            return input.unicodeScalars[pos].isNewline
+          }
         }
       } else {
         builder.buildAssert { (input, pos, bounds) in
diff --git a/Sources/_StringProcessing/Unicode/ScalarProps.swift b/Sources/_StringProcessing/Unicode/ScalarProps.swift
@@ -46,3 +46,19 @@ extension Unicode.Script {
     return result
   }
 }
+
+extension UnicodeScalar {
+  var isHorizontalWhitespace: Bool { // true for U+0009 (tab) or any scalar whose general category is Zs
+    value == 0x09 || properties.generalCategory == .spaceSeparator
+  }
+  /// Whether this scalar is one of the line-break scalars enumerated in the switch below.
+  var isNewline: Bool {
+    switch value {
+      case 0x000A...0x000D /* LF ... CR */: return true // closed range, so U+000B (VT) and U+000C (FF) are included
+      case 0x0085 /* NEXT LINE (NEL) */: return true
+      case 0x2028 /* LINE SEPARATOR */: return true
+      case 0x2029 /* PARAGRAPH SEPARATOR */: return true
+      default: return false // every other scalar value is not treated as a newline
+    }
+  }
+}
diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift
@@ -178,15 +178,18 @@ public struct _CharacterClassModel: Hashable {
         matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits)
       case .hexDigit:
         matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits)
-      case .horizontalWhitespace: fatalError("Not implemented")
-      case .newlineSequence:
-        matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
-      case .verticalWhitespace: fatalError("Not implemented")
+      case .horizontalWhitespace:
+        matched = c.unicodeScalars.first?.isHorizontalWhitespace == true
+          && (c.isASCII || !options.usesASCIISpaces)
+      case .newlineSequence, .verticalWhitespace:
+        matched = c.unicodeScalars.first?.isNewline == true
+          && (c.isASCII || !options.usesASCIISpaces)
       case .whitespace:
         matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
       case .word:
         matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord)
-      case .custom(let set): matched = set.any { $0.matches(c, with: options) }
+      case .custom(let set):
+        matched = set.any { $0.matches(c, with: options) }
       }
       if isInverted {
         matched.toggle()
@@ -206,14 +209,21 @@ public struct _CharacterClassModel: Hashable {
         matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits)
       case .hexDigit:
         matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits)
-      case .horizontalWhitespace: fatalError("Not implemented")
-      case .newlineSequence: fatalError("Not implemented")
-      case .verticalWhitespace: fatalError("Not implemented")
+      case .horizontalWhitespace:
+        matched = c.isHorizontalWhitespace && (c.isASCII || !options.usesASCIISpaces)
+      case .verticalWhitespace:
+        matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
+      case .newlineSequence:
+        matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces)
+        if c == "\r" && nextIndex != str.endIndex && str.unicodeScalars[nextIndex] == "\n" {
+          str.unicodeScalars.formIndex(after: &nextIndex)
+        }
       case .whitespace:
         matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces)
       case .word:
         matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord)
-      case .custom: fatalError("Not supported")
+      case .custom(let set):
+        matched = set.any { $0.matches(Character(c), with: options) }
       }
       if isInverted {
         matched.toggle()
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
@@ -22,6 +22,14 @@ import XCTest
 @testable // for internal `matches(of:)`
 import _StringProcessing
 
+extension UnicodeScalar {
+  var value4Digits: String { // uppercase hex of `value`, left-padded with "0" to at least four digits
+    let valueString = String(value, radix: 16, uppercase: true)
+    if valueString.count >= 4 { return valueString } // already four or more digits: returned without padding
+    return String(repeating: "0", count: 4 - valueString.count) + valueString
+  }
+}
+
 class UTS18Tests: XCTestCase {
   var input: String {
     "ABCdefghîøu\u{308}\u{FFF0} -–—[]123"
@@ -262,21 +270,33 @@ extension UTS18Tests {
       09\u{85}\
       10\u{2028}\
       11\u{2029}\
-      
+      12
       """
     // Check the input counts
     var lines = lineInput.matches(of: regex(#"\d{2}"#))
-    XCTAssertEqual(lines.count, 11)
+    XCTAssertEqual(lines.count, 12)
     // Test \R - newline sequence
-    lines = lineInput.matches(of: regex(#"\d{2}\R"#))
+    lines = lineInput.matches(of: regex(#"\d{2}\R^"#).anchorsMatchLineEndings())
+    XCTAssertEqual(lines.count, 11)
+    // Test \v - vertical space
+    lines = lineInput.matches(of: regex(#"\d{2}\v^"#).anchorsMatchLineEndings())
     XCTAssertEqual(lines.count, 11)
     // Test anchors as line boundaries
     lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings())
-    XCTAssertEqual(lines.count, 11)
+    XCTAssertEqual(lines.count, 12)
     // Test that dot does not match line endings
     lines = lineInput.matches(of: regex(#".+"#))
-    XCTAssertEqual(lines.count, 11)
+    XCTAssertEqual(lines.count, 12)
     
+    // Unicode scalar semantics - \R still matches all, including \r\n sequence
+    lines = lineInput.matches(
+      of: regex(#"\d{2}\R(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())
+    XCTAssertEqual(lines.count, 11)
+    // Unicode scalar semantics - \v matches all except for \r\n sequence
+    lines = lineInput.matches(
+      of: regex(#"\d{2}\v(?=\d)"#).matchingSemantics(.unicodeScalar).anchorsMatchLineEndings())
+    XCTAssertEqual(lines.count, 10)
+
     // Does not contain an empty line
     XCTAssertFalse(lineInput.contains(regex(#"^$"#)))
     // Does contain an empty line (between \n and \r, which are reversed here)