[benchmark] Simplify and add more benchmarks (swiftlang#501)

milseman · rctcwyvrn · commit 1dc7755a6109 · 2022-06-30T11:46:41.000-07:00
* [benchmark] Add no-capture version of grapheme breaking exercise

* [benchmark] Add cross-engine benchmark helpers

* [benchmark] Hangul Syllable finding benchmark
diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift
@@ -9,7 +9,7 @@ public protocol RegexBenchmark {
 public struct Benchmark: RegexBenchmark {
   public let name: String
   let regex: Regex<Substring>
-  let ty: MatchType
+  let type: MatchType
   let target: String
 
   public enum MatchType {
@@ -19,7 +19,7 @@ public struct Benchmark: RegexBenchmark {
   }
   
   public func run() {
-    switch ty {
+    switch type {
     case .whole: blackHole(target.wholeMatch(of: regex))
     case .allMatches: blackHole(target.matches(of: regex))
     case .first: blackHole(target.firstMatch(of: regex))
@@ -30,21 +30,21 @@ public struct Benchmark: RegexBenchmark {
 public struct NSBenchmark: RegexBenchmark {
   public let name: String
   let regex: NSRegularExpression
-  let ty: NSMatchType
+  let type: NSMatchType
   let target: String
   
   var range: NSRange {
     NSRange(target.startIndex..<target.endIndex, in: target)
   }
 
   public enum NSMatchType {
-    case all
+    case allMatches
     case first
   }
   
   public func run() {
-    switch ty {
-    case .all: blackHole(regex.matches(in: target, range: range))
+    switch type {
+    case .allMatches: blackHole(regex.matches(in: target, range: range))
     case .first: blackHole(regex.firstMatch(in: target, range: range))
     }
   }
@@ -110,6 +110,80 @@ public struct BenchmarkRunner {
   }
 }
 
+/// A benchmark meant to be run across multiple engines
+struct CrossBenchmark {
+  /// The base name of the benchmark
+  var baseName: String
+
+  /// The string to compile in different engines
+  var regex: String
+
+  /// The text to search
+  var input: String
+
+  // TODO: var output, for validation
+
+  /// Whether this is whole string matching or a searching benchmark
+  ///
+  /// TODO: Probably better to have a whole-line vs search anywhere, maybe
+  /// accommodate multi-line matching, etc.
+  var isWhole: Bool = false
+
+  /// Registers this benchmark with `runner` once per engine and match kind:
+  /// `Whole` when `isWhole`, otherwise `First` and `All`. The
+  /// `NSRegularExpression` variants carry an `_NS` suffix.
+  func register(_ runner: inout BenchmarkRunner) {
+    let swiftRegex = try! Regex(regex, as: Substring.self)
+
+    // The NS whole-match benchmark runs as `.first`, so anchor the pattern;
+    // the group keeps a top-level `|` in `regex` inside both anchors.
+    let nsPattern = isWhole ? "^(?:" + regex + ")$" : regex
+    let nsRegex = try! NSRegularExpression(pattern: nsPattern)
+
+    if isWhole {
+      runner.register(
+        Benchmark(
+          name: baseName + "Whole",
+          regex: swiftRegex,
+          type: .whole,
+          target: input))
+      runner.register(
+        NSBenchmark(
+          name: baseName + "Whole_NS",
+          regex: nsRegex,
+          type: .first,
+          target: input))
+    } else {
+      runner.register(
+        Benchmark(
+          name: baseName + "First",
+          regex: swiftRegex,
+          type: .first,
+          target: input))
+      runner.register(
+        Benchmark(
+          name: baseName + "All",
+          regex: swiftRegex,
+          type: .allMatches,
+          target: input))
+      runner.register(
+        NSBenchmark(
+          name: baseName + "First_NS",
+          regex: nsRegex,
+          type: .first,
+          target: input))
+      runner.register(
+        NSBenchmark(
+          name: baseName + "All_NS",
+          regex: nsRegex,
+          type: .allMatches,
+          target: input))
+    }
+  }
+}
+
+// TODO: Capture-containing benchmarks
+
 // nom nom nom, consume the argument
 @inline(never)
 public func blackHole<T>(_ x: T) {
diff --git a/Sources/RegexBenchmark/CLI.swift b/Sources/RegexBenchmark/CLI.swift
@@ -14,9 +14,10 @@ struct Runner: ParsableCommand {
   func makeRunner() -> BenchmarkRunner {
     var benchmark = BenchmarkRunner("RegexBench", samples)
     benchmark.addReluctantQuant()
-    benchmark.addBacktracking()
     benchmark.addCSS()
-    benchmark.addFirstMatch()
+    benchmark.addNotFound()  // searches that never match
+    benchmark.addGraphemeBreak()  // capture-free grapheme break data lines
+    benchmark.addHangulSyllable()  // Hangul syllable names in the same data
     return benchmark
   }
   mutating func run() throws {
diff --git a/Sources/RegexBenchmark/Inputs/GraphemeBreakData.swift b/Sources/RegexBenchmark/Inputs/GraphemeBreakData.swift
diff --git a/Sources/RegexBenchmark/Suite/Backtracking.swift b/Sources/RegexBenchmark/Suite/Backtracking.swift
diff --git a/Sources/RegexBenchmark/Suite/CssRegex.swift b/Sources/RegexBenchmark/Suite/CssRegex.swift
@@ -3,22 +3,12 @@ import _StringProcessing
 
 extension BenchmarkRunner {
   mutating func addCSS() {
-    let r = "--([a-zA-Z0-9_-]+)\\s*:\\s*(.*?):"
-    
-    let cssRegex = Benchmark(
-      name: "cssRegex",
-      regex: try! Regex(r),
-      ty: .allMatches,
-      target: Inputs.swiftOrgCSS
-    )
+    let r = #"--([a-zA-Z0-9_-]+)\s*:\s*(.*?):"#
 
-    let cssRegexNS = NSBenchmark(
-      name: "cssRegexNS",
-      regex: try! NSRegularExpression(pattern: r),
-      ty: .all,
-      target: Inputs.swiftOrgCSS
-    )
-    register(cssRegex)
-    register(cssRegexNS)
+    // FIXME: Why do `first` and `all` have the same running time?
+
+    let css = CrossBenchmark(  // isWhole defaults to false: a searching benchmark
+      baseName: "css", regex: r, input: Inputs.swiftOrgCSS)
+    css.register(&self)
   }
 }
diff --git a/Sources/RegexBenchmark/Suite/FirstMatch.swift b/Sources/RegexBenchmark/Suite/FirstMatch.swift
diff --git a/Sources/RegexBenchmark/Suite/GraphemeBreak.swift b/Sources/RegexBenchmark/Suite/GraphemeBreak.swift
@@ -0,0 +1,25 @@
+import _StringProcessing
+import RegexBuilder
+
+import Foundation
+
+extension BenchmarkRunner {
+  /// Registers the capture-free grapheme break data benchmark.
+  mutating func addGraphemeBreak() {
+    let pattern = #"(?:[0-9A-F]+)(?:\.\.(?:[0-9A-F]+))?\s+;\s+(?:\w+).*"#
+    CrossBenchmark(
+      baseName: "GraphemeBreakNoCap", regex: pattern,
+      input: Inputs.graphemeBreakData
+    ).register(&self)
+  }
+
+  /// Registers the benchmark that searches for Hangul syllable names.
+  mutating func addHangulSyllable() {
+    let pattern = #"HANGUL SYLLABLE [A-Z]+(?:\.\.HANGUL SYLLABLE [A-Z]+)?"#
+    CrossBenchmark(
+      baseName: "HangulSyllable", regex: pattern,
+      input: Inputs.graphemeBreakData
+    ).register(&self)
+  }
+}
+
diff --git a/Sources/RegexBenchmark/Suite/NotFound.swift b/Sources/RegexBenchmark/Suite/NotFound.swift
@@ -0,0 +1,16 @@
+import _StringProcessing
+import Foundation
+
+extension BenchmarkRunner {
+  /// Registers unanchored and anchored searches that never match.
+  mutating func addNotFound() {
+    let input = String(repeating: " ", count: 100_000)
+    let notFound = CrossBenchmark(
+      baseName: "notFound", regex: "a", input: input)
+    notFound.register(&self)
+    // Needs its own base name; "notFound" would duplicate the names above.
+    let anchoredNotFound = CrossBenchmark(
+      baseName: "anchoredNotFound", regex: "^ +a", input: input)
+    anchoredNotFound.register(&self)
+  }
+}
diff --git a/Sources/RegexBenchmark/Suite/ReluctantQuant.swift b/Sources/RegexBenchmark/Suite/ReluctantQuant.swift
@@ -3,40 +3,28 @@ import RegexBuilder
 
 extension BenchmarkRunner {
   mutating func addReluctantQuant() {
-    let size = 500000
-    let s = String(repeating: "a", count: size)
-    
-    let reluctantQuant = Benchmark(
-      name: "ReluctantQuant",
-      regex: Regex {
-          OneOrMore(.any, .reluctant)
-      },
-      ty: .whole,
-      target: s
-    )
+    let size = 100_000
+    let input = String(repeating: "a", count: size)
 
-    let eagarQuantWithTerminal = Benchmark(
-      name: "EagarQuantWithTerminal",
-      regex: Regex {
-          OneOrMore(.any, .eager)
-          ";"
-      },
-      ty: .whole,
-      target: s + ";"
-    )
+    let reluctantQuant = CrossBenchmark(  // whole-match of a reluctant `.*?` over all "a"s
+      baseName: "ReluctantQuant",
+      regex: #".*?"#,
+      input: input,
+      isWhole: true)
+    reluctantQuant.register(&self)
 
-    let reluctantQuantWithTerminal = Benchmark(
-      name: "ReluctantQuantWithTerminal",
-      regex: Regex {
-          OneOrMore(.any, .reluctant)
-          ";"
-      },
-      ty: .whole,
-      target: s + ";"
-    )
-    
-    register(reluctantQuant)
-    register(reluctantQuantWithTerminal)
-    register(eagarQuantWithTerminal)
+    let eagarQuantWithTerminal = CrossBenchmark(  // eager `.*` then the `;` that ends the input
+      baseName: "EagarQuantWithTerminal",
+      regex: #".*;"#,
+      input: input + ";",
+      isWhole: true)
+    eagarQuantWithTerminal.register(&self)
+
+    let reluctantQuantWithTerminal = CrossBenchmark(  // reluctant `.*?` then the same terminal `;`
+      baseName: "ReluctantQuantWithTerminal",
+      regex: #".*?;"#,
+      input: input + ";",
+      isWhole: true)
+    reluctantQuantWithTerminal.register(&self)
   }
 }