Skip to content

Commit d30a26e

Browse files
committed
Coalesce adjacent scalars and characters in the DSL
Previously we would emit a series of scalars written in the DSL as a series of individual characters in grapheme semantic mode. Change the behavior such that we coalesce any adjacent scalars and characters, including those in regex literals and nested concatenations. We then perform grapheme breaking over the result, and can emit character matches for scalars that coalesced into a grapheme. This transform subsumes a similar transform we performed for regex literals when converting them to a DSLTree. This has the nice side effect of allowing us to better preserve scalar syntax in the DSL transform. rdar://96942688
1 parent a3afdfa commit d30a26e

File tree

8 files changed

+318
-102
lines changed

8 files changed

+318
-102
lines changed

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,41 @@ fileprivate extension Compiler.ByteCodeGen {
791791
}
792792
}
793793

794+
/// Emits the components of a concatenation after normalizing `children`.
///
/// The children are first flattened (nested `.concatenation` nodes are
/// spliced in place and `.convertedRegexLiteral` wrappers are looked through),
/// then adjacent literal atoms and quoted literals are merged into a single
/// `.quotedLiteral`. Each resulting node is handed to
/// `emitConcatenationComponent` in order.
///
/// - Parameter children: The direct children of the concatenation node.
/// - Throws: Rethrows any error thrown by `emitConcatenationComponent`.
mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
795+
// Before emitting a concatenation, we need to flatten out any nested
796+
// concatenations, and coalesce any adjacent characters and scalars, forming
797+
// quoted literals of their contents, over which we can perform grapheme
798+
// breaking.
799+
// Recursively expands `node` into the list of non-concatenation nodes it
// contains, preserving their order.
func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] {
800+
switch node {
801+
case .concatenation(let ch):
802+
return ch.flatMap(flatten)
803+
case .convertedRegexLiteral(let n, _):
804+
// Drop the wrapper and keep flattening whatever it contains.
return flatten(n)
805+
default:
806+
return [node]
807+
}
808+
}
809+
// Shadows the parameter with the flattened and coalesced list. Every run of
// coalescable nodes becomes one `.quotedLiteral`, built up from "".
let children = children
810+
.flatMap(flatten)
811+
.coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in
812+
switch node {
813+
case .atom(let a):
814+
// Only atoms that report a literal character value are coalesced.
// NOTE(review): presumably that covers character and scalar atoms only;
// confirm against `literalCharacterValue`.
guard let c = a.literalCharacterValue else { return false }
815+
str.append(c)
816+
return true
817+
case .quotedLiteral(let q):
818+
str += q
819+
return true
820+
default:
821+
// Anything else ends the current run and is emitted unchanged.
return false
822+
}
823+
}
824+
for child in children {
825+
try emitConcatenationComponent(child)
826+
}
827+
}
828+
794829
@discardableResult
795830
mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? {
796831
switch node {
@@ -799,9 +834,7 @@ fileprivate extension Compiler.ByteCodeGen {
799834
try emitAlternation(children)
800835

801836
case let .concatenation(children):
802-
for child in children {
803-
try emitConcatenationComponent(child)
804-
}
837+
try emitConcatenation(children)
805838

806839
case let .capture(name, refId, child, transform):
807840
options.beginScope()

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,9 @@ extension PrettyPrinter {
7070
for namedCapture in namedCaptures {
7171
print("let \(namedCapture) = Reference(Substring.self)")
7272
}
73-
74-
switch node {
75-
case .concatenation(_):
76-
printAsPattern(convertedFromAST: node)
77-
case .convertedRegexLiteral(.concatenation(_), _):
78-
printAsPattern(convertedFromAST: node)
79-
default:
80-
printBlock("Regex") { printer in
81-
printer.printAsPattern(convertedFromAST: node)
82-
}
73+
74+
printBlock("Regex") { printer in
75+
printer.printAsPattern(convertedFromAST: node, isTopLevel: true)
8376
}
8477
}
8578

@@ -89,7 +82,7 @@ extension PrettyPrinter {
8982
// to have a non-backing-off pretty-printer that this
9083
// can defer to.
9184
private mutating func printAsPattern(
92-
convertedFromAST node: DSLTree.Node
85+
convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false
9386
) {
9487
if patternBackoff(DSLTree._Tree(node)) {
9588
printBackoff(node)
@@ -106,11 +99,7 @@ extension PrettyPrinter {
10699
}
107100

108101
case let .concatenation(c):
109-
printBlock("Regex") { printer in
110-
c.forEach {
111-
printer.printAsPattern(convertedFromAST: $0)
112-
}
113-
}
102+
printConcatenationAsPattern(c, isTopLevel: isTopLevel)
114103

115104
case let .nonCapturingGroup(kind, child):
116105
switch kind.ast {
@@ -263,7 +252,7 @@ extension PrettyPrinter {
263252
// check above, so it should work out. Need a
264253
// cleaner way to do this. This means the argument
265254
// label is a lie.
266-
printAsPattern(convertedFromAST: n)
255+
printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel)
267256

268257
case let .customCharacterClass(ccc):
269258
printAsPattern(ccc)
@@ -279,6 +268,60 @@ extension PrettyPrinter {
279268
print("/* TODO: absent function */")
280269
}
281270
}
271+
272+
/// An element of a concatenation that is ready to be printed: either a DSL
/// node that still needs pattern printing, or text that is printed as-is.
enum NodeToPrint {
273+
/// A DSL tree node, printed through `printAsPattern(convertedFromAST:)`.
case dslNode(DSLTree.Node)
274+
/// Text passed straight to `print`. In this file it is produced from
/// `StringLiteralBuilder.result` when adjacent nodes are coalesced.
case stringLiteral(String)
275+
}
276+
277+
/// Prints a single `NodeToPrint`.
///
/// DSL nodes are forwarded to `printAsPattern(convertedFromAST:)` without an
/// `isTopLevel` argument, so that parameter's default applies. String literals
/// are passed to `print` unchanged.
mutating func printAsPattern(_ node: NodeToPrint) {
278+
switch node {
279+
case .dslNode(let n):
280+
printAsPattern(convertedFromAST: n)
281+
case .stringLiteral(let str):
282+
// NOTE(review): presumably this resolves to the printer's own `print`
// rather than `Swift.print`; confirm.
print(str)
283+
}
284+
}
285+
286+
/// Prints the children of a concatenation, merging adjacent characters,
/// scalars and quoted literals into a single string literal first.
///
/// - Parameters:
///   - nodes: The children of the concatenation.
///   - isTopLevel: When `true`, the elements are printed directly rather than
///     inside a `Regex { ... }` block.
mutating func printConcatenationAsPattern(
287+
_ nodes: [DSLTree.Node], isTopLevel: Bool
288+
) {
289+
// We need to coalesce any adjacent character and scalar elements into a
290+
// string literal, preserving scalar syntax.
291+
// Each node is unwrapped by one `.convertedRegexLiteral` level and wrapped as
// `.dslNode`. Coalesced runs are replaced by a `.stringLiteral` holding the
// builder's `result`.
let nodes = nodes
292+
.map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) }
293+
.coalescing(
294+
with: StringLiteralBuilder(), into: { .stringLiteral($0.result) }
295+
) { literal, node in
296+
// Every input element is a `.dslNode` because of the `map` above; the guard
// only unwraps it.
guard case .dslNode(let node) = node else { return false }
297+
switch node {
298+
case let .atom(.char(c)):
299+
literal.append(c)
300+
return true
301+
case let .atom(.scalar(s)):
302+
// The scalar's `_dslBase` text is appended via `append(unescaped:)`.
// NOTE(review): presumably this keeps the `\u{...}` spelling from being
// escaped again; confirm against StringLiteralBuilder.
literal.append(unescaped: s._dslBase)
303+
return true
304+
case .quotedLiteral(let q):
305+
literal.append(q)
306+
return true
307+
default:
308+
// Any other node ends the current literal and is printed on its own.
return false
309+
}
310+
}
311+
// `nodes.count == 1` also holds when the concatenation had a single child to
// begin with, not only when everything was coalesced.
if isTopLevel || nodes.count == 1 {
312+
// If we're at the top level, or we coalesced everything into a single
313+
// element, we don't need to print a surrounding Regex { ... }.
314+
for n in nodes {
315+
printAsPattern(n)
316+
}
317+
return
318+
}
319+
printBlock("Regex") { printer in
320+
for n in nodes {
321+
printer.printAsPattern(n)
322+
}
323+
}
324+
}
282325

283326
mutating func printAsPattern(
284327
_ ccc: DSLTree.CustomCharacterClass,
@@ -341,8 +384,7 @@ extension PrettyPrinter {
341384
charMembers.append(c)
342385
return false
343386
case let .scalar(s):
344-
charMembers.append(
345-
unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}")
387+
charMembers.append(unescaped: s._dslBase)
346388
return false
347389
case .unconverted(_):
348390
return true
@@ -449,9 +491,9 @@ extension PrettyPrinter {
449491
case let .scalar(s):
450492

451493
if wrap {
452-
output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))")
494+
output("One(.anyOf(\(s._dslBase._bareQuoted)))")
453495
} else {
454-
output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")")
496+
output(".anyOf(\(s._dslBase._bareQuoted))")
455497
}
456498

457499
case let .unconverted(a):
@@ -625,6 +667,10 @@ extension String {
625667
}
626668
}
627669

670+
extension UnicodeScalar {
671+
/// The scalar spelled as a `\u{...}` escape, with its value in uppercase
/// hexadecimal and no zero padding (for example, U+00E9 yields `\u{E9}`).
var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" }
672+
}
673+
628674
/// A helper for building string literals, which handles escaping the contents
629675
/// appended.
630676
fileprivate struct StringLiteralBuilder {
@@ -851,19 +897,15 @@ extension AST.Atom {
851897
}
852898

853899
var _dslBase: (String, canBeWrapped: Bool) {
854-
func scalarLiteral(_ s: UnicodeScalar) -> String {
855-
let hex = String(s.value, radix: 16, uppercase: true)
856-
return "\\u{\(hex)}"
857-
}
858900
switch kind {
859901
case let .char(c):
860902
return (String(c), false)
861903

862904
case let .scalar(s):
863-
return (scalarLiteral(s.value), false)
905+
return (s.value._dslBase, false)
864906

865907
case let .scalarSequence(seq):
866-
return (seq.scalarValues.map(scalarLiteral).joined(), false)
908+
return (seq.scalarValues.map(\._dslBase).joined(), false)
867909

868910
case let .property(p):
869911
return (p._dslBase, true)

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 4 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -43,61 +43,7 @@ extension AST.Node {
4343
return .orderedChoice(children)
4444

4545
case let .concatenation(v):
46-
// Coalesce adjacent children who can produce a
47-
// string literal representation
48-
let astChildren = v.children
49-
func coalesce(
50-
_ idx: Array<AST>.Index
51-
) -> (Array<AST>.Index, String)? {
52-
var result = ""
53-
var idx = idx
54-
while idx < astChildren.endIndex {
55-
guard let atom: AST.Atom = astChildren[idx].as() else { break }
56-
57-
// TODO: For printing, nice to coalesce
58-
// scalars literals too. We likely need a different
59-
// approach even before we have a better IR.
60-
if let char = atom.singleCharacter {
61-
result.append(char)
62-
} else if let scalar = atom.singleScalar {
63-
result.append(Character(scalar))
64-
} else if case .scalarSequence(let seq) = atom.kind {
65-
result += seq.scalarValues.map(Character.init)
66-
} else {
67-
break
68-
}
69-
70-
astChildren.formIndex(after: &idx)
71-
}
72-
return result.isEmpty ? nil : (idx, result)
73-
}
74-
75-
// No need to nest single children concatenations
76-
if astChildren.count == 1 {
77-
return astChildren.first!.dslTreeNode
78-
}
79-
80-
// Check for a single child post-coalescing
81-
if let (idx, str) = coalesce(astChildren.startIndex),
82-
idx == astChildren.endIndex
83-
{
84-
return .quotedLiteral(str)
85-
}
86-
87-
// Coalesce adjacent string children
88-
var curIdx = astChildren.startIndex
89-
var children = Array<DSLTree.Node>()
90-
while curIdx < astChildren.endIndex {
91-
if let (nextIdx, str) = coalesce(curIdx) {
92-
// TODO: Track source info...
93-
children.append(.quotedLiteral(str))
94-
curIdx = nextIdx
95-
} else {
96-
children.append(astChildren[curIdx].dslTreeNode)
97-
astChildren.formIndex(after: &curIdx)
98-
}
99-
}
100-
return .concatenation(children)
46+
return .concatenation(v.children.map(\.dslTreeNode))
10147

10248
case let .group(v):
10349
let child = v.child.dslTreeNode
@@ -135,10 +81,9 @@ extension AST.Node {
13581
case let .atom(v):
13682
switch v.kind {
13783
case .scalarSequence(let seq):
138-
// Scalar sequences are splatted into concatenated scalars, which
139-
// becomes a quoted literal. Sequences nested in concatenations have
140-
// already been coalesced, this just handles the lone atom case.
141-
return .quotedLiteral(String(seq.scalarValues.map(Character.init)))
84+
// The DSL doesn't have an equivalent node for scalar sequences. Splat
85+
// them into a concatenation of scalars.
86+
return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) })
14287
default:
14388
return .atom(v.dslTreeAtom)
14489
}

Sources/_StringProcessing/Regex/DSLTree.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,14 @@ extension DSLTree.Node {
334334
default: return nil
335335
}
336336
}
337+
338+
/// If this node is for a converted literal, look through it.
///
/// Returns the node wrapped by `.convertedRegexLiteral`, or `self` for any
/// other case. Only one level is unwrapped; the wrapped node is returned as-is
/// even if it is itself a converted literal.
339+
var lookingThroughConvertedLiteral: Self {
340+
switch self {
341+
case let .convertedRegexLiteral(n, _): return n
342+
default: return self
343+
}
344+
}
337345
}
338346

339347
extension DSLTree.Atom {
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
extension Array {
13+
/// Coalesce adjacent elements using a given accumulator. The accumulator is
14+
/// transformed into an element of the array by `finish`. The `accumulate`
15+
/// function should return `true` if the accumulator has coalesced the
16+
/// element, `false` otherwise.
///
/// Each maximal run of consecutive elements for which `accumulate` returns
/// `true` is replaced by one `finish(accumulator)` element, including runs of
/// length one. Elements for which it returns `false` are appended unchanged,
/// and relative order is preserved.
///
/// - Parameters:
///   - initialAccumulator: The value each new run starts from.
///   - finish: Converts the accumulator of a completed run into an element.
///   - accumulate: Tries to fold an element into the accumulator and reports
///     whether it did.
/// - Returns: A new array with coalescable runs replaced by finished
///   accumulators.
17+
func coalescing<T>(
18+
with initialAccumulator: T, into finish: (T) -> Element,
19+
accumulate: (inout T, Element) -> Bool
20+
) -> Self {
21+
// Tracks whether `accumulator` currently holds an unfinished run.
var didAccumulate = false
22+
var accumulator = initialAccumulator
23+
24+
var result = Self()
25+
for elt in self {
26+
if accumulate(&accumulator, elt) {
27+
// The element has been coalesced into accumulator, there is nothing
28+
// else to do.
29+
didAccumulate = true
30+
continue
31+
}
32+
// NOTE(review): the accumulator is only reset inside the branch below, so a
// mutation made by an `accumulate` call that returned `false` with no run
// in progress would carry into the next run.
if didAccumulate {
33+
// We have a leftover accumulator, which needs to be finished before we
34+
// can append the next element.
35+
result.append(finish(accumulator))
36+
accumulator = initialAccumulator
37+
didAccumulate = false
38+
}
39+
result.append(elt)
40+
}
41+
// Handle a leftover accumulation.
42+
if didAccumulate {
43+
result.append(finish(accumulator))
44+
}
45+
return result
46+
}
47+
}

0 commit comments

Comments
 (0)