@@ -99,16 +99,18 @@ namespace {
99
99
for (auto *V : RHS)
100
100
AllValues.push_back (V);
101
101
}
102
+
103
+ bool AreSymmetrical (BinOpChain *Other);
102
104
};
103
105
104
106
struct Reduction {
105
107
PHINode *Phi; // The Phi-node from where we start
106
108
// pattern matching.
107
109
Instruction *AccIntAdd; // The accumulating integer add statement,
108
110
// i.e, the reduction statement.
109
-
110
111
OpChainList MACCandidates; // The MAC candidates associated with
111
112
// this reduction statement.
113
+ PMACPairList PMACPairs;
112
114
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
113
115
};
114
116
@@ -121,10 +123,13 @@ namespace {
121
123
Loop *L;
122
124
const DataLayout *DL;
123
125
Module *M;
126
+ std::map<LoadInst*, LoadInst*> LoadPairs;
127
+ std::map<LoadInst*, SmallVector<LoadInst*, 4 >> SequentialLoads;
124
128
125
- bool InsertParallelMACs (Reduction &Reduction, PMACPairList &PMACPairs);
129
+ bool RecordSequentialLoads (BasicBlock *Header);
130
+ bool InsertParallelMACs (Reduction &Reduction);
126
131
bool AreSequentialLoads (LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
127
- PMACPairList CreateParallelMACPairs (OpChainList &Candidates );
132
+ void CreateParallelMACPairs (Reduction &R );
128
133
Instruction *CreateSMLADCall (LoadInst *VecLd0, LoadInst *VecLd1,
129
134
Instruction *Acc, bool Exchange,
130
135
Instruction *InsertAfter);
@@ -202,6 +207,12 @@ namespace {
202
207
203
208
LLVM_DEBUG (dbgs () << " \n == Parallel DSP pass ==\n " );
204
209
LLVM_DEBUG (dbgs () << " - " << F.getName () << " \n\n " );
210
+
211
+ if (!RecordSequentialLoads (Header)) {
212
+ LLVM_DEBUG (dbgs () << " - No sequential loads found.\n " );
213
+ return false ;
214
+ }
215
+
205
216
Changes = MatchSMLAD (F);
206
217
return Changes;
207
218
}
@@ -254,58 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
254
265
return false ;
255
266
}
256
267
257
- // Element-by-element comparison of Value lists returning true if they are
258
- // instructions with the same opcode or constants with the same value.
259
- static bool AreSymmetrical (const ValueList &VL0,
260
- const ValueList &VL1) {
261
- if (VL0.size () != VL1.size ()) {
262
- LLVM_DEBUG (dbgs () << " Muls are mismatching operand list lengths: "
263
- << VL0.size () << " != " << VL1.size () << " \n " );
264
- return false ;
265
- }
266
-
267
- const unsigned Pairs = VL0.size ();
268
- LLVM_DEBUG (dbgs () << " Number of operand pairs: " << Pairs << " \n " );
269
-
270
- for (unsigned i = 0 ; i < Pairs; ++i) {
271
- const Value *V0 = VL0[i];
272
- const Value *V1 = VL1[i];
273
- const auto *Inst0 = dyn_cast<Instruction>(V0);
274
- const auto *Inst1 = dyn_cast<Instruction>(V1);
275
-
276
- LLVM_DEBUG (dbgs () << " Pair " << i << " :\n " ;
277
- dbgs () << " mul1: " ; V0->dump ();
278
- dbgs () << " mul2: " ; V1->dump ());
279
-
280
- if (!Inst0 || !Inst1)
281
- return false ;
282
-
283
- if (Inst0->isSameOperationAs (Inst1)) {
284
- LLVM_DEBUG (dbgs () << " OK: same operation found!\n " );
285
- continue ;
286
- }
287
-
288
- const APInt *C0, *C1;
289
- if (!(match (V0, m_APInt (C0)) && match (V1, m_APInt (C1)) && C0 == C1))
290
- return false ;
291
- }
292
-
293
- LLVM_DEBUG (dbgs () << " OK: found symmetrical operand lists.\n " );
294
- return true ;
295
- }
296
-
297
268
template <typename MemInst>
298
269
static bool AreSequentialAccesses (MemInst *MemOp0, MemInst *MemOp1,
299
- MemInstList &VecMem, const DataLayout &DL,
300
- ScalarEvolution &SE) {
270
+ const DataLayout &DL, ScalarEvolution &SE) {
301
271
if (!MemOp0->isSimple () || !MemOp1->isSimple ()) {
302
272
LLVM_DEBUG (dbgs () << " No, not touching volatile access\n " );
303
273
return false ;
304
274
}
305
275
if (isConsecutiveAccess (MemOp0, MemOp1, DL, SE)) {
306
- VecMem.clear ();
307
- VecMem.push_back (MemOp0);
308
- VecMem.push_back (MemOp1);
309
276
LLVM_DEBUG (dbgs () << " OK: accesses are consecutive.\n " );
310
277
return true ;
311
278
}
@@ -328,16 +295,106 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
328
295
return false ;
329
296
}
330
297
331
- return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
298
+ if (!LoadPairs.count (Ld0) || LoadPairs[Ld0] != Ld1)
299
+ return false ;
300
+
301
+ VecMem.clear ();
302
+ VecMem.push_back (Ld0);
303
+ VecMem.push_back (Ld1);
304
+ return true ;
305
+ }
306
+
307
+ // / Iterate through the block and record base, offset pairs of loads as well as
308
+ // / maximal sequences of sequential loads.
309
+ bool ARMParallelDSP::RecordSequentialLoads (BasicBlock *Header) {
310
+ SmallVector<LoadInst*, 8 > Loads;
311
+ for (auto &I : *Header) {
312
+ auto *Ld = dyn_cast<LoadInst>(&I);
313
+ if (!Ld)
314
+ continue ;
315
+ Loads.push_back (Ld);
316
+ }
317
+
318
+ std::map<LoadInst*, LoadInst*> BaseLoads;
319
+
320
+ for (auto *Ld0 : Loads) {
321
+ for (auto *Ld1 : Loads) {
322
+ if (Ld0 == Ld1)
323
+ continue ;
324
+
325
+ if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
326
+ LoadPairs[Ld0] = Ld1;
327
+ if (BaseLoads.count (Ld0)) {
328
+ LoadInst *Base = BaseLoads[Ld0];
329
+ BaseLoads[Ld1] = Base;
330
+ SequentialLoads[Base].push_back (Ld1);
331
+ } else {
332
+ BaseLoads[Ld1] = Ld0;
333
+ SequentialLoads[Ld0].push_back (Ld1);
334
+ }
335
+ }
336
+ }
337
+ }
338
+ return LoadPairs.size () > 1 ;
332
339
}
333
340
334
- PMACPairList
335
- ARMParallelDSP::CreateParallelMACPairs (OpChainList &Candidates) {
341
+ void ARMParallelDSP::CreateParallelMACPairs (Reduction &R) {
342
+ OpChainList &Candidates = R.MACCandidates ;
343
+ PMACPairList &PMACPairs = R.PMACPairs ;
336
344
const unsigned Elems = Candidates.size ();
337
- PMACPairList PMACPairs;
338
345
339
346
if (Elems < 2 )
340
- return PMACPairs;
347
+ return ;
348
+
349
+ auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
350
+ if (!PMul0->AreSymmetrical (PMul1))
351
+ return false ;
352
+
353
+ // The first elements of each vector should be loads with sexts. If we
354
+ // find that its two pairs of consecutive loads, then these can be
355
+ // transformed into two wider loads and the users can be replaced with
356
+ // DSP intrinsics.
357
+ for (unsigned x = 0 ; x < PMul0->LHS .size (); x += 2 ) {
358
+ auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS [x]);
359
+ auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS [x]);
360
+ auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS [x]);
361
+ auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS [x]);
362
+
363
+ if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
364
+ return false ;
365
+
366
+ LLVM_DEBUG (dbgs () << " Looking at operands " << x << " :\n "
367
+ << " \t Ld0: " << *Ld0 << " \n "
368
+ << " \t Ld1: " << *Ld1 << " \n "
369
+ << " and operands " << x + 2 << " :\n "
370
+ << " \t Ld2: " << *Ld2 << " \n "
371
+ << " \t Ld3: " << *Ld3 << " \n " );
372
+
373
+ if (AreSequentialLoads (Ld0, Ld1, PMul0->VecLd )) {
374
+ if (AreSequentialLoads (Ld2, Ld3, PMul1->VecLd )) {
375
+ LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
376
+ PMACPairs.push_back (std::make_pair (PMul0, PMul1));
377
+ return true ;
378
+ } else if (AreSequentialLoads (Ld3, Ld2, PMul1->VecLd )) {
379
+ LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
380
+ LLVM_DEBUG (dbgs () << " exchanging Ld2 and Ld3\n " );
381
+ PMul1->Exchange = true ;
382
+ PMACPairs.push_back (std::make_pair (PMul0, PMul1));
383
+ return true ;
384
+ }
385
+ } else if (AreSequentialLoads (Ld1, Ld0, PMul0->VecLd ) &&
386
+ AreSequentialLoads (Ld2, Ld3, PMul1->VecLd )) {
387
+ LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
388
+ LLVM_DEBUG (dbgs () << " exchanging Ld0 and Ld1\n " );
389
+ LLVM_DEBUG (dbgs () << " and swapping muls\n " );
390
+ PMul0->Exchange = true ;
391
+ // Only the second operand can be exchanged, so swap the muls.
392
+ PMACPairs.push_back (std::make_pair (PMul1, PMul0));
393
+ return true ;
394
+ }
395
+ }
396
+ return false ;
397
+ };
341
398
342
399
SmallPtrSet<const Instruction*, 4 > Paired;
343
400
for (unsigned i = 0 ; i < Elems; ++i) {
@@ -364,77 +421,21 @@ ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
364
421
dbgs () << " - " ; Mul0->dump ();
365
422
dbgs () << " - " ; Mul1->dump ());
366
423
367
- const ValueList &Mul0_LHS = PMul0->LHS ;
368
- const ValueList &Mul0_RHS = PMul0->RHS ;
369
- const ValueList &Mul1_LHS = PMul1->LHS ;
370
- const ValueList &Mul1_RHS = PMul1->RHS ;
371
-
372
- if (!AreSymmetrical (Mul0_LHS, Mul1_LHS) ||
373
- !AreSymmetrical (Mul0_RHS, Mul1_RHS))
374
- continue ;
375
-
376
424
LLVM_DEBUG (dbgs () << " OK: mul operands list match:\n " );
377
- // The first elements of each vector should be loads with sexts. If we
378
- // find that its two pairs of consecutive loads, then these can be
379
- // transformed into two wider loads and the users can be replaced with
380
- // DSP intrinsics.
381
- bool Found = false ;
382
- for (unsigned x = 0 ; x < Mul0_LHS.size (); x += 2 ) {
383
- auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
384
- auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
385
- auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
386
- auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
387
-
388
- if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
389
- continue ;
390
-
391
- LLVM_DEBUG (dbgs () << " Looking at operands " << x << " :\n "
392
- << " \t Ld0: " << *Ld0 << " \n "
393
- << " \t Ld1: " << *Ld1 << " \n "
394
- << " and operands " << x + 2 << " :\n "
395
- << " \t Ld2: " << *Ld2 << " \n "
396
- << " \t Ld3: " << *Ld3 << " \n " );
397
-
398
- if (AreSequentialLoads (Ld0, Ld1, PMul0->VecLd )) {
399
- if (AreSequentialLoads (Ld2, Ld3, PMul1->VecLd )) {
400
- LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
401
- PMACPairs.push_back (std::make_pair (PMul0, PMul1));
402
- Found = true ;
403
- } else if (AreSequentialLoads (Ld3, Ld2, PMul1->VecLd )) {
404
- LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
405
- LLVM_DEBUG (dbgs () << " exchanging Ld2 and Ld3\n " );
406
- PMul1->Exchange = true ;
407
- PMACPairs.push_back (std::make_pair (PMul0, PMul1));
408
- Found = true ;
409
- }
410
- } else if (AreSequentialLoads (Ld1, Ld0, PMul0->VecLd )) {
411
- if (AreSequentialLoads (Ld2, Ld3, PMul1->VecLd )) {
412
- LLVM_DEBUG (dbgs () << " OK: found two pairs of parallel loads!\n " );
413
- LLVM_DEBUG (dbgs () << " exchanging Ld0 and Ld1\n " );
414
- LLVM_DEBUG (dbgs () << " and swapping muls\n " );
415
- PMul0->Exchange = true ;
416
- // Only the second operand can be exchanged, so swap the muls.
417
- PMACPairs.push_back (std::make_pair (PMul1, PMul0));
418
- Found = true ;
419
- }
420
- }
421
- }
422
- if (Found) {
425
+ if (CanPair (PMul0, PMul1)) {
423
426
Paired.insert (Mul0);
424
427
Paired.insert (Mul1);
425
428
break ;
426
429
}
427
430
}
428
431
}
429
- return PMACPairs;
430
432
}
431
433
432
- bool ARMParallelDSP::InsertParallelMACs (Reduction &Reduction,
433
- PMACPairList &PMACPairs) {
434
+ bool ARMParallelDSP::InsertParallelMACs (Reduction &Reduction) {
434
435
Instruction *Acc = Reduction.Phi ;
435
436
Instruction *InsertAfter = Reduction.AccIntAdd ;
436
437
437
- for (auto &Pair : PMACPairs) {
438
+ for (auto &Pair : Reduction. PMACPairs ) {
438
439
BinOpChain *PMul0 = Pair.first ;
439
440
BinOpChain *PMul1 = Pair.second ;
440
441
LLVM_DEBUG (dbgs () << " Found parallel MACs!!\n " ;
@@ -685,8 +686,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
685
686
for (auto &R : Reductions) {
686
687
if (AreAliased (AA, Reads, Writes, R.MACCandidates ))
687
688
return false ;
688
- PMACPairList PMACPairs = CreateParallelMACPairs (R. MACCandidates );
689
- Changed |= InsertParallelMACs (R, PMACPairs );
689
+ CreateParallelMACPairs (R);
690
+ Changed |= InsertParallelMACs (R);
690
691
}
691
692
692
693
LLVM_DEBUG (if (Changed) dbgs () << " Header block:\n " ; Header->dump (););
@@ -733,6 +734,52 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
733
734
return Call;
734
735
}
735
736
737
+ // Compare the value lists in Other to this chain.
738
+ bool BinOpChain::AreSymmetrical (BinOpChain *Other) {
739
+ // Element-by-element comparison of Value lists returning true if they are
740
+ // instructions with the same opcode or constants with the same value.
741
+ auto CompareValueList = [](const ValueList &VL0,
742
+ const ValueList &VL1) {
743
+ if (VL0.size () != VL1.size ()) {
744
+ LLVM_DEBUG (dbgs () << " Muls are mismatching operand list lengths: "
745
+ << VL0.size () << " != " << VL1.size () << " \n " );
746
+ return false ;
747
+ }
748
+
749
+ const unsigned Pairs = VL0.size ();
750
+ LLVM_DEBUG (dbgs () << " Number of operand pairs: " << Pairs << " \n " );
751
+
752
+ for (unsigned i = 0 ; i < Pairs; ++i) {
753
+ const Value *V0 = VL0[i];
754
+ const Value *V1 = VL1[i];
755
+ const auto *Inst0 = dyn_cast<Instruction>(V0);
756
+ const auto *Inst1 = dyn_cast<Instruction>(V1);
757
+
758
+ LLVM_DEBUG (dbgs () << " Pair " << i << " :\n " ;
759
+ dbgs () << " mul1: " ; V0->dump ();
760
+ dbgs () << " mul2: " ; V1->dump ());
761
+
762
+ if (!Inst0 || !Inst1)
763
+ return false ;
764
+
765
+ if (Inst0->isSameOperationAs (Inst1)) {
766
+ LLVM_DEBUG (dbgs () << " OK: same operation found!\n " );
767
+ continue ;
768
+ }
769
+
770
+ const APInt *C0, *C1;
771
+ if (!(match (V0, m_APInt (C0)) && match (V1, m_APInt (C1)) && C0 == C1))
772
+ return false ;
773
+ }
774
+
775
+ LLVM_DEBUG (dbgs () << " OK: found symmetrical operand lists.\n " );
776
+ return true ;
777
+ };
778
+
779
+ return CompareValueList (LHS, Other->LHS ) &&
780
+ CompareValueList (RHS, Other->RHS );
781
+ }
782
+
736
783
Pass *llvm::createARMParallelDSPPass () {
737
784
return new ARMParallelDSP ();
738
785
}
0 commit comments