Skip to content
This repository was archived by the owner on Feb 5, 2019. It is now read-only.

Commit 0a1353a

Browse files
committed
[ARM] Small reorganisation in ARMParallelDSP
A few code movement changes:
- AreSymmetrical is now a method of BinOpChain.
- Created a lambda in CreateParallelMACPairs to reduce loop nesting.
- A Reduction object now gets passed in a couple of places instead, including to CreateParallelMACPairs, so that function no longer needs to return a value.

I've also added RecordSequentialLoads, which runs before the transformation begins and caches the interesting loads. This cache can then be queried later instead of cross-checking many load values.

Differential Revision: https://reviews.llvm.org/D54254
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@346479 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 5de005f commit 0a1353a

File tree

1 file changed

+161
-114
lines changed

1 file changed

+161
-114
lines changed

lib/Target/ARM/ARMParallelDSP.cpp

Lines changed: 161 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,18 @@ namespace {
9999
for (auto *V : RHS)
100100
AllValues.push_back(V);
101101
}
102+
103+
bool AreSymmetrical(BinOpChain *Other);
102104
};
103105

104106
struct Reduction {
105107
PHINode *Phi; // The Phi-node from where we start
106108
// pattern matching.
107109
Instruction *AccIntAdd; // The accumulating integer add statement,
108110
// i.e, the reduction statement.
109-
110111
OpChainList MACCandidates; // The MAC candidates associated with
111112
// this reduction statement.
113+
PMACPairList PMACPairs;
112114
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
113115
};
114116

@@ -121,10 +123,13 @@ namespace {
121123
Loop *L;
122124
const DataLayout *DL;
123125
Module *M;
126+
std::map<LoadInst*, LoadInst*> LoadPairs;
127+
std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
124128

125-
bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
129+
bool RecordSequentialLoads(BasicBlock *Header);
130+
bool InsertParallelMACs(Reduction &Reduction);
126131
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
127-
PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
132+
void CreateParallelMACPairs(Reduction &R);
128133
Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
129134
Instruction *Acc, bool Exchange,
130135
Instruction *InsertAfter);
@@ -202,6 +207,12 @@ namespace {
202207

203208
LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
204209
LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
210+
211+
if (!RecordSequentialLoads(Header)) {
212+
LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
213+
return false;
214+
}
215+
205216
Changes = MatchSMLAD(F);
206217
return Changes;
207218
}
@@ -254,58 +265,14 @@ static bool IsNarrowSequence(Value *V, ValueList &VL) {
254265
return false;
255266
}
256267

257-
// Element-by-element comparison of Value lists returning true if they are
258-
// instructions with the same opcode or constants with the same value.
259-
static bool AreSymmetrical(const ValueList &VL0,
260-
const ValueList &VL1) {
261-
if (VL0.size() != VL1.size()) {
262-
LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
263-
<< VL0.size() << " != " << VL1.size() << "\n");
264-
return false;
265-
}
266-
267-
const unsigned Pairs = VL0.size();
268-
LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
269-
270-
for (unsigned i = 0; i < Pairs; ++i) {
271-
const Value *V0 = VL0[i];
272-
const Value *V1 = VL1[i];
273-
const auto *Inst0 = dyn_cast<Instruction>(V0);
274-
const auto *Inst1 = dyn_cast<Instruction>(V1);
275-
276-
LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
277-
dbgs() << "mul1: "; V0->dump();
278-
dbgs() << "mul2: "; V1->dump());
279-
280-
if (!Inst0 || !Inst1)
281-
return false;
282-
283-
if (Inst0->isSameOperationAs(Inst1)) {
284-
LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
285-
continue;
286-
}
287-
288-
const APInt *C0, *C1;
289-
if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
290-
return false;
291-
}
292-
293-
LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
294-
return true;
295-
}
296-
297268
template<typename MemInst>
298269
static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
299-
MemInstList &VecMem, const DataLayout &DL,
300-
ScalarEvolution &SE) {
270+
const DataLayout &DL, ScalarEvolution &SE) {
301271
if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
302272
LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
303273
return false;
304274
}
305275
if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
306-
VecMem.clear();
307-
VecMem.push_back(MemOp0);
308-
VecMem.push_back(MemOp1);
309276
LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
310277
return true;
311278
}
@@ -328,16 +295,106 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
328295
return false;
329296
}
330297

331-
return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
298+
if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
299+
return false;
300+
301+
VecMem.clear();
302+
VecMem.push_back(Ld0);
303+
VecMem.push_back(Ld1);
304+
return true;
305+
}
306+
307+
/// Iterate through the block and record base, offset pairs of loads as well as
308+
/// maximal sequences of sequential loads.
309+
bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
310+
SmallVector<LoadInst*, 8> Loads;
311+
for (auto &I : *Header) {
312+
auto *Ld = dyn_cast<LoadInst>(&I);
313+
if (!Ld)
314+
continue;
315+
Loads.push_back(Ld);
316+
}
317+
318+
std::map<LoadInst*, LoadInst*> BaseLoads;
319+
320+
for (auto *Ld0 : Loads) {
321+
for (auto *Ld1 : Loads) {
322+
if (Ld0 == Ld1)
323+
continue;
324+
325+
if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
326+
LoadPairs[Ld0] = Ld1;
327+
if (BaseLoads.count(Ld0)) {
328+
LoadInst *Base = BaseLoads[Ld0];
329+
BaseLoads[Ld1] = Base;
330+
SequentialLoads[Base].push_back(Ld1);
331+
} else {
332+
BaseLoads[Ld1] = Ld0;
333+
SequentialLoads[Ld0].push_back(Ld1);
334+
}
335+
}
336+
}
337+
}
338+
return LoadPairs.size() > 1;
332339
}
333340

334-
PMACPairList
335-
ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
341+
void ARMParallelDSP::CreateParallelMACPairs(Reduction &R) {
342+
OpChainList &Candidates = R.MACCandidates;
343+
PMACPairList &PMACPairs = R.PMACPairs;
336344
const unsigned Elems = Candidates.size();
337-
PMACPairList PMACPairs;
338345

339346
if (Elems < 2)
340-
return PMACPairs;
347+
return;
348+
349+
auto CanPair = [&](BinOpChain *PMul0, BinOpChain *PMul1) {
350+
if (!PMul0->AreSymmetrical(PMul1))
351+
return false;
352+
353+
// The first elements of each vector should be loads with sexts. If we
354+
// find that there are two pairs of consecutive loads, then these can be
355+
// transformed into two wider loads and the users can be replaced with
356+
// DSP intrinsics.
357+
for (unsigned x = 0; x < PMul0->LHS.size(); x += 2) {
358+
auto *Ld0 = dyn_cast<LoadInst>(PMul0->LHS[x]);
359+
auto *Ld1 = dyn_cast<LoadInst>(PMul1->LHS[x]);
360+
auto *Ld2 = dyn_cast<LoadInst>(PMul0->RHS[x]);
361+
auto *Ld3 = dyn_cast<LoadInst>(PMul1->RHS[x]);
362+
363+
if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
364+
return false;
365+
366+
LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
367+
<< "\t Ld0: " << *Ld0 << "\n"
368+
<< "\t Ld1: " << *Ld1 << "\n"
369+
<< "and operands " << x + 2 << ":\n"
370+
<< "\t Ld2: " << *Ld2 << "\n"
371+
<< "\t Ld3: " << *Ld3 << "\n");
372+
373+
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
374+
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
375+
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
376+
PMACPairs.push_back(std::make_pair(PMul0, PMul1));
377+
return true;
378+
} else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
379+
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
380+
LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
381+
PMul1->Exchange = true;
382+
PMACPairs.push_back(std::make_pair(PMul0, PMul1));
383+
return true;
384+
}
385+
} else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) &&
386+
AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
387+
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
388+
LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
389+
LLVM_DEBUG(dbgs() << " and swapping muls\n");
390+
PMul0->Exchange = true;
391+
// Only the second operand can be exchanged, so swap the muls.
392+
PMACPairs.push_back(std::make_pair(PMul1, PMul0));
393+
return true;
394+
}
395+
}
396+
return false;
397+
};
341398

342399
SmallPtrSet<const Instruction*, 4> Paired;
343400
for (unsigned i = 0; i < Elems; ++i) {
@@ -364,77 +421,21 @@ ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
364421
dbgs() << "- "; Mul0->dump();
365422
dbgs() << "- "; Mul1->dump());
366423

367-
const ValueList &Mul0_LHS = PMul0->LHS;
368-
const ValueList &Mul0_RHS = PMul0->RHS;
369-
const ValueList &Mul1_LHS = PMul1->LHS;
370-
const ValueList &Mul1_RHS = PMul1->RHS;
371-
372-
if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
373-
!AreSymmetrical(Mul0_RHS, Mul1_RHS))
374-
continue;
375-
376424
LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
377-
// The first elements of each vector should be loads with sexts. If we
378-
// find that its two pairs of consecutive loads, then these can be
379-
// transformed into two wider loads and the users can be replaced with
380-
// DSP intrinsics.
381-
bool Found = false;
382-
for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
383-
auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
384-
auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
385-
auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
386-
auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
387-
388-
if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
389-
continue;
390-
391-
LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
392-
<< "\t Ld0: " << *Ld0 << "\n"
393-
<< "\t Ld1: " << *Ld1 << "\n"
394-
<< "and operands " << x + 2 << ":\n"
395-
<< "\t Ld2: " << *Ld2 << "\n"
396-
<< "\t Ld3: " << *Ld3 << "\n");
397-
398-
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
399-
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
400-
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
401-
PMACPairs.push_back(std::make_pair(PMul0, PMul1));
402-
Found = true;
403-
} else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) {
404-
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
405-
LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n");
406-
PMul1->Exchange = true;
407-
PMACPairs.push_back(std::make_pair(PMul0, PMul1));
408-
Found = true;
409-
}
410-
} else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd)) {
411-
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
412-
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
413-
LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n");
414-
LLVM_DEBUG(dbgs() << " and swapping muls\n");
415-
PMul0->Exchange = true;
416-
// Only the second operand can be exchanged, so swap the muls.
417-
PMACPairs.push_back(std::make_pair(PMul1, PMul0));
418-
Found = true;
419-
}
420-
}
421-
}
422-
if (Found) {
425+
if (CanPair(PMul0, PMul1)) {
423426
Paired.insert(Mul0);
424427
Paired.insert(Mul1);
425428
break;
426429
}
427430
}
428431
}
429-
return PMACPairs;
430432
}
431433

432-
bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
433-
PMACPairList &PMACPairs) {
434+
bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction) {
434435
Instruction *Acc = Reduction.Phi;
435436
Instruction *InsertAfter = Reduction.AccIntAdd;
436437

437-
for (auto &Pair : PMACPairs) {
438+
for (auto &Pair : Reduction.PMACPairs) {
438439
BinOpChain *PMul0 = Pair.first;
439440
BinOpChain *PMul1 = Pair.second;
440441
LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
@@ -685,8 +686,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
685686
for (auto &R : Reductions) {
686687
if (AreAliased(AA, Reads, Writes, R.MACCandidates))
687688
return false;
688-
PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
689-
Changed |= InsertParallelMACs(R, PMACPairs);
689+
CreateParallelMACPairs(R);
690+
Changed |= InsertParallelMACs(R);
690691
}
691692

692693
LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
@@ -733,6 +734,52 @@ Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
733734
return Call;
734735
}
735736

737+
// Compare the value lists in Other to this chain.
738+
bool BinOpChain::AreSymmetrical(BinOpChain *Other) {
739+
// Element-by-element comparison of Value lists returning true if they are
740+
// instructions with the same opcode or constants with the same value.
741+
auto CompareValueList = [](const ValueList &VL0,
742+
const ValueList &VL1) {
743+
if (VL0.size() != VL1.size()) {
744+
LLVM_DEBUG(dbgs() << "Muls are mismatching operand list lengths: "
745+
<< VL0.size() << " != " << VL1.size() << "\n");
746+
return false;
747+
}
748+
749+
const unsigned Pairs = VL0.size();
750+
LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
751+
752+
for (unsigned i = 0; i < Pairs; ++i) {
753+
const Value *V0 = VL0[i];
754+
const Value *V1 = VL1[i];
755+
const auto *Inst0 = dyn_cast<Instruction>(V0);
756+
const auto *Inst1 = dyn_cast<Instruction>(V1);
757+
758+
LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
759+
dbgs() << "mul1: "; V0->dump();
760+
dbgs() << "mul2: "; V1->dump());
761+
762+
if (!Inst0 || !Inst1)
763+
return false;
764+
765+
if (Inst0->isSameOperationAs(Inst1)) {
766+
LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
767+
continue;
768+
}
769+
770+
const APInt *C0, *C1;
771+
if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
772+
return false;
773+
}
774+
775+
LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
776+
return true;
777+
};
778+
779+
return CompareValueList(LHS, Other->LHS) &&
780+
CompareValueList(RHS, Other->RHS);
781+
}
782+
736783
// Factory entry point: hands back a newly allocated ARMParallelDSP pass
// object through the generic Pass interface.
Pass *llvm::createARMParallelDSPPass() {
  Pass *ParallelDSP = new ARMParallelDSP();
  return ParallelDSP;
}

0 commit comments

Comments
 (0)