Revert "[SLP]Attempt to vectorize long stores, if short one failed."
This reverts commit 6f7160eedb2db02f37d4ffd52fff7b0cf88b3fdc. This still causes large compile-time regressions in some cases.
This commit is contained in:
parent
156ab4d4fb
commit
888836930b
@ -15164,6 +15164,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
||||
BoUpSLP::ValueSet VectorizedStores;
|
||||
bool Changed = false;
|
||||
|
||||
// Stores the pair of stores (first_store, last_store) in a range, that were
|
||||
// already tried to be vectorized. Allows skipping the store ranges that were
|
||||
// already tried to be vectorized but the attempts were unsuccessful.
|
||||
DenseSet<std::pair<Value *, Value *>> TriedSequences;
|
||||
struct StoreDistCompare {
|
||||
bool operator()(const std::pair<unsigned, int> &Op1,
|
||||
const std::pair<unsigned, int> &Op2) const {
|
||||
@ -15205,10 +15209,8 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
||||
Type *ValueTy = StoreTy;
|
||||
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
|
||||
ValueTy = Trunc->getSrcTy();
|
||||
unsigned MinVF = std::max<unsigned>(
|
||||
2, PowerOf2Ceil(TTI->getStoreMinimumVF(
|
||||
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
|
||||
ValueTy)));
|
||||
unsigned MinVF = PowerOf2Ceil(TTI->getStoreMinimumVF(
|
||||
R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, ValueTy));
|
||||
|
||||
if (MaxVF < MinVF) {
|
||||
LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
|
||||
@ -15234,74 +15236,40 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
||||
VF = Size > MaxVF ? NonPowerOf2VF : Size;
|
||||
Size *= 2;
|
||||
});
|
||||
unsigned End = Operands.size();
|
||||
unsigned Repeat = 0;
|
||||
constexpr unsigned MaxAttempts = 2;
|
||||
SmallBitVector Range(Operands.size());
|
||||
while (true) {
|
||||
++Repeat;
|
||||
for (unsigned Size : CandidateVFs) {
|
||||
int StartIdx = Range.find_first_unset();
|
||||
while (StartIdx != -1) {
|
||||
int EndIdx = Range.find_next(StartIdx);
|
||||
unsigned Sz = EndIdx == -1 ? End : EndIdx;
|
||||
for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
|
||||
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
|
||||
assert(all_of(Slice,
|
||||
[&](Value *V) {
|
||||
return cast<StoreInst>(V)
|
||||
->getValueOperand()
|
||||
->getType() ==
|
||||
cast<StoreInst>(Slice.front())
|
||||
->getValueOperand()
|
||||
->getType();
|
||||
}) &&
|
||||
"Expected all operands of same type.");
|
||||
if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
|
||||
// Mark the vectorized stores so that we don't vectorize them
|
||||
// again.
|
||||
VectorizedStores.insert(Slice.begin(), Slice.end());
|
||||
// Mark the vectorized stores so that we don't vectorize them
|
||||
// again.
|
||||
Changed = true;
|
||||
// If we vectorized the initial block, no need to try to vectorize
|
||||
// it again.
|
||||
Range.set(Cnt, Cnt + Size);
|
||||
if (Cnt < StartIdx + MinVF)
|
||||
Range.set(StartIdx, Cnt);
|
||||
if (Cnt > EndIdx - Size - MinVF) {
|
||||
Range.set(Cnt + Size, EndIdx);
|
||||
End = Cnt;
|
||||
}
|
||||
Cnt += Size;
|
||||
continue;
|
||||
}
|
||||
++Cnt;
|
||||
}
|
||||
if (Sz >= End)
|
||||
break;
|
||||
StartIdx = Range.find_next_unset(EndIdx);
|
||||
unsigned StartIdx = 0;
|
||||
for (unsigned Size : CandidateVFs) {
|
||||
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
|
||||
ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
|
||||
assert(
|
||||
all_of(
|
||||
Slice,
|
||||
[&](Value *V) {
|
||||
return cast<StoreInst>(V)->getValueOperand()->getType() ==
|
||||
cast<StoreInst>(Slice.front())
|
||||
->getValueOperand()
|
||||
->getType();
|
||||
}) &&
|
||||
"Expected all operands of same type.");
|
||||
if (!VectorizedStores.count(Slice.front()) &&
|
||||
!VectorizedStores.count(Slice.back()) &&
|
||||
TriedSequences.insert(std::make_pair(Slice.front(), Slice.back()))
|
||||
.second &&
|
||||
vectorizeStoreChain(Slice, R, Cnt, MinVF)) {
|
||||
// Mark the vectorized stores so that we don't vectorize them again.
|
||||
VectorizedStores.insert(Slice.begin(), Slice.end());
|
||||
Changed = true;
|
||||
// If we vectorized the initial block, no need to try to vectorize it
|
||||
// again.
|
||||
if (Cnt == StartIdx)
|
||||
StartIdx += Size;
|
||||
Cnt += Size;
|
||||
continue;
|
||||
}
|
||||
++Cnt;
|
||||
}
|
||||
// All values vectorized - exit.
|
||||
if (Range.all())
|
||||
// Check if the whole array was vectorized already - exit.
|
||||
if (StartIdx >= Operands.size())
|
||||
break;
|
||||
// Check if all attempts were made or the last attempt is not needed at all.
|
||||
if (Repeat >= MaxAttempts)
|
||||
break;
|
||||
constexpr unsigned MaxVFScale = 4;
|
||||
constexpr unsigned StoresLimit = 16;
|
||||
const unsigned MaxTotalNum = std::min(
|
||||
std::max<unsigned>(StoresLimit, MaxVFScale * MaxVF),
|
||||
bit_floor(static_cast<unsigned>(Range.find_last_unset() -
|
||||
Range.find_first_unset() + 1)));
|
||||
if (MaxVF >= MaxTotalNum)
|
||||
break;
|
||||
// Last attempt to vectorize max number of elements, if all previous
|
||||
// attempts were unsuccessful because of the cost issues.
|
||||
CandidateVFs.clear();
|
||||
for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2)
|
||||
CandidateVFs.push_back(Size);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -100,17 +100,41 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
|
||||
define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
|
||||
; SSE-LABEL: @store_i64(
|
||||
; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
|
||||
; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
|
||||
; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
|
||||
; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
|
||||
; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
|
||||
; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], <i64 15, i64 15, i64 15, i64 15>
|
||||
; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
|
||||
; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], <i32 255, i32 255, i32 255, i32 255>
|
||||
; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
|
||||
; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>
|
||||
; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64>
|
||||
; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
|
||||
; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]]
|
||||
; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15
|
||||
; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
|
||||
; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255
|
||||
; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
|
||||
; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
|
||||
; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
|
||||
; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
|
||||
; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
|
||||
; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
|
||||
; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255
|
||||
; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
|
||||
; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
|
||||
; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
|
||||
; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
|
||||
; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
|
||||
; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
|
||||
; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255
|
||||
; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
|
||||
; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
|
||||
; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
|
||||
; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
|
||||
; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
|
||||
; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32
|
||||
; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255
|
||||
; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295
|
||||
; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255
|
||||
; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
|
||||
; SSE-NEXT: ret void
|
||||
;
|
||||
; AVX-LABEL: @store_i64(
|
||||
|
Loading…
x
Reference in New Issue
Block a user