Skip to content

Commit 13c4062

Browse files
[LSR] Make OptimizeLoopTermCond able to handle some non-cmp conditions
Currently OptimizeLoopTermCond can only convert a cmp instruction to use a post-increment induction variable, which means it can't handle predicated loops where the termination condition comes from llvm.get.active.lane.mask. Relax this restriction so that any kind of instruction can be handled, though only if it is the instruction immediately before the branch (possibly with an intervening extractelement).
1 parent 46ed084 commit 13c4062

File tree

2 files changed

+38
-19
lines changed

2 files changed

+38
-19
lines changed

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2181,8 +2181,8 @@ class LSRInstance {
21812181
SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
21822182

21832183
void OptimizeShadowIV();
2184-
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
2185-
ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
2184+
bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
2185+
Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse);
21862186
void OptimizeLoopTermCond();
21872187

21882188
void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@@ -2416,7 +2416,7 @@ void LSRInstance::OptimizeShadowIV() {
24162416

24172417
/// If Cond has an operand that is an expression of an IV, set the IV user and
24182418
/// stride information and return true, otherwise return false.
2419-
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
2419+
bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
24202420
for (IVStrideUse &U : IU)
24212421
if (U.getUser() == Cond) {
24222422
// NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2476,7 +2476,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
24762476
/// This function solves this problem by detecting this type of loop and
24772477
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
24782478
/// the instructions for the maximum computation.
2479-
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
2479+
Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) {
24802480
// Check that the loop matches the pattern we're looking for.
24812481
if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
24822482
Cond->getPredicate() != CmpInst::ICMP_NE)
@@ -2620,15 +2620,34 @@ LSRInstance::OptimizeLoopTermCond() {
26202620
// one register value.
26212621

26222622
BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
2623-
if (!TermBr)
2623+
if (!TermBr || TermBr->isUnconditional())
26242624
continue;
2625-
// FIXME: Overly conservative, termination condition could be an 'or' etc..
2626-
if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
2625+
2626+
Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
2627+
bool CondImmediatelyBeforeTerm = Cond && Cond->getNextNode() == TermBr;
2628+
// If the argument to TermBr is an extractelement, then the source of that
2629+
// instruction is what generated the condition.
2630+
auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
2631+
if (Extract) {
2632+
Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
2633+
if (Cond && CondImmediatelyBeforeTerm)
2634+
CondImmediatelyBeforeTerm = Cond->getNextNode() == Extract;
2635+
}
2636+
// FIXME: We could do more here, like handling logical operations where one
2637+
// side is a cmp that uses an induction variable.
2638+
if (!Cond)
2639+
continue;
2640+
2641+
// If the condition instruction isn't immediately before TermBr then it has
2642+
// to either be a CmpInst, or be immediately before an extract that's
2643+
// immediately before TermBr, as currently we can only move or clone a
2644+
// CmpInst.
2645+
// FIXME: We should be able to do this when it's safe to do so.
2646+
if ((!isa<CmpInst>(Cond) || Extract) && !CondImmediatelyBeforeTerm)
26272647
continue;
26282648

26292649
// Search IVUsesByStride to find Cond's IVUse if there is one.
26302650
IVStrideUse *CondUse = nullptr;
2631-
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
26322651
if (!FindIVUserForCond(Cond, CondUse))
26332652
continue;
26342653

@@ -2638,7 +2657,8 @@ LSRInstance::OptimizeLoopTermCond() {
26382657
// One consequence of doing this now is that it disrupts the count-down
26392658
// optimization. That's not always a bad thing though, because in such
26402659
// cases it may still be worthwhile to avoid a max.
2641-
Cond = OptimizeMax(Cond, CondUse);
2660+
if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
2661+
Cond = OptimizeMax(Cmp, CondUse);
26422662

26432663
// If this exiting block dominates the latch block, it may also use
26442664
// the post-inc value if it won't be shared with other uses.
@@ -2703,13 +2723,13 @@ LSRInstance::OptimizeLoopTermCond() {
27032723
// It's possible for the setcc instruction to be anywhere in the loop, and
27042724
// possible for it to have multiple users. If it is not immediately before
27052725
// the exiting block branch, move it.
2706-
if (Cond->getNextNode() != TermBr) {
2726+
if (!CondImmediatelyBeforeTerm) {
27072727
if (Cond->hasOneUse()) {
27082728
Cond->moveBefore(TermBr->getIterator());
27092729
} else {
27102730
// Clone the terminating condition and insert into the loopend.
2711-
ICmpInst *OldCond = Cond;
2712-
Cond = cast<ICmpInst>(Cond->clone());
2731+
Instruction *OldCond = Cond;
2732+
Cond = Cond->clone();
27132733
Cond->setName(L->getHeader()->getName() + ".termcond");
27142734
Cond->insertInto(ExitingBlock, TermBr->getIterator());
27152735

llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@ define void @lane_mask(ptr %dst, i64 %n) #0 {
1616
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
1717
; CHECK-NEXT: br label %[[LOOP:.*]]
1818
; CHECK: [[LOOP]]:
19-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
19+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[LOOP]] ]
2020
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[LOOP]] ]
2121
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IV]], 2
2222
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
2323
; CHECK-NEXT: tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
24-
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[VSCALEX4]]
25-
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[VSCALEX4]], [[IV]]
24+
; CHECK-NEXT: [[TMP1]] = add i64 [[IV]], [[VSCALEX4]]
2625
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
2726
; CHECK-NEXT: [[COND:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
2827
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
@@ -102,12 +101,12 @@ define void @uses_cmp_fn(ptr %dst, i64 %n) {
102101
; CHECK-NEXT: [[ENTRY:.*]]:
103102
; CHECK-NEXT: br label %[[LOOP:.*]]
104103
; CHECK: [[LOOP]]:
105-
; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[LOOP]] ], [ [[DST]], %[[ENTRY]] ]
106-
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ 1, %[[ENTRY]] ]
104+
; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ]
105+
; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[LSR_IV]], 2
106+
; CHECK-NEXT: [[LSR_IV1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
107107
; CHECK-NEXT: store i32 0, ptr [[LSR_IV1]], align 4
108-
; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
109108
; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
110-
; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
109+
; CHECK-NEXT: [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV_NEXT]])
111110
; CHECK-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT:.*]]
112111
; CHECK: [[EXIT]]:
113112
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)