Merge "[AMDGPU] Reorder target-specific passes" into amd-gfx

piotrAMD · Gerrit Code Review · commit 26c8789ad97a · 2020-03-18T04:51:58.000-04:00
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -857,6 +857,8 @@ bool GCNPassConfig::addPreISel() {
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
 
+  addPass(createSinkingPass());
+
   if (EnableConditionalDiscardTransformations)
     addPass(createAMDGPUConditionalDiscardPass());
 
@@ -866,7 +868,6 @@ bool GCNPassConfig::addPreISel() {
   if (!LateCFGStructurize) {
     addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   }
-  addPass(createSinkingPass());
 
   // This is a temporary fix for the issue of dealing with in loop uniform values
   // where the uses out of the loop are non-uniform. LCSSA creates a PHI at the
diff --git a/llvm/test/CodeGen/AMDGPU/discard-optimization.ll b/llvm/test/CodeGen/AMDGPU/discard-optimization.ll
@@ -189,6 +189,35 @@ define amdgpu_ps <4 x float> @wqm_kill_to_demote2(<8 x i32> inreg %rsrc, <4 x i3
   ret <4 x float> %rtex
 }
 
+
+; GCN-LABEL: {{^}}sinking_image_sample:
+; GCN-NEXT: ; %.entry
+; GCN-NOT: image_sample
+; GCN: s_cbranch_exec
+; GCN: image_sample
+define amdgpu_ps void @sinking_image_sample(float %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, float %arg3) {
+.entry:
+  %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 7, float %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i1 false, i32 0, i32 0)
+  %tmp1 = fcmp olt float %arg3, 0.000000e+00
+  br i1 %tmp1, label %kill_br, label %next
+
+kill_br:
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %exit
+
+next:
+  %tmp2 = extractelement <4 x float> %tmp0, i32 2
+  %tmp3 = extractelement <4 x float> %tmp0, i32 3
+  %tmp4 = fadd reassoc nnan nsz arcp contract float %tmp2, %tmp3
+  br label %exit
+
+exit:                                            ; preds = %next, %kill_br
+  %outp = phi float [ %tmp4, %next ], [ undef, %kill_br]
+  call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float %outp, float %outp, float %outp, float %outp, i1 immarg true, i1 immarg true)
+  ret void
+}
+
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 
diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -6,11 +6,11 @@
 ; with exec.
 
 ; GCN-LABEL: {{^}}needs_and:
-; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
-; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
 
+; GCN: s_or_b64 exec, exec, [[REG1:[^ ,]*]]
+; GCN: s_andn2_b64 exec, exec, [[REG2:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1:[^ ,]*]], [[REG2:[^ ,]*]]
+; GCN: s_or_b64 exec, exec, [[REG2:[^ ,]*]]
 define void @needs_and(i32 %arg) {
 entry:
   br label %loop