-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU] Always lower s/udiv64 by constant to MUL #100723
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
||
; Sample test to check how we deal with division/modulos by 64 bit constants. | ||
|
||
define noundef i64 @srem64_3(i64 noundef %i) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you only check the second commit, you can see the ISA diff.
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) Changes: Solves #100383. Patch is 129.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100723.diff 9 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 9d9886f4920a2..fd45fd9566c95 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5109,6 +5109,14 @@ class TargetLowering : public TargetLoweringBase {
return 0;
}
+ /// Indicate whether this target prefers to force the lowering of a div by
+ /// constant to a MUL of a magic value, even if the MUL operation is not legal
+ /// or custom.
+ virtual bool forceDivByConstantToMul(EVT DivVT, EVT MuLVT,
+ bool IsAfterLegalization) const {
+ return false;
+ }
+
/// Hooks for building estimates in place of slower divisions and square
/// roots.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 140c97ccd90ba..315cf2156bd8c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6405,7 +6405,8 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
if (VT.isVector())
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
VT.getVectorElementCount());
- if (isOperationLegalOrCustom(ISD::MUL, WideVT)) {
+ if (isOperationLegalOrCustom(ISD::MUL, WideVT) ||
+ forceDivByConstantToMul(VT, WideVT, IsAfterLegalization)) {
X = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, X);
Y = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Y);
Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
@@ -6588,7 +6589,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
if (VT.isVector())
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
VT.getVectorElementCount());
- if (isOperationLegalOrCustom(ISD::MUL, WideVT)) {
+ if (isOperationLegalOrCustom(ISD::MUL, WideVT) ||
+ forceDivByConstantToMul(VT, WideVT, IsAfterLegalization)) {
X = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, X);
Y = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Y);
Y = DAG.getNode(ISD::MUL, dl, WideVT, X, Y);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1f198a92c0fa6..e4cdd38c69aa1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -372,6 +372,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
return 2;
}
+ bool forceDivByConstantToMul(EVT DivVT, EVT MuLVT,
+ bool IsAfterLegalization) const override {
+ return !IsAfterLegalization;
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 96e92bb3dce0d..e4756ad3817c2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7066,202 +7066,57 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX6-LABEL: udiv_i64_oddk_denom:
; GFX6: ; %bb.0:
-; GFX6-NEXT: s_add_u32 s4, 3, 0
-; GFX6-NEXT: v_mov_b32_e32 v0, 0xe3e0f6
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT: s_addc_u32 s5, 0, 0
-; GFX6-NEXT: s_or_b32 s4, vcc_lo, vcc_hi
-; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_mov_b32 s4, 0x68958c89
-; GFX6-NEXT: s_movk_i32 s6, 0xfee0
-; GFX6-NEXT: v_mul_lo_u32 v1, v0, s6
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, s4
-; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s6, s5, 0x68958c89
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, s6, v1
-; GFX6-NEXT: v_mul_lo_u32 v3, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v5, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v6, s5, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2
-; GFX6-NEXT: s_movk_i32 s8, 0x11f
-; GFX6-NEXT: s_mov_b32 s9, 0x976a7377
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v6, vcc
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s5
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x64c139ef
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x38f83e5
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0
; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1
-; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT: v_mul_hi_u32 v3, s3, v1
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT: s_mul_i32 s1, s3, 0x64c139ef
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, s1, v4
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mul_i32 s0, s2, 0x38f83e5
+; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, s0, v4
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0
-; GFX6-NEXT: s_mov_b32 s4, s0
-; GFX6-NEXT: s_mov_b32 s5, s1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9
-; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9
-; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2
-; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3
-; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3
-; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT: s_movk_i32 s2, 0x11e
-; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4
-; GFX6-NEXT: s_mov_b32 s9, 0x976a7376
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0
-; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0
-; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
-; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1]
-; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; GFX6-NEXT: v_mov_b32_e32 v6, s3
-; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX6-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; GFX6-NEXT: s_mul_i32 s0, s3, 0x38f83e5
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, s0, v2
+; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v0, v3, vcc
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, 3, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xe3e0f6
-; GFX9-NEXT: s_addc_u32 s1, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: s_addc_u32 s0, s1, 0
-; GFX9-NEXT: s_mul_i32 s3, s2, 0xfffffee0
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0x68958c89
-; GFX9-NEXT: s_mul_i32 s1, s0, 0x68958c89
-; GFX9-NEXT: s_add_i32 s3, s8, s3
-; GFX9-NEXT: s_add_i32 s3, s3, s1
-; GFX9-NEXT: s_mul_i32 s9, s2, 0x68958c89
-; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3
-; GFX9-NEXT: s_mul_i32 s8, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s2, s9
-; GFX9-NEXT: s_add_u32 s2, s2, s8
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
-; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9
-; GFX9-NEXT: s_mul_i32 s9, s0, s9
-; GFX9-NEXT: s_add_u32 s2, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s3
-; GFX9-NEXT: s_addc_u32 s1, s1, s10
-; GFX9-NEXT: s_addc_u32 s2, s8, 0
-; GFX9-NEXT: s_mul_i32 s3, s0, s3
-; GFX9-NEXT: s_add_u32 s1, s1, s3
-; GFX9-NEXT: s_addc_u32 s2, 0, s2
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s0, s0, s2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s6, s0
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s3
-; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0
-; GFX9-NEXT: s_add_u32 s2, s8, s2
-; GFX9-NEXT: s_addc_u32 s1, 0, s1
-; GFX9-NEXT: s_mul_hi_u32 s9, s7, s3
-; GFX9-NEXT: s_mul_i32 s3, s7, s3
-; GFX9-NEXT: s_add_u32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s7, s0
-; GFX9-NEXT: s_addc_u32 s1, s1, s9
-; GFX9-NEXT: s_addc_u32 s2, s8, 0
-; GFX9-NEXT: s_mul_i32 s0, s7, s0
-; GFX9-NEXT: s_add_u32 s3, s1, s0
-; GFX9-NEXT: s_addc_u32 s2, 0, s2
-; GFX9-NEXT: s_mul_i32 s0, s3, 0x11f
-; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x976a7377
-; GFX9-NEXT: s_add_i32 s0, s8, s0
-; GFX9-NEXT: s_mul_i32 s8, s2, 0x976a7377
-; GFX9-NEXT: s_mul_i32 s9, s3, 0x976a7377
-; GFX9-NEXT: s_add_i32 s8, s0, s8
-; GFX9-NEXT: v_mov_b32_e32 v0, s9
-; GFX9-NEXT: s_sub_i32 s0, s7, s8
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT: s_mov_b32 s1, 0x976a7377
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s6, s0, 0x11f
-; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s1, v0
-; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: s_subb_u32 s6, s6, 0
-; GFX9-NEXT: s_cmpk_gt_u32 s6, 0x11e
-; GFX9-NEXT: s_mov_b32 s10, 0x976a7376
-; GFX9-NEXT: s_cselect_b32 s9, -1, 0
-; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s10, v1
-; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x11f
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s9
-; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s3, 1
-; GFX9-NEXT: s_addc_u32 s6, s2, 0
-; GFX9-NEXT: s_add_u32 s1, s3, 2
-; GFX9-NEXT: s_addc_u32 s9, s2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s9
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; GFX9-NEXT: s_subb_u32 s0, s7, s8
-; GFX9-NEXT: s_cmpk_gt_u32 s0, 0x11e
-; GFX9-NEXT: s_cselect_b32 s1, -1, 0
-; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0
-; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x11f
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s1
-; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v4, s2
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT: s_mul_hi_u32 s0, s6, 0x38f83e5
+; GFX9-NEXT: s_mul_i32 s1, s6, 0x38f83e5
+; GFX9-NEXT: s_mul_i32 s3, s7, 0x64c139ef
+; GFX9-NEXT: s_mul_hi_u32 s6, s6, 0x64c139ef
+; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x64c139ef
+; GFX9-NEXT: s_add_u32 s3, s3, s6
+; GFX9-NEXT: s_addc_u32 s2, s2, 0
+; GFX9-NEXT: s_add_u32 s1, s1, s3
+; GFX9-NEXT: s_addc_u32 s0, s0, 0
+; GFX9-NEXT: s_add_u32 s0, s2, s0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: s_mul_i32 s3, s7, 0x38f83e5
+; GFX9-NEXT: s_mul_hi_u32 s2, s7, 0x38f83e5
+; GFX9-NEXT: s_add_u32 s0, s3, s0
+; GFX9-NEXT: s_addc_u32 s0, s2, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = udiv i64 %x, 1235195949943
store i64 %r, ptr addrspace(1) %out
@@ -7405,84 +7260,34 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
-; GFX6-NEXT: s_mov_b32 s2, 0x2ff2fc01
-; GFX6-NEXT: v_bfrev_b32_e32 v0, 7
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12
-; GFX6-NEXT: s_add_u32 s2, 0xe037f, s2
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT: s_addc_u32 s3, 0, 0
-; GFX6-NEXT: s_or_b32 s2, vcc_lo, vcc_hi
-; GFX6-NEXT: s_cmp_lg_u32 s2, 0
-; GFX6-NEXT: s_movk_i32 s2, 0xf001
-; GFX6-NEXT: v_mul_hi_u32 v1, v0, s2
-; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT: s_addc_u32 s8, s3, 0x1000ff
-; GFX6-NEXT: s_mul_i32 s3, s8, 0xfffff001
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v0
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, s3, v1
-; GFX6-NEXT: v_mul_lo_u32 v3, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2
-; GFX6-NEXT: v_mul_hi_u32 v5, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v6, s8, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; GFX6-NEXT: v_mul_lo_u32 v5, s8, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s8, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, 0x10010011
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x100100
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v6, vcc
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
+; GFX6-NEXT: s_mul_i32 s9, s7, 0x10010011
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, s9, v3
+; GFX6-NEXT: s_mul_i32 s8, s6, 0x100100
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, s8, v3
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc
-; GFX6-NEXT: v_mov_b32_e32 v3, s8
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
-; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1
-; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0
-; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1
-; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1
-; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
-; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
-; GFX6-NEXT: s_movk_i32 s8, 0xfff
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; GFX6-NEXT: v_mul_lo_u32 v4, v1, s8
-; GFX6-NEXT: v_mul_hi_u32 v5, v0, s8
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT: v_mul_lo_u32 v8, v0, s8
-; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 2, v0
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, s7
-; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s6, v8
-; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s8, v8
-; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
-; GFX6-NEXT: s_movk_i32 s6, 0xffe
-; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v8
-; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v5, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX6-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc
+; GFX6-NEXT: s_mul_i32 s8, s7, 0x100100
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, s8, v1
+; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v0, v2, vcc
+; GFX6-NEXT: v_mov_b32_e32 v1, s7
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v3
+; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[0:1], 11
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -7492,95 +7297,34 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
-; GFX9-NEXT: s_mov_b32 s8, 0x2ff2fc01
-; GFX9-NEXT: v_bfrev_b32_e32 v0, 7
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12
-; GFX9-NEXT: s_add_u32 s4, 0xe037f, s8
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0x1000ff
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, 0xfffff001
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xfffff001
-; GFX9-NEXT: s_sub_i32 s9, s9, s8
-; GFX9-NEXT: s_add_i32 s9, s9, s5
-; GFX9-NEXT: s_mul_i32 s11, s8, 0xfffff001
-; GFX9-NEXT: s_mul_hi_u32 s5, s8, s9
-; GFX9-NEXT: s_mul_i32 s10, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s8, s11
-; GFX9-NEXT: s_add_u32 s8, s8, s10
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s12, s4, s11
-; GFX9-NEXT: s_mul_i32 s11, s4, s11
-; GFX9-NEXT: s_add_u32 s8, s8, s11
-; GFX9-NEXT: s_mul_hi_u32 s10, s4, s9
-; GFX9-NEXT: s_addc_u32 s5, s5, s12
-; GFX9-NEXT: s_addc_u32 s8, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s4, s9
+; GFX9-NEXT: s_mul_i32 s9, s7, 0x10010011
+; GFX9-NEXT: s_mul_hi_u32 s10, s6, 0x10010011
+; GFX9-NEXT: s_mul_hi_u32 s8, s7, 0x10010011
+; GFX9-NEXT: s_add_u32 s9, s9, s10
+; GFX9-NEXT: s_mul_i32 s5, s6, 0x100100
+; GFX9-NEXT: s_addc_u32 s8, s8, 0
+; GFX9-NEXT: s_mul_hi_u32 s4, s6, 0x100100
; GFX9-NEXT: s_add_u32 s5, s5, s9
-; GFX9-NEXT: s_addc_u32 s8, 0, s8
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
-; GFX9-NEXT:...
[truncated]
|
@@ -6588,7 +6589,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, | |||
if (VT.isVector()) | |||
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, | |||
VT.getVectorElementCount()); | |||
if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { | |||
if (isOperationLegalOrCustom(ISD::MUL, WideVT) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried expanding to MULH/MUL_LOHI, but the codegen is a bit worse; there are a few more instructions. Mul seems best.
@@ -6405,7 +6405,8 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, | |||
if (VT.isVector()) | |||
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT, | |||
VT.getVectorElementCount()); | |||
if (isOperationLegalOrCustom(ISD::MUL, WideVT)) { | |||
if (isOperationLegalOrCustom(ISD::MUL, WideVT) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would make more sense to just delete the check
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tried enabling it for all targets but it doesn't seem to always be profitable. At minimum we need an isTypeLegal check, otherwise some targets even crash (riscv32).
I'll have another look
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removing the check causes about 30 failures. Using only isTypeLegal on the wide type doesn't cause more than 1-2 test changes but then it doesn't work on AMDGPU because i128 isn't legal
What I can do I think is refactor this so the isOperationLegalOrCustom check goes away into the TLI hook?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I primarily want to not have any new TLI hook. Quantity of test changes doesn't mean it's worse, just update them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume this should be something like !LegalOperations || isOperationLegalOrCustom(MUL)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I can update the tests, but we still need some kind of legality check, otherwise we get crashes in a few places. RISCV seems especially sensitive to that and there's a few crashes I can't seem to get rid of without getting rid of the AMDGPU improvements.
Why is the TLI hook not desirable? We already have some for other expansions.
I could make an enum instead to decide which expansion method to use, and targets would default to a legal method while AMDGPU can force MUL expansion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is the TLI hook not desirable?
Because we already have way too many garbage single use, undiscoverable hacks for specific combines like this one.
We already have some for other expansions.
That's the problem. They're all hacky shortcuts to avoid touching all the targets or coming up with a reasonable heuristic in terms of the existing APIs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, that makes sense.
I think I'll need to dive a bit into the RISCV backend to see why it's so prone to crashing on this.
The heuristic also needs a bit more thought put into it. I feel like we shouldn't blindly apply this combine, but it should also not rely on legality too much, otherwise it won't kick in for AMDGPU.
if ((isOperationExpand(ISD::SDIV, VT) && | ||
isOperationCustom(ISD::SDIVREM, VT.getScalarType())) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This logic probably isn't exactly right; it probably shouldn't be looking at the scalar type for no reason. However the DAG legalization rules don't really let you figure out if a vector operation is going to be scalarized or not
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I also don't especially like it, it feels very targeted, but I also couldn't find any alternative that didn't cause crashes or regressions in other backends.
Checking the scalar type is indeed because the flow I observed is (vector SDIV) -> (scalar SDIV) -> (scalar SDIVREM)
entry: | ||
%div = udiv i64 %i, 6 | ||
ret i64 %div | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this need power of 2 tests? What about values around int32_max?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added tests for s/u div/rem for 2, 64, int32 min, int32 max.
Heads up: This patch also affected ARM, which might cause a SelectionDAG issue for a Halide use case.
|
Enclosed is a Halide-generated .ll that will assert-fail when run through |
This reverts commit 92fbc96. The patch also affected ARM and caused an assertion failure during CurDAG->Legalize (#100723 (comment)).
Reverted
The
Cc @davemgreen in case there is test coverage opportunity for llvm/test/CodeGen/ARM |
Reland llvm#100723, fixing the ARM issue at the cost of a small regression in AMDGPU. Solves llvm#100383
Solves #100383