llvm
diff --git a/‎llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Lines changed: 6 additions & 3 deletions b/‎llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Lines changed: 6 additions & 3 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
Lines changed: 671 additions & 648 deletions b/‎llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
Lines changed: 671 additions & 648 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/bitop3.ll
Lines changed: 8 additions & 18 deletions b/‎llvm/test/CodeGen/AMDGPU/bitop3.ll
Lines changed: 8 additions & 18 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
Lines changed: 72 additions & 73 deletions b/‎llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
Lines changed: 72 additions & 73 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/build_vector.ll
Lines changed: 7 additions & 7 deletions b/‎llvm/test/CodeGen/AMDGPU/build_vector.ll
Lines changed: 7 additions & 7 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
Lines changed: 15 additions & 15 deletions b/‎llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
Lines changed: 15 additions & 15 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/test/CodeGen/AMDGPU/fabs.f16.ll
Lines changed: 14 additions & 14 deletions b/‎llvm/test/CodeGen/AMDGPU/fabs.f16.ll
Lines changed: 14 additions & 14 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/fabs.ll
Lines changed: 7 additions & 7 deletions b/‎llvm/test/CodeGen/AMDGPU/fabs.ll
Lines changed: 7 additions & 7 deletions
@@ -325,8 +325,8 @@ class Vectorizer {
       Instruction *ChainElem, Instruction *ChainBegin,
       const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);
 
-  /// Merge the equivalence classes if casts could be inserted in one to match
-  /// the scalar bitwidth of the instructions in the other class.
+  /// Merge equivalence classes if casts could be inserted in one to match
+  /// the total bitwidth of the instructions.
   void insertCastsToMergeClasses(EquivalenceClassMap &EQClasses);
 
   /// Merges the equivalence classes if they have underlying objects that differ
@@ -1346,7 +1346,10 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
   DenseSet<EqClassKey> ClassesToErase;
   for (auto EC1 : EQClasses) {
     for (auto EC2 : EQClasses) {
-      if (ClassesToErase.contains(EC2.first) || EC1 <= EC2)
+      // Skip if EC2 was already merged before, EC1 follows EC2 in the
+      // collection or EC1 is the same as EC2.
+      if (ClassesToErase.contains(EC2.first) || EC1 <= EC2 ||
+          EC1.first == EC2.first)
         continue;
 
       auto [Ptr1, AS1, TySize1, IsLoad1] = EC1.first;
 
@@ -113,15 +113,10 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
-; GFX950-SDAG-LABEL: and_and_and:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GFX950-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: and_and_and:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GFX950-GISEL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: and_and_and:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GCN-NEXT:    ; return to shader part epilog
   %and1 = and i32 %a, %c
   %and2 = and i32 %and1, %b
   %ret_cast = bitcast i32 %and2 to float
@@ -131,15 +126,10 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
 ; ========= Multi bit functions =========
 
 define amdgpu_ps float @test_12(i32 %a, i32 %b) {
-; GFX950-SDAG-LABEL: test_12:
-; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GFX950-SDAG-NEXT:    ; return to shader part epilog
-;
-; GFX950-GISEL-LABEL: test_12:
-; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GFX950-GISEL-NEXT:    ; return to shader part epilog
+; GCN-LABEL: test_12:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GCN-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %and1 = and i32 %nota, %b
   %ret_cast = bitcast i32 %and1 to float
 
@@ -271,13 +271,13 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out,
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mov_b32_e32 v0, s2
-; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    s_lshl_b32 s0, s3, 16
+; GFX8-NEXT:    s_lshl_b32 s1, s2, 16
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle:
 
@@ -10,8 +10,8 @@ define amdgpu_ps i32 @s_or_i32_disjoint(i32 inreg %a, i32 inreg %b) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; CHECK-NEXT:   %3:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
-  ; CHECK-NEXT:   $sgpr0 = COPY %3
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY1]], [[COPY]], implicit-def dead $scc
+  ; CHECK-NEXT:   $sgpr0 = COPY [[S_OR_B32_]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG $sgpr0
   %result = or disjoint i32 %a, %b
   ret i32 %result
@@ -26,10 +26,10 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; CHECK-NEXT:   %5:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
-  ; CHECK-NEXT:   %6:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
-  ; CHECK-NEXT:   $sgpr0 = COPY %5
-  ; CHECK-NEXT:   $sgpr1 = COPY %6
+  ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], [[COPY1]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc
+  ; CHECK-NEXT:   $sgpr0 = COPY [[S_OR_B32_]]
+  ; CHECK-NEXT:   $sgpr1 = COPY [[S_OR_B32_1]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
@@ -42,8 +42,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY %10
+  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
   %result = or disjoint i32 %a, %b
   ret i32 %result
@@ -58,10 +58,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
-  ; CHECK-NEXT:   %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY %12
-  ; CHECK-NEXT:   $vgpr1 = COPY %13
+  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
+  ; CHECK-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
+  ; CHECK-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
@@ -78,9 +78,9 @@ define amdgpu_ps i64 @s_or_i64_disjoint(i64 inreg %a, i64 inreg %b) {
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
-  ; CHECK-NEXT:   %7:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY %7.sub1
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY %7.sub0
+  ; CHECK-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0
   ; CHECK-NEXT:   $sgpr0 = COPY [[COPY5]]
   ; CHECK-NEXT:   $sgpr1 = COPY [[COPY4]]
   ; CHECK-NEXT:   SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
 
@@ -4,7 +4,7 @@
 define amdgpu_kernel void @divrem24_assume(ptr addrspace(1) %arg, i32 %arg1) {
 ; CHECK-LABEL: @divrem24_assume(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[ARG1:%.*]], 42
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = uitofp i32 [[TMP]] to float
 
@@ -197,26 +197,26 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_mov_b32_e32 v2, s0
-; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; CI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; CI-NEXT:    v_mov_b32_e32 v2, s1
+; CI-NEXT:    v_mov_b32_e32 v3, s0
+; CI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; CI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: s_fabs_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
-; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fff7fff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fff7fff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: s_fabs_v4f16:
 
@@ -115,13 +115,13 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_bitset0_b32 s3, 31
-; VI-NEXT:    s_bitset0_b32 s2, 31
-; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT:    s_and_b32 s1, s2, 0x7fffffff
+; VI-NEXT:    v_mov_b32_e32 v2, s1
+; VI-NEXT:    v_mov_b32_e32 v3, s0
+; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; VI-NEXT:    s_endpgm
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, ptr addrspace(1) %out