Preserve metadata and address other review comments

gandhi56 · gandhi56 · commit 884231351e93 · 2025-04-28T10:04:52.000-04:00
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1319,23 +1319,36 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
   if (EQClasses.size() < 2)
     return;
 
+  auto CopyMetaDataFromTo = [&](Instruction *Src, Instruction *Dst) {
+    SmallVector<std::pair<unsigned, MDNode *>, 4> MD;
+    Src->getAllMetadata(MD);
+    for (const auto [ID, Node] : MD) {
+      Dst->setMetadata(ID, Node);
+    }
+  };
+
   // For each class, determine if all instructions are of type int, FP or ptr.
   // This information will help us determine the type instructions should be
   // casted into.
   MapVector<EqClassKey, Bitset<3>> ClassAllTy;
-  for (auto C : EQClasses) {
-    if (all_of(EQClasses[C.first], [](Instruction *I) {
-          return I->getType()->isIntOrIntVectorTy();
-        }))
-      ClassAllTy[C.first].set(0);
-    else if (all_of(EQClasses[C.first], [](Instruction *I) {
-               return I->getType()->isFPOrFPVectorTy();
-             }))
-      ClassAllTy[C.first].set(1);
-    else if (all_of(EQClasses[C.first], [](Instruction *I) {
-               return I->getType()->isPtrOrPtrVectorTy();
-             }))
-      ClassAllTy[C.first].set(2);
+  for (const auto &C : EQClasses) {
+    auto CommonTypeKind = [](Instruction *I) {
+      if (I->getType()->isIntOrIntVectorTy())
+        return 0;
+      if (I->getType()->isFPOrFPVectorTy())
+        return 1;
+      if (I->getType()->isPtrOrPtrVectorTy())
+        return 2;
+      return -1; // Invalid type kind
+    };
+
+    int FirstTypeKind = CommonTypeKind(EQClasses[C.first][0]);
+    if (FirstTypeKind != -1 &&
+      all_of(EQClasses[C.first], [&](Instruction *I) {
+        return CommonTypeKind(I) == FirstTypeKind;
+      })) {
+      ClassAllTy[C.first].set(FirstTypeKind);
+    }
   }
 
   // Loop over all equivalence classes and try to merge them. Keep track of
@@ -1359,6 +1372,11 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
       if (Ptr1 != Ptr2 || AS1 != AS2 || IsLoad1 != IsLoad2 || TySize1 < TySize2)
         continue;
 
+      // Skip if EC1 is all-FP but EC2 is not, or EC2 is all-ptr but EC1 is not.
+      if ((ClassAllTy[EC1.first].test(1) && !ClassAllTy[EC2.first].test(1)) ||
+          (!ClassAllTy[EC1.first].test(2) && ClassAllTy[EC2.first].test(2)))
+        continue;
+
       // Ensure all instructions in EC2 can be bitcasted into NewTy.
       /// TODO: NewTyBits is needed as stuctured binded variables cannot be
       /// captured by a lambda until C++20.
@@ -1384,8 +1402,8 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
       }
 
       for (auto *Inst : EC2.second) {
-        auto *Ptr = getLoadStorePointerOperand(Inst);
-        auto *OrigTy = Inst->getType();
+        Value *Ptr = getLoadStorePointerOperand(Inst);
+        Type *OrigTy = Inst->getType();
         if (OrigTy == NewTy)
           continue;
         if (auto *LI = dyn_cast<LoadInst>(Inst)) {
@@ -1404,6 +1422,7 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
               SI->getValueOperand()->getName() + ".cast");
           auto *NewStore = Builder.CreateStore(
               Cast, getLoadStorePointerOperand(SI), SI->isVolatile());
+          CopyMetaDataFromTo(SI, NewStore);
           SI->eraseFromParent();
           EQClasses[EC1.first].emplace_back(NewStore);
         }
@@ -1413,7 +1432,7 @@ void Vectorizer::insertCastsToMergeClasses(EquivalenceClassMap &EQClasses) {
       // basic block. This is important to ensure that the instructions are
       // vectorized in the correct order.
       std::sort(EQClasses[EC1.first].begin(), EQClasses[EC1.first].end(),
-                [](Instruction *A, Instruction *B) {
+                [](const Instruction *A, const Instruction *B) {
                   return A && B && A->comesBefore(B);
                 });
       ClassesToErase.insert(EC2.first);
diff --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -113,10 +113,15 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: and_and_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-SDAG-LABEL: and_and_and:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: and_and_and:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
   %and1 = and i32 %a, %c
   %and2 = and i32 %and1, %b
   %ret_cast = bitcast i32 %and2 to float
@@ -126,10 +131,15 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
 ; ========= Multi bit functions =========
 
 define amdgpu_ps float @test_12(i32 %a, i32 %b) {
-; GCN-LABEL: test_12:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-SDAG-LABEL: test_12:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX950-SDAG-NEXT:    ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: test_12:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX950-GISEL-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %and1 = and i32 %nota, %b
   %ret_cast = bitcast i32 %and1 to float
diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll
@@ -48,8 +48,8 @@ define i32 @v_or_i32_disjoint(i32 %a, i32 %b) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
+  ; CHECK-NEXT:   %10:vgpr_32 = disjoint V_OR_B32_e64 [[COPY1]], [[COPY]], implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = COPY %10
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
   %result = or disjoint i32 %a, %b
   ret i32 %result
@@ -64,10 +64,10 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) {
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; CHECK-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
-  ; CHECK-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
-  ; CHECK-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
-  ; CHECK-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
+  ; CHECK-NEXT:   %12:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec
+  ; CHECK-NEXT:   %13:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec
+  ; CHECK-NEXT:   $vgpr0 = COPY %12
+  ; CHECK-NEXT:   $vgpr1 = COPY %13
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1
   %result = or disjoint <2 x i32> %a, %b
   ret <2 x i32> %result
diff --git a/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll b/llvm/test/CodeGen/AMDGPU/divrem24-assume.ll
@@ -4,7 +4,7 @@
 define amdgpu_kernel void @divrem24_assume(ptr addrspace(1) %arg, i32 %arg1) {
 ; CHECK-LABEL: @divrem24_assume(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[ARG1:%.*]], 42
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[TMP2]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = uitofp i32 [[TMP]] to float
diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll
@@ -9,8 +9,8 @@
 ; Verify that loads with different memory types are not subject to CSE
 ; once they are promoted to the same type.
 ;
-; CHECK: ld.global.u16  %[[B1:rs[0-9]+]], [a];
-; CHECK: st.global.u16  [b], %[[B1]];
+; CHECK: ld.global.v2.u8  {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
+; CHECK: st.global.v2.u8  [b], {%[[B1]], %[[B2]]};
 ;
 ; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a];
 ; CHECK: st.global.u32 [c], %[[C]];

Original file line number	Diff line number	Diff line change
`@@ -9,8 +9,8 @@`
`9`	`9`	`; Verify that loads with different memory types are not subject to CSE`
`10`	`10`	`; once they are promoted to the same type.`
`11`	`11`	`;`
`12`		`-; CHECK: ld.global.u16 %[[B1:rs[0-9]+]], [a];`
`13`		`-; CHECK: st.global.u16 [b], %[[B1]];`
	`12`	`+; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];`
	`13`	`+; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]};`
`14`	`14`	`;`
`15`	`15`	`; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a];`
`16`	`16`	`; CHECK: st.global.u32 [c], %[[C]];`