llvm · Pierre-vh · Apr 25, 2025 · Apr 15, 2025 · Apr 16, 2025 · Apr 22, 2025
diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h
@@ -451,6 +451,9 @@ class TargetMachine {
   /// Entry point for module splitting. Targets can implement custom module
   /// splitting logic, mainly used by LTO for --lto-partitions.
   ///
+  /// On success, this guarantees that between 1 and \p NumParts modules were
+  /// created and passed to \p ModuleCallBack.
+  ///
   /// \returns `true` if the module was split, `false` otherwise. When  `false`
   /// is returned, it is assumed that \p ModuleCallback has never been called
   /// and \p M has not been modified.

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp
@@ -1478,6 +1478,10 @@ static void splitAMDGPUModule(
              << "' - Partition summaries will not be printed\n";
   }
 
+  // One module will import all GlobalValues that are not Functions
+  // and are not subject to conservative import.
+  bool ImportAllGVs = true;
+
   for (unsigned PID = 0; PID < NumParts; ++PID) {
     SplitModuleTimer SMT2("modules_creation",
                           "creating modules for each partition");
@@ -1487,6 +1491,13 @@ static void splitAMDGPUModule(
     for (unsigned NodeID : (*Proposal)[PID].set_bits())
       FnsInPart.insert(&SG.getNode(NodeID).getFunction());
 
+    // Don't create empty modules.
+    if (FnsInPart.empty()) {
+      LLVM_DEBUG(dbgs() << "[split] P" << PID
+                        << " is empty, not creating module\n");
+      continue;
+    }
+
-    for (unsigned NodeID : (*Proposal)[PID].set_bits())
-      FnsInPart.insert(&SG.getNode(NodeID).getFunction());
-
-    // Don't create empty modules.
-    if (FnsInPart.empty()) {
-      LLVM_DEBUG(dbgs() << "[split] P" << PID
-                        << " is empty, not creating module\n");
-      continue;
-    }
+    // Don't create empty modules.
+    if ((*Proposal)[PID].none()) {
+      LLVM_DEBUG(dbgs() << "[split] P" << PID
+                        << " is empty, not creating module\n");
+      continue;
+    }
+    for (unsigned NodeID : (*Proposal)[PID].set_bits())
+      FnsInPart.insert(&SG.getNode(NodeID).getFunction()); 
-    for (unsigned NodeID : (*Proposal)[PID].set_bits())
-      FnsInPart.insert(&SG.getNode(NodeID).getFunction());
-
-    // Don't create empty modules.
-    if (FnsInPart.empty()) {
-      LLVM_DEBUG(dbgs() << "[split] P" << PID
-                        << " is empty, not creating module\n");
-      continue;
-    }
+    // Don't create empty modules.
+    if ((*Proposal)[PID].none()) {
+      LLVM_DEBUG(dbgs() << "[split] P" << PID
+                        << " is empty, not creating module\n");
+      continue;
+    }
+    for (unsigned NodeID : (*Proposal)[PID].set_bits())
+      FnsInPart.insert(&SG.getNode(NodeID).getFunction()); 
     ValueToValueMapTy VMap;
     CostType PartCost = 0;
     std::unique_ptr<Module> MPart(
@@ -1500,10 +1511,12 @@ static void splitAMDGPUModule(
             return false;
           }
 
-          // Everything else goes in the first partition.
-          return needsConservativeImport(GV) || PID == 0;
+          // Everything else goes in the first non-empty module we create.
+          return ImportAllGVs || needsConservativeImport(GV);
         }));
 
+    ImportAllGVs = false;
+
     // FIXME: Aliases aren't seen often, and their handling isn't perfect so
     // bugs are possible.
 

diff --git a/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll b/llvm/test/tools/llvm-split/AMDGPU/address-taken-externalize-with-call.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; 3 kernels:
 ;   - A does a direct call to HelperA
@@ -11,14 +10,11 @@
 ; The helper functions will get externalized, so C/A will end up
 ; in the same partition.
 
-; P0 is empty.
-; CHECK0: declare
+; CHECK0: define amdgpu_kernel void @B(ptr %dst)
 
-; CHECK1: define amdgpu_kernel void @B(ptr %dst)
-
-; CHECK2: define hidden void @HelperA()
-; CHECK2: define amdgpu_kernel void @A()
-; CHECK2: define amdgpu_kernel void @C()
+; CHECK1: define hidden void @HelperA()
+; CHECK1: define amdgpu_kernel void @A()
+; CHECK1: define amdgpu_kernel void @C()
 
 define internal void @HelperA() {
   ret void

diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging-weak_odr.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
@@ -15,19 +14,18 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; P0 is empty
-; CHECK0: declare
+; Only two partitions created for the first command.
 
-; CHECK1: define internal void @HelperC()
-; CHECK1: define weak_odr amdgpu_kernel void @C
+; CHECK0: define internal void @HelperC()
+; CHECK0: define weak_odr amdgpu_kernel void @C
 
-; CHECK2: define internal void @large2()
-; CHECK2: define internal void @large1()
-; CHECK2: define internal void @large0()
-; CHECK2: define internal void @HelperA()
-; CHECK2: define internal void @HelperB()
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define weak_odr amdgpu_kernel void @B
+; CHECK1: define internal void @large2()
+; CHECK1: define internal void @large1()
+; CHECK1: define internal void @large0()
+; CHECK1: define internal void @HelperA()
+; CHECK1: define internal void @HelperB()
+; CHECK1: define amdgpu_kernel void @A
+; CHECK1: define weak_odr amdgpu_kernel void @B
 
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define weak_odr amdgpu_kernel void @C

diff --git a/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll b/llvm/test/tools/llvm-split/AMDGPU/large-kernels-merging.ll
@@ -1,7 +1,6 @@
 ; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
 ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
 ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
-; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 --implicit-check-not=define %s
 
 ; RUN: llvm-split -o %t.nolarge %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-large-threshold=0 -amdgpu-module-splitting-max-depth=0
 ; RUN: llvm-dis -o - %t.nolarge0 | FileCheck --check-prefix=NOLARGEKERNELS-CHECK0 --implicit-check-not=define %s
@@ -15,19 +14,18 @@
 ; Also check w/o large kernels processing to verify they are indeed handled
 ; differently.
 
-; P0 is empty
-; CHECK0: declare
+; Only 2 partitions for the first command.
 
-; CHECK1: define internal void @HelperC()
-; CHECK1: define amdgpu_kernel void @C
+; CHECK0: define internal void @HelperC()
+; CHECK0: define amdgpu_kernel void @C
 
-; CHECK2: define internal void @large2()
-; CHECK2: define internal void @large1()
-; CHECK2: define internal void @large0()
-; CHECK2: define internal void @HelperA()
-; CHECK2: define internal void @HelperB()
-; CHECK2: define amdgpu_kernel void @A
-; CHECK2: define amdgpu_kernel void @B
+; CHECK1: define internal void @large2()
+; CHECK1: define internal void @large1()
+; CHECK1: define internal void @large0()
+; CHECK1: define internal void @HelperA()
+; CHECK1: define internal void @HelperB()
+; CHECK1: define amdgpu_kernel void @A
+; CHECK1: define amdgpu_kernel void @B
 
 ; NOLARGEKERNELS-CHECK0: define internal void @HelperC()
 ; NOLARGEKERNELS-CHECK0: define amdgpu_kernel void @C

diff --git a/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll b/llvm/test/tools/llvm-split/AMDGPU/preserve-globals.ll
@@ -0,0 +1,21 @@
+; RUN: llvm-split -o %t %s -j 3 -mtriple amdgcn-amd-amdhsa -amdgpu-module-splitting-max-depth=0 -amdgpu-module-splitting-large-threshold=1.2 -amdgpu-module-splitting-merge-threshold=0.5
+; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 --implicit-check-not=define %s
+; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 --implicit-check-not=define %s
+
+; Only 2 out of 3 partitions are created, check the external global is preserved in the first partition.
+
+; CHECK0: @foobar = linkonce_odr global i64 52
+; CHECK0: define amdgpu_kernel void @B
+
+; CHECK1-NOT: @foobar = linkonce_odr global i64 52
+; CHECK1: define amdgpu_kernel void @A
+
+@foobar = linkonce_odr global i64 52
+
+define amdgpu_kernel void @A() {
+  ret void
+}
+
+define amdgpu_kernel void @B() {
+  ret void
+}