PR #22029: [XLA:GPU] Add support for SM101a and SM120a architectures (Blackwell)

sergey-kozub · Google-ML-Automation · commit 9c9cb93bc48a · 2025-01-29T11:41:16.000-08:00
Imported from GitHub PR #22029. In addition to SM120a, this change adds SM101a, which is mentioned in the PTX 8.7 spec (https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes) and is a slight variation of SM100a. It also bumps the max supported PTX version to 8.7, as the LLVM PR (llvm/llvm-project#124155) adding the support is now integrated into OpenXLA. Copybara import of the project: -- be59b7a by Sergey Kozub <skozub@nvidia.com>: [XLA:GPU] Add support for SM101a and SM120a architectures (Blackwell) Merging this change closes #22029 FUTURE_COPYBARA_INTEGRATE_REVIEW=#22029 from openxla:devel/sm120a be59b7a PiperOrigin-RevId: 721049239
diff --git a/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc b/xla/service/gpu/llvm_gpu_backend/nvptx_backend.cc
@@ -238,8 +238,8 @@ std::string GetSmName(se::CudaComputeCapability compute_capability) {
   int sm_version = 30;
   // If the current compute capability isn't known, fallback to the
   // most recent version before it.
-  int supported_versions[] = {100, 90, 89, 87, 86, 80, 75, 72, 70, 62,
-                              61,  60, 53, 52, 50, 37, 35, 32, 30};
+  int supported_versions[] = {120, 101, 100, 90, 89, 87, 86, 80, 75, 72, 70,
+                              62,  61,  60,  53, 52, 50, 37, 35, 32, 30};
   for (int v : supported_versions) {
     if (v <= compute_capability_version) {
       sm_version = v;
@@ -261,7 +261,7 @@ std::string GetSmName(se::CudaComputeCapability compute_capability) {
   // On Hopper, default to sm_90a so that all instructions can be used. But
   // only sm_90 is forward compatible, so don't use sm_90a with newer hardware:
   // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#ptx-compatibility
-  // Similarly for sm_100a (Blackwell).
+  // Similarly for sm_100a, sm_101a and sm_120a (Blackwell).
   absl::string_view extension =
       stream_executor::ShouldUsePtxExtension(compute_capability) ? "a" : "";
   return absl::StrCat("sm_", sm_version, extension);
@@ -333,7 +333,7 @@ absl::StatusOr<std::string> CompileToPtx(
 
 namespace {
 constexpr stream_executor::SemanticVersion kFallbackPtxVersion{6, 5, 0};
-constexpr stream_executor::SemanticVersion kMaxPtxVersion{8, 6, 0};
+constexpr stream_executor::SemanticVersion kMaxPtxVersion{8, 7, 0};
 }  // namespace
 
 stream_executor::SemanticVersion
@@ -357,7 +357,8 @@ DetermineHighestSupportedPtxVersionFromCudaVersion(
     return {cuda_version.major() - 4, cuda_version.minor(), 0};
   }
   // CUDA 12.6 -> PTX 8.5
-  if (cuda_version < stream_executor::SemanticVersion{12, 7, 0}) {
+  // CUDA 12.8 -> PTX 8.7
+  if (cuda_version < stream_executor::SemanticVersion{12, 9, 0}) {
     return {cuda_version.major() - 4, cuda_version.minor() - 1, 0};
   }
 
diff --git a/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc b/xla/service/gpu/llvm_gpu_backend/nvptx_backend_test.cc
@@ -29,9 +29,11 @@ namespace se = ::stream_executor;
 TEST(UtilsTest, TestGetSmName) {
   ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{9, 0}), "sm_90a");
   ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{10, 0}), "sm_100a");
+  ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{10, 1}), "sm_101a");
+  ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{12, 0}), "sm_120a");
   // Do not use the extension for a yet-unknown compute capability.
   // https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes-ptx-release-history
-  ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{10, 9}), "sm_100");
+  ASSERT_EQ(nvptx::GetSmName(se::CudaComputeCapability{12, 9}), "sm_120");
 }
 
 using VersionPair = std::pair<se::SemanticVersion, se::SemanticVersion>;
@@ -63,6 +65,7 @@ INSTANTIATE_TEST_SUITE_P(VersionTest, PtxVersionFromCudaVersionTest,
                              {{12, 4, 0}, {8, 4, 0}},
                              {{12, 5, 0}, {8, 5, 0}},
                              {{12, 6, 0}, {8, 5, 0}},
+                             {{12, 8, 0}, {8, 7, 0}},
                          }),
                          [](::testing::TestParamInfo<VersionPair> data) {
                            se::SemanticVersion cuda_version = data.param.first;
diff --git a/xla/stream_executor/cuda/ptx_compiler_helpers.cc b/xla/stream_executor/cuda/ptx_compiler_helpers.cc
@@ -101,10 +101,12 @@ void WarnIfBadPtxasVersion(absl::string_view method,
   });
 }
 
-// The extension is used for compute capabilities 9.0 and 10.0.
+// The extension is used for compute capabilities 9.0, 10.0, 10.1 and 12.0.
 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#ptx-compatibility
 bool ShouldUsePtxExtension(const CudaComputeCapability& cc) {
-  return (cc.major == 9 && cc.minor == 0) || (cc.major == 10 && cc.minor == 0);
+  return (cc.major == 9 && cc.minor == 0) ||
+         (cc.major == 10 && (cc.minor == 0 || cc.minor == 1)) ||
+         (cc.major == 12 && cc.minor == 0);
 }
 
 }  // namespace stream_executor

Original file line number	Diff line number	Diff line change
`@@ -101,10 +101,12 @@ void WarnIfBadPtxasVersion(absl::string_view method,`
`101`	`101`	`});`
`102`	`102`	`}`
`103`	`103`
`104`		`-// The extension is used for compute capabilities 9.0 and 10.0.`
	`104`	`+// The extension is used for compute capabilities 9.0, 10.0, 10.1 and 12.0.`
`105`	`105`	`// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#ptx-compatibility`
`106`	`106`	`bool ShouldUsePtxExtension(const CudaComputeCapability& cc) {`
`107`		`- return (cc.major == 9 && cc.minor == 0) \|\| (cc.major == 10 && cc.minor == 0);`
	`107`	`+ return (cc.major == 9 && cc.minor == 0) \|\|`
	`108`	`+ (cc.major == 10 && (cc.minor == 0 \|\| cc.minor == 1)) \|\|`
	`109`	`+ (cc.major == 12 && cc.minor == 0);`
`108`	`110`	`}`
`109`	`111`
`110`	`112`	`} // namespace stream_executor`