Commit 989b692

eqy authored and timocafe committed
[cuDNN][SDPA] Loosen constraints for GQA for cuDNN Attention (pytorch#150337)
cuDNN attention doesn't require key and value tensors to have the same number of heads.

Pull Request resolved: pytorch#150337
Approved by: https://github.com/drisspg
1 parent 5fca74e commit 989b692
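
As a rough sketch (not part of the commit), the call pattern this change enables looks like the following. It assumes a PyTorch build that includes this change, a GPU/cuDNN combination supported by the cuDNN SDPA backend, and illustrative shape values:

import torch
from torch.nn.attention import sdpa_kernel, SDPBackend
from torch.nn.functional import scaled_dot_product_attention

# Query has 32 heads, key has 8, value has 4: h_k != h_v is now accepted by cuDNN.
batch, seq_len_q, seq_len_kv, D = 4, 64, 128, 64  # illustrative sizes
query = torch.rand(batch, 32, seq_len_q, D, device='cuda', dtype=torch.bfloat16)
key = torch.rand(batch, 8, seq_len_kv, D, device='cuda', dtype=torch.bfloat16)
value = torch.rand(batch, 4, seq_len_kv, D, device='cuda', dtype=torch.bfloat16)

with sdpa_kernel([SDPBackend.CUDNN_ATTENTION]):
    out = scaled_dot_product_attention(query, key, value, is_causal=True, enable_gqa=True)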

3 files changed (+12, -8 lines)


aten/src/ATen/native/transformers/cuda/sdp_utils.cpp (+3, -2)
@@ -553,9 +553,10 @@ bool check_for_nested_inputs(sdp_params const& params, bool debug) {
       TORCH_WARN("Experimental cuDNN SDPA nested tensor support is not enabled.");
     }
     return false;
-  } else if (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad()) {
+  } else if (has_for_nested_inputs(params) && (params.query.requires_grad() || params.key.requires_grad() || params.value.requires_grad())) {
     if (debug) {
       TORCH_WARN("Experimental cuDNN SDPA nested tensor support does not support backward.");
+      return false;
     }
   }

@@ -645,7 +646,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
   constexpr auto dense_constraints =
       c10::array_of<bool (*)(sdp_params const&, bool)>(
           check_last_dim_stride_equals_1_dense<true /*ignore_singleton_dim=*/>,
-          check_batch_size_and_num_heads_dense<true /*enable_gqa*/>
+          check_batch_size_and_num_heads_dense<true /*enable_gqa*/, false /*requires_same_num_heads*/>
       );

   if (has_only_dense_inputs(params)) {

aten/src/ATen/native/transformers/sdp_utils_cpp.h (+7, -5)
@@ -333,13 +333,14 @@ inline bool check_safe_kv_broadcast(at::Tensor const& param, bool debug) {
   return true;
 }

+template <bool requires_same_num_heads=true>
 inline bool check_grouped_query_attention(sdp_params const& params, bool debug) {
   const auto q_num_heads = params.query.sym_size(-3);
   const auto k_num_heads = params.key.sym_size(-3);
   const auto v_num_heads = params.value.sym_size(-3);
   const bool same_kv_heads = k_num_heads == v_num_heads;

-  if (!(same_kv_heads)){
+  if (requires_same_num_heads && !(same_kv_heads)){
     if (debug) {
       TORCH_WARN(
           "Both fused kernels require key and value to have the same num_heads and batch_size but got: ",
@@ -355,10 +356,10 @@ inline bool check_grouped_query_attention(sdp_params const& params, bool debug)
   }
   // Check if grouped query attention is supported and validate the number of
   // heads
-  if (q_num_heads % k_num_heads != 0) {
+  if (q_num_heads % k_num_heads != 0 || (!requires_same_num_heads && (q_num_heads % v_num_heads != 0))) {
     if (debug) {
       TORCH_WARN(
-          "FlashAttentionV2 only supports grouped query attention, where the number of heads in key/value must divide number of heads in query.",
+          "The number of heads in key/value must divide number of heads in query.",
           "Got input Key sizes(): ",
           params.key.sym_size(-3),
           ", Value sizes(): ",
@@ -372,7 +373,7 @@ inline bool check_grouped_query_attention(sdp_params const& params, bool debug)
   return true;
 }

-template <bool supports_gqa>
+template <bool supports_gqa, bool requires_same_num_heads=true>
 inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool debug) {
   // This is expected to be called after check_tensor_shapes ensuring that the
   // size() calls won't error since the inputs are all 4 dimensional
@@ -407,9 +408,10 @@ inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool
   }

   if(params.enable_gqa && supports_gqa){
-    return check_grouped_query_attention(params, debug);
+    return check_grouped_query_attention<requires_same_num_heads>(params, debug);
   }

+  // same num heads condition for non-gqa case
   if (!same_num_heads){
     if (debug) {
       TORCH_WARN(
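
For clarity, the head-count rule the relaxed check enforces can be summarized with a small Python sketch. This is an approximation of the C++ helper above, not the actual PyTorch code, and heads_compatible is a hypothetical name:

def heads_compatible(q_heads, k_heads, v_heads, requires_same_num_heads=True):
    # When the backend requires matching KV heads (e.g. the other fused kernels),
    # key and value must have the same head count; otherwise each only needs to
    # divide the query head count (the cuDNN case after this change).
    if requires_same_num_heads and k_heads != v_heads:
        return False
    if q_heads % k_heads != 0:
        return False
    if not requires_same_num_heads and q_heads % v_heads != 0:
        return False
    return True

# Shapes from the test below: 32 query heads, 8 key heads, 4 value heads.
assert heads_compatible(32, 8, 4, requires_same_num_heads=False)     # accepted for cuDNN
assert not heads_compatible(32, 8, 4, requires_same_num_heads=True)  # rejected when KV heads must match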

test/test_transformers.py (+2, -1)
@@ -2479,7 +2479,8 @@ def test_cudnn_attention_gqa(self, device):
         # Sample call to SDPA - GQ
         query = torch.rand(batch, 32, seq_len_q, D, device='cuda', dtype=torch.bfloat16)
         key = torch.rand(batch, 8, seq_len_kv, D, device='cuda', dtype=torch.bfloat16)
-        value = torch.rand(batch, 8, seq_len_kv, D, device='cuda', dtype=torch.bfloat16)
+        # cuDNN supports h_k != h_v
+        value = torch.rand(batch, 4, seq_len_kv, D, device='cuda', dtype=torch.bfloat16)
         with sdpa_kernel([SDPBackend.MATH]):
             output_math = scaled_dot_product_attention(query, key, value, is_causal=True, enable_gqa=True)
