
Commit 9a4f32d

Qualcomm AI Engine Direct - Add source transform for kv cache and sdpa
Differential Revision: D60913232
Pull Request resolved: #4555
1 parent 5c27045 commit 9a4f32d

File tree: 8 files changed, +279 −28 lines changed


backends/qualcomm/passes/replace_inf_buffer.py

Lines changed: 6 additions & 2 deletions
@@ -8,14 +8,18 @@


 class ReplaceInfBuffer(ExportPass):
+    """
+    Due to a limitation in QNN, replace inf or -inf with an arbitrary finite value for quantization.
+    """
+
     def __init__(self):
         super(ReplaceInfBuffer, self).__init__()

     def call(self, graph_module: torch.fx.GraphModule):
         for buf_name, tensor in graph_module.named_buffers():
             if tensor.is_floating_point():
-                tensor[tensor == float("inf")] = torch.finfo(torch.float32).max
-                tensor[tensor == float("-inf")] = torch.finfo(torch.float32).min
+                tensor[tensor == float("inf")] = 255
+                tensor[tensor == float("-inf")] = -255
                 setattr(graph_module, buf_name, tensor)

         graph_module.recompile()
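
In the QNN flow this pass runs inside _transform() with the other graph passes (see backends/qualcomm/utils/utils.py below). A standalone sketch of its effect, assuming only the pass itself and a toy module with a -inf mask buffer:

import torch
from torch.fx import symbolic_trace

from executorch.backends.qualcomm.passes.replace_inf_buffer import ReplaceInfBuffer


class Mask(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # attention-style mask buffer containing a -inf entry
        self.register_buffer("mask", torch.tensor([0.0, float("-inf")]))

    def forward(self, x):
        return x + self.mask


gm = symbolic_trace(Mask())
ReplaceInfBuffer().call(gm)
print(gm.mask)  # the -inf entry is now -255; the 0.0 entry is untouched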
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import Sequence
+
+import torch
+from executorch.backends.qualcomm.quantizer.quantizer import (
+    get_16a8w_qnn_ptq_config,
+    get_default_8bit_qnn_ptq_config,
+    QuantizationConfig,
+)
+from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY
+from torch.ao.quantization.quantizer import (
+    QuantizationAnnotation,
+    SharedQuantizationSpec,
+)
+from torch.fx import Node
+
+
+def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
+    """
+    This function is specific for llama matmul op 16a8w.
+    """
+
+    def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_spec = quantization_config.input_activation
+        input_qspec_map[input_act] = input_spec
+        input_act1 = node.args[1]
+        input_spec1 = quantization_config.weight
+        input_qspec_map[input_act1] = input_spec1
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    def annotate_index_put(node: Node, quantization_config: QuantizationConfig) -> None:
+        input = node.args[0]
+        value = node.args[2]
+        input_qspec_map = {}
+        input_qspec_map[input] = quantization_config.input_activation
+        input_qspec_map[value] = SharedQuantizationSpec((input, node))
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=SharedQuantizationSpec((input, node)),
+            _annotated=True,
+        )
+
+    def annotate_single_in_single_out(
+        node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_qspec_map[input_act] = quantization_config.input_activation
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    def annotate_cat(node: Node, quantization_config: QuantizationConfig):
+        input_nodes = node.args[0]
+        assert isinstance(input_nodes, Sequence)
+        first_input_node = input_nodes[0]
+        input_qspec_map = {}
+        assert isinstance(first_input_node, Node)
+        assert isinstance(node, Node)
+        input_qspec_map[first_input_node] = quantization_config.input_activation
+        share_qparams_with_input_act0_qspec = SharedQuantizationSpec(
+            (first_input_node, node)
+        )
+        for input_node in input_nodes[1:]:
+            if input_node not in input_qspec_map:
+                assert isinstance(input_node, Node)
+                input_qspec_map[input_node] = share_qparams_with_input_act0_qspec
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=share_qparams_with_input_act0_qspec,
+            _annotated=True,
+        )
+
+    def is_edge_condition(node: Node):
+        if not isinstance(node, Node) or node.op != "call_function":
+            return True
+        return False
+
+    def annotate_matmul_input1(node: Node, quantization_config: QuantizationConfig):
+        if is_edge_condition(node):
+            return
+        if node.target == torch.ops.aten.index_put_.default:
+            annotate_index_put(node, quantization_config)
+            annotate_matmul_input1(node.args[0], quantization_config)
+        elif node.target == torch.ops.aten.cat.default:
+            annotate_cat(node, quantization_config)
+            # Expect that the inputs of the cat op are select ops
+            for arg in node.args[0][1:]:
+                annotate_single_in_single_out(arg, quantization_config)
+            annotate_matmul_input1(node.args[0][0], quantization_config)
+        else:
+            annotate_single_in_single_out(node, quantization_config)
+            annotate_matmul_input1(node.args[0], quantization_config)
+
+    # Annotate 16a8w for matmul op to get better performance
+    quantization_config_16a8w = get_16a8w_qnn_ptq_config()
+    # Annotate 8a8w for second input of matmul until past_kv_cache
+    quantization_config_8a8w = get_default_8bit_qnn_ptq_config(act_symmetric=True)
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
+            if "nn_module_stack" in node.meta:
+                module_values_list = list(node.meta["nn_module_stack"].values())
+                full_qualified_name = module_values_list[-1][0]
+                if "SDPA" in full_qualified_name:
+                    annotate_matmul(node, quantization_config_16a8w)
+                    annotate_matmul_input1(node.args[1], quantization_config_8a8w)
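
A hedged sketch of how this annotator could be wired into the PT2E flow. The QnnQuantizer hook name (add_custom_quant_annotations), the capture call, and the model/input names are assumptions about the surrounding API, not part of this diff:

import torch
from executorch.backends.qualcomm.quantizer.quantizer import QnnQuantizer
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

quantizer = QnnQuantizer()
# Assumption: the quantizer exposes a hook for extra annotation callbacks.
quantizer.add_custom_quant_annotations((custom_annotate_llama_matmul_16a8w,))

# `llama_model` / `example_inputs` are placeholders for the Llama module and its inputs.
captured = torch.export.export(llama_model, example_inputs).module()
prepared = prepare_pt2e(captured, quantizer)  # SDPA matmuls get 16a8w, their KV-cache path 8a8w
prepared(*example_inputs)                     # calibration
quantized = convert_pt2e(prepared)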

backends/qualcomm/utils/utils.py

Lines changed: 7 additions & 6 deletions
@@ -206,6 +206,13 @@ def _transform(edge_program: ExportedProgram) -> None:
     FoldQDQ()(graph_module)
     LayoutTransform(edge_program)(graph_module)

+    # Since QDQ nodes are stripped, update graph signature again to validate program
+    edge_program._graph_signature = _get_updated_graph_signature(
+        edge_program.graph_signature,
+        edge_program.graph_module,
+    )
+    edge_program._validate()
+

 def capture_program(
     module: torch.nn.Module,
@@ -222,12 +229,6 @@ def capture_program(
     core_ep.transform(ConvertBinaryOpsWithScalar())
     edge_ep = core_ep.to_edge(qnn_edge_config())
     _transform(edge_ep.exported_program)
-    # Since QDQ nodes are stripped, update graph signature again to validate program
-    edge_ep.exported_program._graph_signature = _get_updated_graph_signature(
-        edge_ep.exported_program.graph_signature,
-        edge_ep.exported_program.graph_module,
-    )
-    edge_ep.exported_program._validate()
     return edge_ep
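
With the signature update and validation moved into _transform, every caller of _transform now gets a validated program, not only capture_program. A minimal usage sketch; the second argument (example inputs) is an assumption, since capture_program's full signature is truncated in this hunk:

# `model` and `sample_inputs` are placeholders for a module and its example inputs.
edge_prog = capture_program(model, sample_inputs)
# The exported program has already been transformed and re-validated inside _transform.
print(edge_prog.exported_program.graph_signature)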

examples/models/llama2/export_llama_lib.py

Lines changed: 8 additions & 1 deletion
@@ -52,7 +52,9 @@
 from .source_transformation.rope import materialze_broadcast_of_rope_freq_cis
 from .source_transformation.sdpa import (
     replace_causal_mask,
+    replace_kv_cache_with_simple_kv_cache,
     replace_sdpa_with_custom_op,
+    replace_sdpa_with_flex_sdpa,
     replace_sdpa_with_simple_sdpa,
 )

@@ -385,7 +387,12 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
         transforms.append(replace_sdpa_with_custom_op)

     if args.use_kv_cache:
-        if args.qnn or args.coreml or args.mps:
+        if args.qnn:
+            transforms.append(replace_kv_cache_with_simple_kv_cache)
+            transforms.append(replace_sdpa_with_flex_sdpa)
+            transforms.append(replace_causal_mask)
+
+        elif args.coreml or args.mps:
             # Currently qnn/coreml/mps doesn't support sdpa op, use the simpler decomposition
             # to get free perf gain.
             transforms.append(replace_sdpa_with_simple_sdpa)
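
Each replace_* helper appended here takes the model and returns it rewritten (see source_transformation/sdpa.py below), so the export manager can fold the list in order. An illustrative sketch of that folding; the loader is hypothetical:

model = load_llama_model()  # hypothetical loader standing in for the export flow's model
for transform in (
    replace_kv_cache_with_simple_kv_cache,
    replace_sdpa_with_flex_sdpa,
    replace_causal_mask,
):
    model = transform(model)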

examples/models/llama2/llama_transformer.py

Lines changed: 3 additions & 0 deletions
@@ -161,6 +161,9 @@ def __init__(
         else:
             cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)

+        self.max_batch_size = max_batch_size
+        self.n_heads = n_heads
+        self.head_dim = head_dim
         self.transpose_cache = transpose_cache
         self.enable_dynamic_shape = enable_dynamic_shape
         self.register_buffer(

examples/models/llama2/source_transformation/sdpa.py

Lines changed: 121 additions & 0 deletions
@@ -9,6 +9,7 @@
 # Example script for exporting Llama2 to flatbuffer

 import math
+from typing import Tuple

 import torch

@@ -112,6 +113,61 @@ def forward(
         return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)


+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    if n_rep == 1:
+        return hidden_states
+
+    new_kv = []
+    batch, n_heads, seqlen, head_dim = hidden_states.shape
+    n_heads *= n_rep
+    for h in hidden_states[0]:
+        new_kv += [h] * n_rep
+    return torch.cat(new_kv, 0).reshape(batch, n_heads, seqlen, head_dim)
+
+
+class SDPAFlex(torch.nn.Module):
+
+    def __init__(
+        self,
+        kv_cache: KVCache,
+        dim: int,
+        n_rep: int,
+    ):
+        super().__init__()
+        self.kv_cache = kv_cache
+        self.dim = dim
+        self.n_rep = n_rep
+
+    def forward(
+        self,
+        input_pos: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        bsz,
+        seqlen,
+        mask,
+    ):
+        q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+
+        k, v = self.kv_cache.update(input_pos, k, v)
+        k = repeat_kv(k, self.n_rep)
+        v = repeat_kv(v, self.n_rep)
+        attn_mask = mask[input_pos]
+
+        scale_factor = 1 / math.sqrt(q.size(-1))
+        attn_weight = q @ k.transpose(-2, -1) * scale_factor
+        attn_weight += attn_mask
+        attn_weight = torch.softmax(attn_weight, dim=-1)
+        y = attn_weight @ v
+
+        return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
+
+
 def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
     for name, child in module.named_children():
         if isinstance(child, SDPA):
@@ -125,6 +181,71 @@ def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
     return module


+def replace_sdpa_with_flex_sdpa(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, SDPA):
+            setattr(
+                module,
+                name,
+                SDPAFlex(child.kv_cache, child.dim, child.n_rep),
+            )
+        else:
+            replace_sdpa_with_flex_sdpa(child)
+    return module
+
+
+class KVCacheSimple(torch.nn.Module):
+    def __init__(
+        self,
+        max_batch_size: int,
+        max_seq_length: int,
+        n_heads: int,
+        head_dim: int,
+        dtype=torch.float32,
+    ):
+        super().__init__()
+        cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
+        self.register_buffer(
+            "past_k_caches",
+            torch.zeros(cache_shape, dtype=dtype, device="cpu"),
+            persistent=False,
+        )
+        self.register_buffer(
+            "past_v_caches",
+            torch.zeros(cache_shape, dtype=dtype, device="cpu"),
+            persistent=False,
+        )
+
+    def update(
+        self, input_pos: torch.Tensor, k_val: torch.Tensor, v_val: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        k_out = torch.ops.aten.index_put_(self.past_k_caches, [None, input_pos], k_val)
+        v_out = torch.ops.aten.index_put_(self.past_v_caches, [None, input_pos], v_val)
+
+        k_out = k_out.transpose(1, 2)
+        v_out = v_out.transpose(1, 2)
+        return k_out, v_out
+
+
+def replace_kv_cache_with_simple_kv_cache(module: torch.nn.Module):
+    for name, child in module.named_children():
+        if isinstance(child, KVCache):
+            setattr(
+                module,
+                name,
+                KVCacheSimple(
+                    child.max_batch_size,
+                    child.max_seq_length,
+                    child.n_heads,
+                    child.head_dim,
+                    child.k_cache.dtype,
+                ),
+            )
+        else:
+            replace_kv_cache_with_simple_kv_cache(child)
+    return module
+
+
 def replace_causal_mask(module: torch.nn.Module):
     for buffer_fqn_name, buffer in module.named_buffers():
         buffer_name = buffer_fqn_name.split(".")[-1]
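
A quick sanity check (not part of the commit) of the claim in repeat_kv's docstring: for the batch-size-1 caches this path targets, the hand-rolled expansion matches torch.repeat_interleave along the head dimension:

import torch

x = torch.randn(1, 8, 16, 64)  # (batch=1, n_kv_heads, seqlen, head_dim)
assert torch.equal(repeat_kv(x, n_rep=4), torch.repeat_interleave(x, repeats=4, dim=1))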

extension/llm/export/partitioner_lib.py

Lines changed: 0 additions & 13 deletions
@@ -116,9 +116,6 @@ def get_qnn_partitioner(
         QnnPartitioner,
     )

-    # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.quantizer.quantizer`
-    from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
-
     # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.serialization.qnn_compile_spec_schema`
     from executorch.backends.qualcomm.serialization.qnn_compile_spec_schema import (
         QcomChipset,
@@ -138,16 +135,6 @@ def get_qnn_partitioner(
     skip_node_op_set = {}
     if pt2e_quantize is not None:
         use_fp16 = False
-        # TODO: fix the lowering error without skipping nodes
-
-        if quant_dtype == QuantDtype.use_8a8w:
-            raise NotImplementedError("8a8w for llama is still under development")
-
-        elif quant_dtype == QuantDtype.use_16a16w:
-            raise NotImplementedError("16a16w for llama is still under development")
-
-        elif quant_dtype == QuantDtype.use_16a4w:
-            raise NotImplementedError("16a4w for llama is still under development")

     return QnnPartitioner(
         generate_qnn_executorch_compiler_spec(
