
Commit 206cde5

Merge branch 'main' into arm-passes-init
2 parents db3e38d + 05277dd commit 206cde5


78 files changed: +2435 -1302 lines changed


.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-08434df1f2f88c9770e59246caa2ff9c6f613270
+295f2ed4d103017f7e19a7b8263ece606cd629db

.ci/docker/common/install_android.sh

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ install_sdk() {
   # These are the tools needed to build Android apps
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "platforms;android-34"
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "build-tools;33.0.1"
+  yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "build-tools;35.0.0"
   # And some more tools for future emulator tests
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "platform-tools"
   yes | /opt/cmdline-tools/bin/sdkmanager --sdk_root="${SDK_INSTALLATION_DIR}" --install "tools"

.ci/docker/conda-env-ci.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-cmake=3.22.1
+cmake=3.26.4
 ninja=1.10.2
 libuv
 llvm-openmp
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+#!/bin/bash
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+export EXECUTORCH_ROOT="$(dirname "${BASH_SOURCE[0]}")/../.."
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+which "${PYTHON_EXECUTABLE}"
+
+# Update tokenizers submodule
+pushd $EXECUTORCH_ROOT/extension/llm/tokenizers
+echo "Update tokenizers submodule"
+git submodule update --init
+popd
+
+# Install ET with CMake
+cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DEXECUTORCH_ENABLE_LOGGING=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -Bcmake-out .
+cmake --build cmake-out -j16 --target install --config Release
+
+# Install llama runner with torchao
+cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())') \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=OFF \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_TORCHAO=ON \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
+cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+# Download stories llama110m artifacts
+download_stories_model_artifacts
+
+echo "Creating tokenizer.bin"
+$PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
+
+# Export model
+LLAMA_CHECKPOINT=stories110M.pt
+LLAMA_PARAMS=params.json
+MODEL_OUT=model.pte
+TOKENIZER=tokenizer.bin
+
+# Set low-bit quantization parameters
+QLINEAR_BITWIDTH=3 # Can be 1-8
+QLINEAR_GROUP_SIZE=128 # Must be multiple of 16
+QEMBEDDING_BITWIDTH=4 # Can be 1-8
+QEMBEDDING_GROUP_SIZE=32 # Must be multiple of 16
+
+${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
+    --checkpoint "${LLAMA_CHECKPOINT:?}" \
+    --params "${LLAMA_PARAMS:?}" \
+    -kv \
+    --use_sdpa_with_kv_cache \
+    --output_name=${MODEL_OUT} \
+    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
+    --group_size ${QLINEAR_GROUP_SIZE} \
+    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
+    --disable_dynamic_shape \
+    -d fp32
+
+# Test run
+./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path=$TOKENIZER --prompt="Once upon a time,"
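The four quantization variables above are the script's only tunables. As a side note, a small Python sketch (illustrative only, not part of the commit; the helper name is made up) of the constraints stated in the script's comments and of how the values are spliced into the -qmode, --group_size and -E arguments handed to export_llama:

# Hypothetical helper, for illustration only: mirrors the script's comments
# (bitwidths 1-8, group sizes a multiple of 16) and rebuilds the same argument
# strings the script passes to examples.models.llama.export_llama.
def torchao_lowbit_args(
    qlinear_bitwidth: int = 3,
    qlinear_group_size: int = 128,
    qembedding_bitwidth: int = 4,
    qembedding_group_size: int = 32,
) -> dict[str, str]:
    for bits in (qlinear_bitwidth, qembedding_bitwidth):
        if not 1 <= bits <= 8:
            raise ValueError(f"bitwidth must be in 1-8, got {bits}")
    for group_size in (qlinear_group_size, qembedding_group_size):
        if group_size % 16 != 0:
            raise ValueError(f"group size must be a multiple of 16, got {group_size}")
    return {
        "-qmode": f"torchao:8da{qlinear_bitwidth}w",
        "--group_size": str(qlinear_group_size),
        "-E": f"torchao:{qembedding_bitwidth},{qembedding_group_size}",
    }

# With the script's defaults this reproduces:
# {'-qmode': 'torchao:8da3w', '--group_size': '128', '-E': 'torchao:4,32'}
print(torchao_lowbit_args())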

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ name: android-perf
 
 on:
   schedule:
-    - cron: 0 0 * * *
+    - cron: 0 0,8,16 * * *
   pull_request:
     paths:
       - .github/workflows/android-perf.yml

.github/workflows/trunk.yml

Lines changed: 22 additions & 2 deletions
@@ -23,8 +23,8 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
       matrix:
-        # Mac runners are expensive and limited, and non reliable.
-        # Do some basic testing for macos jobs, and rely mostly on
+        # Mac runners are expensive and limited, and non reliable.
+        # Do some basic testing for macos jobs, and rely mostly on
         # test-models-linux-aarch64 job instead.
         model: [emformer_join, ic4, llama2, mobilebert, mv3, resnet50, vit, w2l]
         backend: [xnnpack-quantization-delegation]
@@ -288,6 +288,26 @@ jobs:
       # Test ANE llama
       ${CONDA_RUN} sh .ci/scripts/test_ane_static_llama.sh
 
+  test-llama-torchao-lowbit:
+    name: test-llama-torchao-lowbit
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    with:
+      runner: macos-m1-stable
+      python-version: '3.11'
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+        bash .ci/scripts/setup-conda.sh
+        eval "$(conda shell.bash hook)"
+
+        # Install requirements
+        ${CONDA_RUN} python install_executorch.py
+        ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
+
+        # Run test
+        ${CONDA_RUN} sh .ci/scripts/test_llama_torchao_lowbit.sh
+
   test-llama-runner-linux:
     # Test Both linux x86 and linux aarch64
     name: test-llama-runner-linux

backends/arm/operator_support/convolution_support.py

Lines changed: 17 additions & 0 deletions
@@ -34,6 +34,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
 
         for pad in output_padding:
             if pad != 0:
+                self.reporter.report_reject(
+                    node, "Convolutions with non-zero output padding not implemented."
+                )
                 return False
 
         # Hardware specific constraints
@@ -56,19 +59,33 @@ def _is_node_supported_u55(self, node: fx.Node):
             # Depthwise convolution
             for dim in shape_in[1:]:
                 if not 1 <= dim <= 65536:
+                    self.reporter.report_reject(
+                        node,
+                        f"Depthwise convolution must have CWH <= 65536, got {dim})",
+                    )
                     return False
         else:
            # Convolution
            if not 1 <= C_in <= 65536:
+                self.reporter.report_reject(
+                    node, f"Convolution must have C <= 65536, got {C_in})"
+                )
                return False
 
         kernel_w = kernel[2]
         kernel_h = kernel[3] if len(kernel) > 3 else 1
         # Kernel condition misses constraint on sum of absolute weights
         if not 1 <= kernel_h <= 64 or not 1 <= kernel_w * kernel_h <= 4096:
+            self.reporter.report_reject(
+                node,
+                f"Convolution needs to have kernel_y<=64, kernel_x*kernel_y<=4096, got kernel ({kernel_w}, {kernel_h})",
+            )
             return False
 
         if not self._stride_condition(node):
+            self.reporter.report_reject(
+                node, "Failed condition on stride, pad and dilation combination."
+            )
             return False
 
         return True
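For a concrete feel of the Ethos-U55 kernel bound the new reject message describes (kernel_y <= 64 and kernel_x * kernel_y <= 4096), a standalone sketch, illustrative only and not the backend's API:

# Illustrative re-statement of the U55 kernel condition from the hunk above;
# kernel_w/kernel_h follow the diff's naming (kernel_h is the "kernel_y" in
# the reject message).
def u55_conv_kernel_ok(kernel_w: int, kernel_h: int) -> bool:
    return 1 <= kernel_h <= 64 and 1 <= kernel_w * kernel_h <= 4096

print(u55_conv_kernel_ok(3, 3))     # True: a 3x3 kernel is fine
print(u55_conv_kernel_ok(1, 65))    # False: kernel_y exceeds 64
print(u55_conv_kernel_ok(128, 64))  # False: 128 * 64 = 8192 > 4096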

backends/arm/operator_support/pool_2d_support.py

Lines changed: 42 additions & 2 deletions
@@ -54,12 +54,35 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
         if len(node.args) > 3:
             # Padding case
             if not all(1 <= k <= 8 for k in kernel):
+                self.reporter.report_reject(
+                    node, f"Avgpool2d with padding needs kernel dims < 8, got {kernel}"
+                )
                 return False
         else:
             if not kernel_check(kernel):
+                self.reporter.report_reject(
+                    node,
+                    f"Avgpool2d needs kernel_y < 256, kernel_x*kernel_y<=65536, got {kernel}",
+                )
                 return False
 
-        return dim_check(shape) and shape[0] == 1 and stride_check(stride)
+        if not dim_check(shape):
+            self.reporter.report_reject(
+                node,
+                f"Avgpool2d needs N == 1, rest dims <= 65536, got shape {list(shape)}",
+            )
+            return False
+        if not stride_check(stride):
+            self.reporter.report_reject(
+                node, f"Avgpool2d needs stride <= 3, got {stride}"
+            )
+            return False
+        if not shape[0] == 1:
+            self.reporter.report_reject(
+                node, f"Avgpool2d needs N==1, got N=={shape[0]}"
+            )
+            return False
+        return True
 
 
 @register_tosa_support_check
@@ -82,4 +105,21 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
         kernel = cast(tuple[int, int], node.args[1])
         stride = cast(tuple[int, int], node.args[2])
 
-        return kernel_check(kernel) and dim_check(shape) and stride_check(stride)
+        if not kernel_check(kernel):
+            self.reporter.report_reject(
+                node,
+                f"Maxpool2d needs kernel_y < 256, kernel_x*kernel_y<=65536, got {kernel}",
+            )
+            return False
+        if not dim_check(shape):
+            self.reporter.report_reject(
+                node,
+                f"Maxpool2d needs N == 1, rest dims <= 65536, got shape {list(shape)}",
+            )
+            return False
+        if not stride_check(stride):
+            self.reporter.report_reject(
+                node, f"Maxpool2d needs stride <= 3, got {stride}"
+            )
+            return False
+        return True
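Both pooling checks follow the same pattern: the old single-line boolean return is unbundled so every failed constraint is reported before returning False. A compact sketch of the limits named in the reject messages (kernel_y < 256, kernel_x*kernel_y <= 65536, N == 1, remaining dims <= 65536, stride <= 3); the tuple ordering here is an assumption, not taken from the backend:

# Illustrative only: not the backend's kernel_check/dim_check/stride_check.
def pool2d_limits_ok(shape, kernel, stride):
    kernel_y, kernel_x = kernel          # assumed (H, W) ordering
    if not (kernel_y < 256 and kernel_x * kernel_y <= 65536):
        return False, f"kernel out of range: {kernel}"
    if not (shape[0] == 1 and all(d <= 65536 for d in shape[1:])):
        return False, f"bad shape: {list(shape)}"
    if not all(s <= 3 for s in stride):
        return False, f"stride too large: {stride}"
    return True, ""

print(pool2d_limits_ok((1, 16, 56, 56), (3, 3), (2, 2)))  # (True, '')
print(pool2d_limits_ok((2, 16, 56, 56), (3, 3), (2, 2)))  # rejected: N != 1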

backends/arm/operator_support/reduce_sum_support.py

Lines changed: 5 additions & 0 deletions
@@ -34,6 +34,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
 
         for dim in dim_list:
             if not 1 <= input_shape[dim] <= 65536:
+                self.reporter.report_reject(
+                    node, f"sum needs dims < 65536, got shape {input_shape}"
+                )
                 return False
 
         # We can't be certain of which dim is the last in memory yet,
@@ -45,7 +48,9 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
             for length in input_shape[dim + 1 :]:
                 post_R_product *= length
             if not 1 <= pre_R_product <= 65536:
+                self.reporter.report_reject(node, "Failed dim check")
                 return False
             if not 1 <= post_R_product <= 65536:
+                self.reporter.report_reject(node, "Failed dim check")
                 return False
         return True
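The second hunk bounds the product of the dimensions on either side of each reduced dim. A worked example (illustrative; the pre_R_product loop sits outside the hunk and is assumed symmetric to the post_R_product loop shown):

import math

# Illustrative only: the products before and after the reduced dim must each
# stay within 1..65536, mirroring the checks above.
def sum_dim_ok(input_shape, dim):
    pre_r = math.prod(input_shape[:dim])
    post_r = math.prod(input_shape[dim + 1:])
    return 1 <= pre_r <= 65536 and 1 <= post_r <= 65536

print(sum_dim_ok((1, 64, 128, 128), 1))  # True:  128 * 128 = 16384
print(sum_dim_ok((1, 4, 512, 512), 1))   # False: 512 * 512 = 262144 > 65536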

backends/arm/operator_support/to_copy_support.py

Lines changed: 13 additions & 13 deletions
@@ -75,9 +75,6 @@ def is_node_tosa_supported(
     ) -> bool:
         assert node.target in self.targets
 
-        if tosa_spec not in self.tosa_specs:
-            return False
-
         assert tosa_spec.support_integer()
         supported_dtypes = (
             self.ALL_SUPPORTED_TYPES
@@ -97,30 +94,32 @@ def is_node_tosa_supported(
         assert isinstance(input_val, torch._subclasses.FakeTensor)
         input_dtype = input_val.dtype
         if input_dtype not in supported_dtypes:
-            logger.info(
-                f"Input dtype {input_val.dtype} is not supported in "
-                f"{node.target.name()}."  # type: ignore[union-attr]  # pyre-ignore[16]
+            self.reporter.report_reject(
+                node,
+                f"Input dtype {input_val.dtype} is not supported in {node.target}.",
             )
             return False
 
         # Check output type
         output_val = node.meta["val"]
         assert isinstance(output_val, torch._subclasses.FakeTensor)
         if output_val.dtype not in supported_dtypes[input_dtype]:
-            logger.info(
+            self.reporter.report_reject(
+                node,
                 f"Output dtype {output_val.dtype} is not supported in "
-                f"{node.target.name()} for input dtype {input_dtype}. "  # type: ignore[union-attr]  # pyre-ignore[16]
+                f"{node.target} for input dtype {input_dtype}. "
                 f"Supported output types: "
-                f"{''.join(str(t) for t in supported_dtypes[input_dtype])}"
+                f"{''.join(str(t) for t in supported_dtypes[input_dtype])}",
             )
             return False
 
         # Check memory format (to_copy)
         if "memory_format" in node.kwargs:
             if node.kwargs["memory_format"] in (torch.preserve_format,):
-                logger.info(
+                self.reporter.report_reject(
+                    node,
                     f"Argument 'memory_format' is not supported for "
-                    f"{node.target.name()} right now."  # type: ignore[union-attr]  # pyre-ignore[16]
+                    f"{node.target} right now.",
                 )
                 return False
 
@@ -129,9 +128,10 @@ def is_node_tosa_supported(
             dim_order = node.kwargs["dim_order"]
             # pyre-ignore[6]
             if dim_order != list(range(len(dim_order))):  # type: ignore[arg-type]
-                logger.info(
+                self.reporter.report_reject(
+                    node,
                     f"Argument {dim_order=} is not supported for "
-                    f"{node.target.name()} right now."  # type: ignore[union-attr]  # pyre-ignore[16]
+                    f"{node.target} right now.",
                 )
                 return False
 
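Beyond the swap from logger.info to self.reporter.report_reject, the dtype gate itself is unchanged: the output dtype must appear in the set keyed by the input dtype. A toy sketch of that lookup (the table below is invented for illustration and is not the backend's ALL_SUPPORTED_TYPES):

import torch

# Invented example table; the real mapping lives in the support-check class.
SUPPORTED_CASTS = {
    torch.int8: {torch.int32, torch.float32},
    torch.float32: {torch.int32},
}

def cast_ok(input_dtype, output_dtype) -> bool:
    # Same shape of check as above: reject when the output dtype is not listed
    # for the given input dtype.
    return output_dtype in SUPPORTED_CASTS.get(input_dtype, set())

print(cast_ok(torch.int8, torch.float32))     # True under this toy table
print(cast_ok(torch.float32, torch.float64))  # False -> would be report_reject'ed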
