pytorch
diff --git a/‎.ci/scripts/build-qnn-sdk.sh
Lines changed: 1 addition & 2 deletions b/‎.ci/scripts/build-qnn-sdk.sh
Lines changed: 1 addition & 2 deletions
diff --git a/‎.ci/scripts/test_model.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/test_model.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 68 additions & 10 deletions b/‎.github/workflows/trunk.yml
Lines changed: 68 additions & 10 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 16 additions & 38 deletions b/‎CMakeLists.txt
Lines changed: 16 additions & 38 deletions
diff --git a/‎CODEOWNERS
Lines changed: 8 additions & 5 deletions b/‎CODEOWNERS
Lines changed: 8 additions & 5 deletions
diff --git a/‎backends/arm/_passes/TARGETS
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/TARGETS
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 2 deletions b/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 2 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 9 additions & 9 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 9 additions & 9 deletions
@@ -39,8 +39,7 @@ set_up_aot() {
       -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
       -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
       -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-      -DPYTHON_EXECUTABLE=python3 \
-      -DEXECUTORCH_SEPARATE_FLATCC_HOST_PROJECT=OFF
+      -DPYTHON_EXECUTABLE=python3
   cmake --build $PWD --target "PyQnnManagerAdaptor" "PyQnnWrapperAdaptor" -j$(nproc)
   # install Python APIs to correct import path
   # The filename might vary depending on your Python and host version.
 
@@ -201,7 +201,7 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450
 
-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
 
 
@@ -555,11 +555,11 @@ jobs:
     strategy:
       matrix:
         hf_model_id: [
-          google/gemma-2-2b,
-          Qwen/Qwen2.5-0.5B,
+          google/gemma-3-1b-it,
+          Qwen/Qwen3-0.6B,
           HuggingFaceTB/SmolLM2-135M,
           meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf
+          allenai/OLMo-1B-hf,
         ]
       fail-fast: false
     with:
@@ -569,44 +569,102 @@ jobs:
       submodules: 'recursive'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 90
+      upload-artifact: profiling-artifacts-${{ strategy.job-index }}
       script: |
         echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        # Build executor_runner with ETDump enabled
+        PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+          -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j16 --target install --config Release
         echo "::endgroup::"
 
         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         git clone https://github.com/huggingface/optimum-executorch
-        cd optimum-executorch
+        pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
         pip install .[tests]
+        popd
+
+        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
+          # Fixes for gemma-3 are not available in the released version
+          git clone https://github.com/huggingface/transformers.git
+          pushd transformers
+          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
+          pip install -e .
+          popd
+        fi
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+        echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        pushd optimum-executorch
+
+        optimum-cli export executorch \
+          --model ${MODEL_ID} \
+          --task text-generation \
+          --recipe xnnpack \
+          --use_custom_sdpa \
+          --output_dir ${OUTPUT_DIR} \
+          --qlinear
+
+        ls -FlAGhp ${OUTPUT_DIR}
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using python API"
+        pushd optimum-executorch
         python -c "
         import os
         from optimum.executorch import ExecuTorchModelForCausalLM
         from transformers import AutoTokenizer
 
         model_id = os.getenv('MODEL_ID')
-        print(f'Loading model: {model_id}')
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        pte_dir = os.getenv('OUTPUT_DIR')
+        print(f'Loading model {model_id} from {pte_dir}.')
+        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
         generated_text = model.text_generation(
-          tokenizer=tokenizer,
+          tokenizer=AutoTokenizer.from_pretrained(model_id),
           prompt='Simply put, the theory of relativity states that',
           max_seq_len=64
         )
         print(generated_text)
         "
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using executor_runner with ETDump"
+        ./cmake-out/executor_runner \
+          --model_path ${OUTPUT_DIR}/model.pte \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp
+
+        export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv
+        mkdir -p $(dirname "$TSV_PATH")
+        python3 -m devtools.inspector.inspector_cli \
+          --etdump_path ${OUTPUT_DIR}/etdump.etdp \
+          --tsv_path ${TSV_PATH}
+
         echo "::endgroup::"
 
 
 
@@ -49,6 +49,21 @@ project(executorch)
 
 include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
 
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+announce_configured_options(CMAKE_CXX_STANDARD)
+
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug)
+endif()
+announce_configured_options(CMAKE_BUILD_TYPE)
+
+announce_configured_options(CMAKE_CXX_COMPILER_ID)
+announce_configured_options(CMAKE_TOOLCHAIN_FILE)
+announce_configured_options(BUCK2)
+announce_configured_options(PYTHON_EXECUTABLE)
+
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
 
@@ -63,14 +78,6 @@ include(ExternalProject)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-if(NOT CMAKE_CXX_STANDARD)
-  set(CMAKE_CXX_STANDARD 17)
-endif()
-
-if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE Debug)
-endif()
-
 # Setup RPATH.
 # See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling
 # Use separate rpaths during build and install phases
@@ -128,22 +135,6 @@ else()
   set(CMAKE_CXX_FLAGS_RELEASE "-O2 ${CMAKE_CXX_FLAGS_RELEASE}")
 endif()
 
-#
-# pthreadpool: build pthreadpool library. Disable on unsupported platforms
-#
-cmake_dependent_option(
-  EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
-  "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
-)
-
-#
-# cpuinfo: build cpuinfo library. Disable on unsupported platforms
-#
-cmake_dependent_option(
-  EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
-  "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
-)
-
 add_subdirectory(third-party)
 
 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
@@ -475,14 +466,6 @@ install(
 )
 install(FILES tools/cmake/executorch-config.cmake DESTINATION lib/cmake/ExecuTorch)
 
-#
-# executor_runner: Host tool that demonstrates program execution.
-#
-cmake_dependent_option(
-  EXECUTORCH_BUILD_EXECUTOR_RUNNER "Build the executor_runner executable" ON
-  "NOT CMAKE_TOOLCHAIN_IOS" OFF
-)
-
 # Add googletest if any test targets should be built
 if(BUILD_TESTING)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/googletest)
@@ -571,9 +554,7 @@ if(EXECUTORCH_BUILD_EXTENSION_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/tensor)
 endif()
 
-if(EXECUTORCH_BUILD_PTHREADPOOL
-   AND EXECUTORCH_BUILD_CPUINFO
-)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
 endif()
 
@@ -738,6 +719,3 @@ if(EXECUTORCH_BUILD_VULKAN)
 endif()
 
 include(Test.cmake)
-
-# Print all summary
-executorch_print_configuration_summary()
@@ -15,10 +15,6 @@
 /backends/vulkan @SS-JIA
 /backends/xnnpack @digantdesai @mcr229
 
-/build @GregoryComer @kirklandsign
-
-/codegen @larryliu0820 @lucylq
-
 /devtools @tarun292 @Gasoonjia
 
 /docs @mergennachin
@@ -41,7 +37,6 @@
 /exir/backend @cccclai @kimishpatel @JacobSzwejbka @tarun292
 /exir @JacobSzwejbka @tarun292 @larryliu0820
 
-
 /extension/android @kirklandsign
 /extension/android_test @kirklandsign
 /extension/apple @shoumikhin
@@ -83,3 +78,11 @@
 /test @larryliu0820 @kirklandsign
 
 /util @tarun292
+
+# Build System -----------------------------------------------------------------
+
+CMakeLists.txt @jathu @larryliu0820 @kirklandsign
+CMakePresets.json @jathu @larryliu0820 @kirklandsign
+
+/codegen @larryliu0820 @lucylq
+/tools/cmake @jathu @larryliu0820 @kirklandsign
@@ -7,6 +7,7 @@ python_library(
         "//executorch/backends/arm:tosa_quant_utils",
         "//executorch/backends/arm:tosa_utils",
         "//executorch/backends/transforms:fuse_view_copy",
+        "//executorch/backends/transforms:remove_getitem_op",
         "//executorch/backends/transforms:replace_scalar_with_tensor",
         "//executorch/backends/xnnpack/_passes:xnnpack_passes",
         "//executorch/exir:lib",
 
@@ -32,6 +32,7 @@
 from .decompose_softmax_pass import DecomposeSoftmaxPass  # noqa
 from .decompose_softmax_unstable_pass import DecomposeSoftmaxUnstablePass  # noqa
 from .decompose_sqrt_pass import DecomposeSqrtPass  # noqa
+from .decompose_sum_pass import DecomposeSumPass  # noqa
 from .decompose_var_pass import DecomposeVarPass  # noqa
 from .fold_qdq_with_annotated_qparams_pass import (  # noqa
     FoldAndAnnotateQParamsPass,
@@ -44,10 +45,8 @@
 from .fuse_quantized_activation_pass import FuseQuantizedActivationPass  # noqa
 from .insert_rescales_pass import InsertRescalePass  # noqa
 from .insert_table_ops import InsertTableOpsPass  # noqa
-from .keep_dims_false_to_squeeze_pass import KeepDimsFalseToSqueezePass  # noqa
 from .match_arg_ranks_pass import MatchArgRanksPass  # noqa
 from .match_where_self_arg_dtype_pass import MatchWhereSelfDtypePass  # noqa
-from .meandim_to_averagepool_pass import ConvertMeanDimToAveragePoolPass  # noqa
 from .mm_to_bmm_pass import ConvertMmToBmmPass  # noqa
 from .remove_clone_pass import RemoveClonePass  # noqa
 from .replace_scalar_with_tensor_pass import (  # noqa
 
@@ -17,7 +17,6 @@
     ConvertAnyDefaultDimDimsPass,
     ConvertExpandCopyToRepeatPass,
     ConvertFullLikeToFullPass,
-    ConvertMeanDimToAveragePoolPass,
     ConvertMinMaxPass,
     ConvertMmToBmmPass,
     ConvertSplitToSlicePass,
@@ -37,6 +36,7 @@
     DecomposeSoftmaxPass,
     DecomposeSoftmaxUnstablePass,
     DecomposeSqrtPass,
+    DecomposeSumPass,
     DecomposeVarPass,
     FoldAndAnnotateQParamsPass,
     FuseBatchnorm2DPass,
@@ -45,7 +45,6 @@
     FuseQuantizedActivationPass,
     InsertRescalePass,
     InsertTableOpsPass,
-    KeepDimsFalseToSqueezePass,
     MatchArgRanksPass,
     MatchWhereSelfDtypePass,
     QuantizeOperatorArguments,
@@ -60,7 +59,7 @@
     UnsqueezeScalarPlaceholdersPass,
 )
 
-from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )
@@ -87,13 +86,13 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
-        self.add_pass(ConvertMeanDimToAveragePoolPass())
+        self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertFullLikeToFullPass())
         self.add_pass(ConvertToClampPass())
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ConvertAnyDefaultDimDimsPass())
         self.add_pass(MatchWhereSelfDtypePass())
-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             self.add_pass(CastToInt32Pass())
 
         self.add_pass(ReplaceScalarWithTensorArgPassTOSABI())
@@ -110,7 +109,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
-        self.add_pass(KeepDimsFalseToSqueezePass())
+        self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
@@ -140,7 +139,6 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeNotEqualPass())
-        self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxPass())
         self.add_pass(DecomposeGeluPass())
@@ -163,7 +161,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
-        self.add_pass(KeepDimsFalseToSqueezePass())
+        self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
@@ -212,12 +210,14 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeSqrtPass())
         self.add_pass(DecomposeSiluPass())
 
-        if isinstance(self.tosa_spec, Tosa_0_80) and self.tosa_spec.is_U55_subset:
+        if self.tosa_spec.is_U55_subset:
             # Numerically stable softmax uses amax which is not supported on Ethos-U55
             self.add_pass(DecomposeSoftmaxUnstablePass())
         else:
             self.add_pass(DecomposeSoftmaxPass())
 
         self.add_pass(ConvertMinMaxPass())
         self.add_pass(ReplaceInfValues())
+        self.add_pass(DecomposeSumPass())
+
         return self._transform(graph_module)
Original file line number	Diff line number	Diff line change
`@@ -201,7 +201,7 @@ test_model_with_qnn() {`
`201`	`201`	`# TODO(guangyang): Make QNN chipset matches the target device`
`202`	`202`	`QNN_CHIPSET=SM8450`
`203`	`203`
`204`		`- "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS`
	`204`	`+ "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --ci --compile_only $EXTRA_FLAGS`
`205`	`205`	`EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)`
`206`	`206`	`}`
`207`	`207`