Commit 997bd3d

Merge branch 'main' into dev1/danny/support_qnn_ir_backend

2 parents 8e338dc + 647e1f1

144 files changed: +1693 −901 lines


.github/workflows/android-release-artifacts.yml

Lines changed: 10 additions & 0 deletions

@@ -11,6 +11,11 @@ on:
         description: Upload the AAR to maven staging repository
         required: false
         type: boolean
+      flavor:
+        type: choice
+        options:
+          - "xnnpack"
+          - "vulkan+xnnpack"
   schedule:
     - cron: 0 10 * * *

@@ -86,6 +91,11 @@ jobs:
             sed -i "s/\(coordinates(\"org.pytorch\", \"executorch-android\", \"\)\([0-9]\+.[0-9]\+.[0-9]\+\)\(\")\)/\1$VERSION\3/" extension/android/executorch_android/build.gradle
           fi

+          FLAVOR="${{ inputs.flavor }}"
+          if [[ "$FLAVOR" == "vulkan+xnnpack" ]]; then
+            export EXECUTORCH_BUILD_VULKAN=ON
+          fi
+
           # Build AAR Package
           mkdir aar-out
           export BUILD_AAR_DIR=aar-out
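The new workflow input maps a build flavor onto extra environment variables for the AAR build. As an illustration only, the shell logic above can be restated as a small Python function (the function name is a hypothetical stand-in, not part of the workflow):

```python
def flavor_build_env(flavor: str) -> dict:
    """Map an AAR build flavor to extra build environment variables.

    Mirrors the shell snippet in the workflow diff above: only the
    "vulkan+xnnpack" flavor turns the Vulkan backend on; plain
    "xnnpack" adds nothing.
    """
    env = {}
    if flavor == "vulkan+xnnpack":
        env["EXECUTORCH_BUILD_VULKAN"] = "ON"
    return env
```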

.github/workflows/doc-build.yml

Lines changed: 14 additions & 0 deletions

@@ -14,6 +14,20 @@ on:
     - cron: '0 0 * * *'

 jobs:
+  check-urls:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check URLs
+        run: bash ./scripts/check_urls.sh
+
+  check-xrefs:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Check Links
+        run: bash ./scripts/check_xrefs.sh
+
   build:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
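The contents of `scripts/check_urls.sh` and `scripts/check_xrefs.sh` are not part of this diff. Purely as an illustration of what such a link check involves, the core step is extracting candidate URLs from documentation text; the regex and function below are hypothetical, not taken from those scripts:

```python
import re

# Hypothetical sketch of the extraction step a URL checker performs;
# the actual scripts/check_urls.sh is not shown in this diff.
URL_PATTERN = re.compile(r'https?://[^\s<>"\')\]]+')


def extract_urls(text: str) -> list:
    """Return all http(s) URLs found in a block of text, in order."""
    return URL_PATTERN.findall(text)
```

A real checker would then probe each extracted URL (e.g. with an HTTP HEAD request) and fail the job on broken links.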

CONTRIBUTING.md

Lines changed: 4 additions & 4 deletions

@@ -45,11 +45,11 @@ executorch
 │ └── <a href="devtools/visualization">visualization</a> - Visualization tools for representing model structure and performance metrics.
 ├── <a href="docs">docs</a> - Static docs tooling and documentation source files.
 ├── <a href="examples">examples</a> - Examples of various user flows, such as model export, delegates, and runtime execution.
-├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/stable/export.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="/docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
+├── <a href="exir">exir</a> - Ahead-of-time library: model capture and lowering APIs. EXport Intermediate Representation (EXIR) is a format for representing the result of <a href="https://pytorch.org/docs/stable/export.html">torch.export</a>. This directory contains utilities and passes for lowering the EXIR graphs into different <a href="docs/source/ir-exir.md">dialects</a> and eventually suitable to run on target hardware.
 │ ├── <a href="exir/_serialize">_serialize</a> - Serialize final export artifact.
 │ ├── <a href="exir/backend">backend</a> - Backend delegate ahead of time APIs.
 │ ├── <a href="exir/capture">capture</a> - Program capture.
-│ ├── <a href="exir/dialects">dialects</a> - Op sets for various dialects in the export process. Please refer to the <a href="/docs/source/ir-exir.md">EXIR spec</a> and the <a href="/docs/source/compiler-backend-dialect.md">backend dialect</a> doc for more details.
+│ ├── <a href="exir/dialects">dialects</a> - Op sets for various dialects in the export process. Please refer to the <a href="docs/source/ir-exir.md">EXIR spec</a> and the <a href="docs/source/compiler-backend-dialect.md">backend dialect</a> doc for more details.
 │ ├── <a href="exir/emit">emit</a> - Conversion from ExportedProgram to ExecuTorch execution instructions.
 │ ├── <a href="exir/operator">operator</a> - Operator node manipulation utilities.
 │ ├── <a href="exir/passes">passes</a> - Built-in compiler passes.

@@ -68,7 +68,7 @@ executorch
 │ ├── <a href="extension/memory_allocator">memory_allocator</a> - 1st party memory allocator implementations.
 │ ├── <a href="extension/module">module</a> - A simplified C++ wrapper for the runtime. An abstraction that deserializes and executes an ExecuTorch artifact (.pte file). Refer to the <a href="docs/source/extension-module.md">module documentation</a> for more information.
 │ ├── <a href="extension/parallel">parallel</a> - C++ threadpool integration.
-│ ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="docs/source/runtime-python-api-reference.md">runtime Python API</a> for ExecuTorch.
+│ ├── <a href="extension/pybindings">pybindings</a> - Python API for executorch runtime. This is powering up the <a href="docs/source/runtime-python-api-reference.rst">runtime Python API</a> for ExecuTorch.
 │ ├── <a href="extension/pytree">pytree</a> - C++ and Python flattening and unflattening lib for pytrees.
 │ ├── <a href="extension/runner_util">runner_util</a> - Helpers for writing C++ PTE-execution tools.
 │ ├── <a href="extension/tensor">tensor</a> - Tensor maker and <code>TensorPtr</code>, details in <a href="docs/source/extension-tensor.md">this documentation</a>. For how to use <code>TensorPtr</code> and <code>Module</code>, please refer to the <a href="docs/source/using-executorch-cpp.md">"Using ExecuTorch with C++"</a> doc.

@@ -114,7 +114,7 @@ If you're completely new to open-source projects, GitHub, or ExecuTorch, please
 1. If you've changed APIs or added a new tool or feature, [update the
    documentation](#updating-documentation).
 1. If you added an experimental API or deprecated an existing API, follow the
-   [API Life Cycle and Deprecation Policy](/docs/source/api-life-cycle.md).
+   [API Life Cycle and Deprecation Policy](docs/source/api-life-cycle.md).
 1. Make sure your code follows the [style guides](#coding-style) and passes the
    [lint checks](#lintrunner).
 1. If you haven't already, complete the [Contributor License Agreement ("CLA")](#contributor-license-agreement-cla).

README-wheel.md

Lines changed: 1 addition & 1 deletion

@@ -25,6 +25,6 @@ tutorials and documentation. Here are some starting points:
 * [Exporting to ExecuTorch](https://pytorch.org/executorch/main/tutorials/export-to-executorch-tutorial)
   * Learn the fundamentals of exporting a PyTorch `nn.Module` to ExecuTorch, and
     optimizing its performance using quantization and hardware delegation.
-* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios) and [Android](docs/source/llm/llama-demo-android) devices.
+* Running LLaMA on [iOS](docs/source/llm/llama-demo-ios.md) and [Android](docs/source/llm/llama-demo-android.md) devices.
   * Build and run LLaMA in a demo mobile app, and learn how to integrate models
     with your own apps.

backends/apple/coreml/runtime/test/setup.md

Lines changed: 8 additions & 8 deletions

@@ -4,18 +4,18 @@ This is a tutorial for setting up tests for the **Core ML** backend.

 ## Running tests

-1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment.
+1. Follow the instructions described in [Setting Up ExecuTorch](../../../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment.

 2. Run `install_requirements.sh` to install dependencies required by the **Core ML** backend.

 ```bash
 cd executorch

-sh backends/apple/coreml/scripts/install_requirements.sh
+sh backends/apple/coreml/scripts/install_requirements.sh

-```
+```

-3. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system.
+3. Follow the instructions described in [Building with CMake](../../../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system.

 4. Install [Xcode](https://developer.apple.com/xcode/).

@@ -26,7 +26,7 @@ sh backends/apple/coreml/scripts/install_requirements.sh
 ```bash
 cd executorch

-# Builds macOS universal test bundle.
+# Builds macOS universal test bundle.

 sh backends/apple/coreml/srcipts/build_tests.sh

@@ -40,15 +40,15 @@ cd executorch
 sh backends/apple/coreml/srcipts/run_tests.sh

 ```
-
+
 ## Updating tests

 1. Open the Xcode workspace.

 ```bash
 cd executorch

-# Builds macOS universal test bundle.
+# Builds macOS universal test bundle.

 open backends/apple/coreml/runtime/workspace/executorchcoreml.xcworkspace

@@ -62,4 +62,4 @@ cd executorch
 # There is no need to build the tests.
 sh backends/apple/coreml/srcipts/run_tests.sh

-```
+```
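The link fixes in this file (and in the other docs touched by this commit) replace repo-absolute `/docs/source/...` paths with paths relative to the file's own directory. The number of `../` hops follows mechanically from the file's depth, which `os.path.relpath` reproduces:

```python
import os.path

# setup.md lives at backends/apple/coreml/runtime/test/setup.md, five
# directories deep, so a relative link to docs/source needs five "../" hops.
link = os.path.relpath("docs/source", start="backends/apple/coreml/runtime/test")
print(link)  # ../../../../../docs/source
```

This is a pure path computation (no filesystem access), so it can be used to sanity-check relative links before committing doc changes.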

backends/apple/coreml/setup.md

Lines changed: 2 additions & 2 deletions

@@ -4,7 +4,7 @@ This is a tutorial for setting up the Core ML backend.

 ## AOT Setup

-1. Follow the instructions described in [Setting Up ExecuTorch](/docs/source/getting-started-setup.md) to set up ExecuTorch environment.
+1. Follow the instructions described in [Setting Up ExecuTorch](../../../docs/source/getting-started-setup.rst) to set up ExecuTorch environment.


 2. Run the example script to validate that the **Core ML** backend is set up correctly.

@@ -28,7 +28,7 @@ delegated_program_manager = edge_program_manager.to_backend(CoreMLPartitioner())

 ## Integrating Core ML delegate into runtime.

-1. Follow the instructions described in [Building with CMake](/docs/source/runtime-build-and-cross-compilation.md#building-with-cmake) to set up CMake build system.
+1. Follow the instructions described in [Building with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake) to set up CMake build system.

 2. Install [Xcode](https://developer.apple.com/xcode/).

backends/apple/mps/mps_preprocess.py

Lines changed: 15 additions & 1 deletion

@@ -6,6 +6,7 @@
 from typing import ClassVar, Dict, final, List, Tuple

 import torch
+from executorch import exir

 from executorch.backends.apple.mps.operators.node_visitor import (
     get_node_visitors,

@@ -35,6 +36,7 @@
 from executorch.exir.passes.memory_format_ops_pass import DimOrderOpsRevertPass
 from executorch.exir.program._program import _transform
+from executorch.exir.verification.verifier import EXIREdgeDialectVerifier
 from torch.export.exported_program import ExportedProgram

 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"

@@ -87,7 +89,19 @@ def preprocess(
     # the `output_ids` array in the schema.

     # TODO: Remove this once we have a better support for the dim-order ops.
-    edge_program = _transform(edge_program, DimOrderOpsRevertPass())
+    # Need to override the verifier to skip the non dim-order ops from tripping the default verifier.
+    edge_program = _transform(
+        edge_program,
+        DimOrderOpsRevertPass(),
+        override_verifiers=[
+            EXIREdgeDialectVerifier(
+                edge_compile_config=exir.EdgeCompileConfig(
+                    _check_ir_validity=False,  # Disable the edge dialect verifier, since we are in the mps backend.
+                ),
+                class_only=True,
+            )
+        ],
+    )

     mps_graph = MPSGraph(
         version="0",

backends/apple/mps/setup.md

Lines changed: 7 additions & 7 deletions

@@ -12,11 +12,11 @@ The MPS backend device maps machine learning computational graphs and primitives
 :::
 :::{grid-item-card} Tutorials we recommend you complete before this:
 :class-card: card-prerequisites
-* [Introduction to ExecuTorch](intro-how-it-works.md)
-* [Setting up ExecuTorch](getting-started-setup.md)
-* [Building ExecuTorch with CMake](runtime-build-and-cross-compilation.md)
-* [ExecuTorch iOS Demo App](demo-apps-ios.md)
-* [ExecuTorch iOS LLaMA Demo App](llm/llama-demo-ios.md)
+* [Introduction to ExecuTorch](../../../docs/source/intro-how-it-works.md)
+* [Setting up ExecuTorch](../../../docs/source/getting-started-setup.rst)
+* [Building ExecuTorch with CMake](../../../docs/source/using-executorch-cpp.md#building-with-cmake)
+* [ExecuTorch iOS Demo App](../../../docs/source/demo-apps-ios.md)
+* [ExecuTorch iOS LLaMA Demo App](../../../docs/source/llm/llama-demo-ios.md)
 :::
 ::::

@@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp
 ```

 ### Profiling:
-1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model.
+1. [Optional] Generate an [ETRecord](../../../docs/source/etrecord.rst) while you're exporting your model.
 ```bash
 cd executorch
 python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
 ```
-2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md).
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](../../../docs/source/etdump.md).
 ```
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
 ```

backends/cadence/aot/pass_utils.py

Lines changed: 4 additions & 3 deletions

@@ -35,8 +35,8 @@ class CadencePassAttribute:
 ALL_CADENCE_PASSES: dict[ExportPass, CadencePassAttribute] = {}


-def get_cadence_pass_attribute(p: ExportPass) -> CadencePassAttribute:
-    return ALL_CADENCE_PASSES[p]
+def get_cadence_pass_attribute(p: ExportPass) -> Optional[CadencePassAttribute]:
+    return ALL_CADENCE_PASSES.get(p, None)


 # A decorator that registers a pass.

@@ -61,7 +61,8 @@ def create_cadence_pass_filter(
     def _filter(p: ExportPass) -> bool:
         pass_attribute = get_cadence_pass_attribute(p)
         return (
-            pass_attribute.opt_level is not None
+            pass_attribute is not None
+            and pass_attribute.opt_level is not None
             and pass_attribute.opt_level <= opt_level
             and (not pass_attribute.debug_pass or debug)
         )
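The change above swaps a `KeyError`-prone `dict[key]` lookup for `dict.get`, and makes the filter treat unregistered passes as filtered-out rather than crashing. A minimal standalone sketch of that pattern, with a simplified registry and attribute names that are stand-ins for the real `CadencePassAttribute` fields:

```python
from typing import Optional

# Simplified stand-in for ALL_CADENCE_PASSES; values mimic the
# opt_level / debug_pass attributes used by the real filter.
REGISTRY = {"pass_a": {"opt_level": 1, "debug_pass": False}}


def get_attribute(name: str) -> Optional[dict]:
    # dict.get returns None for unregistered passes instead of raising KeyError
    return REGISTRY.get(name, None)


def keep(name: str, opt_level: int, debug: bool = False) -> bool:
    attr = get_attribute(name)
    return (
        attr is not None  # unregistered passes are simply filtered out
        and attr["opt_level"] is not None
        and attr["opt_level"] <= opt_level
        and (not attr["debug_pass"] or debug)
    )
```

Because `and` short-circuits, the `attr is not None` guard makes the later attribute accesses safe even for unknown passes.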

backends/cadence/aot/replace_ops.py

Lines changed: 80 additions & 27 deletions

@@ -17,8 +17,9 @@
 # pyre-unsafe

 import math
+import operator
 from operator import neg
-from typing import cast, Dict, Iterable, Sequence, Set, Tuple
+from typing import cast, Dict, Iterable, Optional, Sequence, Set, Tuple

 import torch
 import torch.fx

@@ -1806,30 +1807,6 @@ def call_operator(self, op, args, kwargs, meta):
     return super().call_operator(op, tuple(new_args), kwargs, meta)


-@register_cadence_pass(CadencePassAttribute(opt_level=0))
-class ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass(ExportPass):
-    """
-    Replace the aten.linalg_vector_norm op with a custom op.
-    aten.linalg_vector_norm is not supported by Jarvis, so we
-    need to replace it with native_batch_norm at all optimization levels.
-    """
-
-    def call_operator(self, op, args, kwargs, meta):
-        if op != exir_ops.edge.aten.linalg_vector_norm.default:
-            return super().call_operator(op, args, kwargs, meta)
-
-        assert (
-            len(args) == 1
-        ), "aten.linalg_vector_norm should have 1 argument (a tensor), we do not support any custom variants"
-
-        return super().call_operator(
-            exir_ops.edge.cadence.linalg_vector_norm.default,
-            args,
-            kwargs,
-            meta,
-        )
-
-
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
 class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass):
     """

@@ -2206,6 +2183,82 @@ def call_operator(
     )


+# Adapted from fbcode/pyspeech/opt_passes/replace_ops.py
+@register_cadence_pass(CadencePassAttribute(opt_level=2))
+class ReplaceSplitWithSlicePass(ExportPass):
+    """
+    split_with_sizes() delegates to slice() op, so perform this replacement here.
+    This avoids the expense of delegation from ATen.
+    """
+
+    # For split_with_sizes, return the slice dim and extent for each split.
+    def get_split_sizes(
+        self, graph_module: torch.fx.GraphModule, node: torch.fx.Node
+    ) -> Optional[list[tuple[int, ...]]]:
+        # Parse the args of the split_with_sizes op
+        tensor_arg, split_sizes = node.args[0:2]
+        assert isinstance(tensor_arg, torch.fx.Node)
+        in_shape = get_shape(graph_module, tensor_arg)
+        split_dim = 0 if len(node.args) < 3 else node.args[2]
+        if in_shape is None:
+            return None
+
+        # Canonicalize the split dimension
+        assert isinstance(split_dim, int)
+        split_dim = split_dim if split_dim >= 0 else len(in_shape) + split_dim
+
+        # Create the slice op args corresponding to each split
+        slice_ops = []
+        split_start = 0
+        assert isinstance(split_sizes, list)
+        for split_size in split_sizes:
+            split_end = split_start + split_size
+            slice_args = (split_dim, split_start, split_end)
+            slice_ops.append(slice_args)
+            split_start = split_end
+
+        return slice_ops
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        graph = graph_module.graph
+        for node in graph.nodes:
+            if not isinstance(node.target, EdgeOpOverload):
+                continue
+            if (
+                get_edge_overload_packet(node.target)
+                != exir_ops.edge.aten.split_with_sizes_copy
+            ):
+                continue
+            # All the users of this split_with_sizes op must be getitem ops
+            if any(user.target != operator.getitem for user in node.users):
+                continue
+
+            # Get the slice dim and extent for each split
+            slice_ops = self.get_split_sizes(graph_module, node)
+            if slice_ops is None:
+                continue
+
+            # Go over each getitem user, and replace it with slice op
+            for user in list(node.users.keys()):
+                assert user.target == operator.getitem
+                item_idx = user.args[1]
+                assert item_idx < len(slice_ops)
+                cur_slice = slice_ops[item_idx]
+                with graph.inserting_before(user):
+                    cur_slice_node = graph.call_function(
+                        exir_ops.edge.aten.slice_copy.Tensor,
+                        (node.args[0], cur_slice[0], cur_slice[1], cur_slice[2], 1),
+                    )
+                user.replace_all_uses_with(cur_slice_node)
+                graph.erase_node(user)
+
+            graph.erase_node(node)
+
+        graph_module.recompile()
+        result = super().call(graph_module)
+        return result
+
+
 # This class encapsulates all the functions that replace/switch one op in the
 # graph with another.
 class CadenceReplaceOpsInGraph:

@@ -2243,7 +2296,7 @@ class CadenceReplaceOpsInGraph:
         ReplacePT2DequantWithCadenceDequantPass,
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
-        ReplaceAtenLinalgVectorNormWithCadenceLinalgVectorNormPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
-        # ReplaceGeluWithApproximateGeluPass,
+        ReplaceGeluWithApproximateGeluPass,
+        ReplaceSplitWithSlicePass,
     ]
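The new `ReplaceSplitWithSlicePass` rewrites each `getitem` consumer of a `split_with_sizes_copy` node into a `slice_copy` over a `(dim, start, end)` range. The index arithmetic it performs can be checked in isolation; the helper below is a hypothetical standalone version of the bookkeeping in `get_split_sizes` (the real pass reads these values off `torch.fx` nodes rather than plain arguments):

```python
def split_to_slice_args(split_sizes, split_dim, rank):
    """Compute the (dim, start, end) slice triple for each split.

    Mirrors the accumulation in ReplaceSplitWithSlicePass.get_split_sizes:
    consecutive splits tile the split dimension with half-open [start, end)
    ranges, and a negative split_dim is canonicalized against the rank.
    """
    dim = split_dim if split_dim >= 0 else rank + split_dim
    slices, start = [], 0
    for size in split_sizes:
        slices.append((dim, start, start + size))
        start += size
    return slices
```

For example, splitting a rank-2 tensor along `dim=-1` into sizes `[2, 3, 5]` yields the slices `(1, 0, 2)`, `(1, 2, 5)`, and `(1, 5, 10)`, which is exactly what the pass feeds to `exir_ops.edge.aten.slice_copy.Tensor` (with a step of 1).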
