
Commit 4655097

Merge branch 'release/0.6' into cherry-pick-9799-by-pytorch_bot_bot_
2 parents 7845699 + ff9fcaa

11 files changed (+381, -394 lines)

backends/xnnpack/operators/op_static_constant_pad.py

Lines changed: 13 additions & 1 deletion
@@ -7,6 +7,7 @@
 from typing import cast, Dict, List
 
 import torch
+
 from executorch.backends.xnnpack.operators.node_visitor import (
     get_tensor_value,
     NodeVisitor,
@@ -17,7 +18,11 @@
     XNNStaticConstantPad,
     XNode,
 )
-from executorch.backends.xnnpack.utils.utils import check_or_raise, get_input_node
+from executorch.backends.xnnpack.utils.utils import (
+    check_or_raise,
+    get_input_node,
+    PERM_NCHW_TO_NHWC,
+)
 
 
 @register_node_visitor
@@ -113,8 +118,15 @@ def define_node(
         # b)
         # tuple[0] = prepadding dim[-1]
         # tuple[1] = postpadding dim[-1]
+        is_channels_last = node.meta.get("XNN_NHWC_NODE", False)
         pre_paddings = all_paddings[-2::-2]  # even index elements in reverse order
         post_paddings = all_paddings[::-2]  # odd index elements in reverse order
+        if is_channels_last:
+            check_or_raise(len(pre_paddings) == 4, "Expecting prepaddings to be 4D")
+            check_or_raise(len(post_paddings) == 4, "Expecting postpaddings to be 4D")
+
+            pre_paddings = [pre_paddings[i] for i in PERM_NCHW_TO_NHWC]
+            post_paddings = [post_paddings[i] for i in PERM_NCHW_TO_NHWC]
 
         # the padding value, which defaults to 0.0
         padding_value = cast(float, node.args[2]) if len(node.args) > 2 else 0.0
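
To make the new channels-last branch concrete, here is a small standalone sketch of the padding reorder (an illustration, not the visitor code itself). It assumes `PERM_NCHW_TO_NHWC = [0, 2, 3, 1]` (dim order N, H, W, C) and that `all_paddings` has already been expanded to one (pre, post) pair per input dimension in `torch.nn.functional.pad` order:

```python
# Illustrative sketch only; assumes PERM_NCHW_TO_NHWC = [0, 2, 3, 1] (N, H, W, C).
PERM_NCHW_TO_NHWC = [0, 2, 3, 1]

# torch.nn.functional.pad lists pairs last-dimension-first; for a 4D NCHW input
# padded with pad=(1, 2, 3, 4, 5, 6), the fully expanded list would be:
all_paddings = [1, 2, 3, 4, 5, 6, 0, 0]  # (W_pre, W_post, H_pre, H_post, C_pre, C_post, N_pre, N_post)

pre_paddings = all_paddings[-2::-2]  # [0, 5, 3, 1] == per-dim pre-padding in (N, C, H, W) order
post_paddings = all_paddings[::-2]   # [0, 6, 4, 2] == per-dim post-padding in (N, C, H, W) order

# If the node was tagged channels-last ("XNN_NHWC_NODE"), reorder the amounts
# so that each one pads the correct dimension of the NHWC tensor.
pre_paddings = [pre_paddings[i] for i in PERM_NCHW_TO_NHWC]    # [0, 3, 1, 5] == (N, H, W, C)
post_paddings = [post_paddings[i] for i in PERM_NCHW_TO_NHWC]  # [0, 4, 2, 6] == (N, H, W, C)
```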

backends/xnnpack/test/ops/test_static_constant_pad.py

Lines changed: 45 additions & 0 deletions
@@ -14,6 +14,30 @@ class TestStaticConstantPad(unittest.TestCase):
     def setUp(self):
         torch._dynamo.reset()
 
+    class NHWCStaticConstantPad(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(in_channels=2, out_channels=2, kernel_size=1)
+            self.conv2 = torch.nn.Conv2d(in_channels=13, out_channels=13, kernel_size=1)
+
+        def forward(self, x):
+            a = self.conv1(x)
+            pad_6 = (1, 2, 3, 4, 5, 6)
+            a = torch.nn.functional.pad(
+                input=a,
+                pad=pad_6,
+                mode="constant",
+                value=3.1,
+            )
+            # tensorshape = [1, 13, 10, 7]
+            a = self.conv2(a)
+
+            return a
+
+        def sample_inputs(self):
+            # NCHW
+            return (torch.randn(1, 2, 3, 4),)
+
     class StaticConstantPadFunctional(torch.nn.Module):
         def __init__(self):
             super().__init__()
@@ -205,3 +229,24 @@ def test_qs8_static_constant_pad_2d(self):
             .serialize()
             .run_method_and_compare_outputs()
         )
+
+    def test_fp32_static_constant_pad_nhwc(self):
+        conv = self.NHWCStaticConstantPad()
+        inputs = conv.sample_inputs()
+        (
+            Tester(conv, inputs)
+            .export()
+            .check_count({"torch.ops.aten.pad.default": 1})
+            .dump_artifact()
+            .to_edge_transform_and_lower()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .check_not(
+                [
+                    "executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default",
+                    "executorch_exir_dialects_edge__ops_aten_convolution_default",
+                ]
+            )
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
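
As a quick standalone check of the shapes used by `NHWCStaticConstantPad` (and of why `conv2` needs `in_channels=13`), the padding arithmetic can be reproduced with plain PyTorch; the values below are taken directly from the test above:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 2, 3, 4)                   # NCHW sample input, as in sample_inputs()
a = torch.nn.Conv2d(2, 2, kernel_size=1)(x)   # 1x1 conv keeps the shape: (1, 2, 3, 4)
a = F.pad(a, pad=(1, 2, 3, 4, 5, 6), mode="constant", value=3.1)

# W: 4+1+2=7, H: 3+3+4=10, C: 2+5+6=13 -> matches the "# tensorshape = [1, 13, 10, 7]" comment
print(a.shape)  # torch.Size([1, 13, 10, 7])
```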
Binary files (139 KB and 1.92 MB) changed; contents not shown.

docs/source/backend-delegates-xnnpack-reference.md

Lines changed: 1 addition & 1 deletion
@@ -142,5 +142,5 @@ def _qdq_quantized_linear(
 You can read more indepth explanations on PyTorch 2 quantization [here](https://pytorch.org/tutorials/prototype/pt2e_quant_ptq.html).
 
 ## See Also
-- [Integrating XNNPACK Delegate Android App](demo-apps-android.md)
+- [Integrating XNNPACK Delegate in Android AAR](using-executorch-android.md)
 - [Complete the Lowering to XNNPACK Tutorial](tutorial-xnnpack-delegate-lowering.md)

docs/source/backends-qualcomm.md

Lines changed: 0 additions & 5 deletions
@@ -351,11 +351,6 @@ The command-line arguments are written in [utils.py](https://github.com/pytorch/
 The model, inputs, and output location are passed to `qnn_executorch_runner` by `--model_path`, `--input_list_path`, and `--output_folder_path`.
 
 
-### Running a model via ExecuTorch's android demo-app
-
-An Android demo-app using Qualcomm AI Engine Direct Backend can be found in
-`examples`. Please refer to android demo app [tutorial](demo-apps-android.md).
-
 ## Supported model list
 
 Please refer to `$EXECUTORCH_ROOT/examples/qualcomm/scripts/` and `EXECUTORCH_ROOT/examples/qualcomm/oss_scripts/` to the list of supported models.

docs/source/index.md

Lines changed: 287 additions & 0 deletions
@@ -0,0 +1,287 @@
+(home)=
+# Welcome to the ExecuTorch Documentation
+
+**ExecuTorch** is PyTorch's solution to training and inference on the
+Edge.
+
+## Key Value Propositions
+
+- **Portability:** Compatibility with a wide variety of computing
+  platforms, from high-end mobile phones to highly constrained
+  embedded systems and microcontrollers.
+- **Productivity:** Enabling developers to use the same toolchains and
+  Developer Tools from PyTorch model authoring and conversion, to
+  debugging and deployment to a wide variety of platforms.
+- **Performance:** Providing end users with a seamless and
+  high-performance experience due to a lightweight runtime and
+  utilizing full hardware capabilities such as CPUs, NPUs, and DSPs.
+
+ExecuTorch provides support for:
+
+* **Strong Model Support** LLMs (Large Language Models),
+  CV (Computer Vision), ASR (Automatic Speech Recognition), TTS (Text To Speech)
+* **All Major Platforms** Android, Mac, Linux, Windows
+* **Rich Acceleration Support** Apple, Arm, Cadence, MediaTek,
+  Qualcomm, Vulkan, XNNPACK
+
+### Documentation Navigation
+#### Introduction
+- [Overview](intro-overview)
+- [How it Works](intro-how-it-works)
+- [Getting Started with Architecture](getting-started-architecture)
+- [Concepts](concepts)
+#### Usage
+- [Getting Started](getting-started)
+- [Using Executorch Export](using-executorch-export)
+- [Using Executorch on Android](using-executorch-android)
+- [Using Executorch on iOS](using-executorch-ios)
+- [Using Executorch with C++](using-executorch-cpp)
+- [Runtime Integration](using-executorch-runtime-integration)
+- [Troubleshooting](using-executorch-troubleshooting)
+- [Building from Source](using-executorch-building-from-source)
+- [FAQs](using-executorch-faqs)
+#### Examples
+- [Android Demo Apps](https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app)
+- [iOS Demo Apps](demo-apps-ios.md)
+#### Backends
+- [Overview](backends-overview)
+- [XNNPACK](backends-xnnpack)
+- [Core ML](backends-coreml)
+- [MPS](backends-mps)
+- [Vulkan](backends-vulkan)
+- [ARM Ethos-U](backends-arm-ethos-u)
+- [Qualcomm](backends-qualcomm)
+- [MediaTek](backends-mediatek)
+- [Cadence](backends-cadence)
+#### Developer Tools
+- [Overview](devtools-overview)
+- [Bundled IO](bundled-io)
+- [ETRecord](etrecord)
+- [ETDump](etdump)
+- [Runtime Profiling](runtime-profiling)
+- [Model Debugging](model-debugging)
+- [Model Inspector](model-inspector)
+- [Memory Planning Inspection](memory-planning-inspection)
+- [Delegate Debugging](delegate-debugging)
+- [Tutorial](devtools-tutorial)
+#### Runtime
+- [Overview](runtime-overview)
+- [Extension Module](extension-module)
+- [Extension Tensor](extension-tensor)
+- [Running a Model (C++ Tutorial)](running-a-model-cpp-tutorial)
+- [Backend Delegate Implementation and Linking](runtime-backend-delegate-implementation-and-linking)
+- [Platform Abstraction Layer](runtime-platform-abstraction-layer)
+#### Portable C++ Programming
+- [PTE File Format](pte-file-format)
+#### API Reference
+- [Export to Executorch API Reference](export-to-executorch-api-reference)
+- [Executorch Runtime API Reference](executorch-runtime-api-reference)
+- [Runtime Python API Reference](runtime-python-api-reference)
+- [API Life Cycle](api-life-cycle)
+- [Javadoc](https://pytorch.org/executorch/main/javadoc/)
+#### Quantization
+- [Overview](quantization-overview)
+#### Kernel Library
+- [Overview](kernel-library-overview)
+- [Custom ATen Kernel](kernel-library-custom-aten-kernel)
+- [Selective Build](kernel-library-selective-build)
+#### Working with LLMs
+- [Llama](llm/llama)
+- [Llama on Android](llm/llama-demo-android)
+- [Llama on iOS](llm/llama-demo-ios)
+- [Llama on Android via Qualcomm backend](llm/build-run-llama3-qualcomm-ai-engine-direct-backend)
+- [Intro to LLMs in Executorch](llm/getting-started)
+#### Backend Development
+- [Delegates Integration](backend-delegates-integration)
+- [XNNPACK Reference](backend-delegates-xnnpack-reference)
+- [Dependencies](backend-delegates-dependencies)
+- [Compiler Delegate and Partitioner](compiler-delegate-and-partitioner)
+- [Debug Backend Delegate](debug-backend-delegate)
+#### IR Specification
+- [EXIR](ir-exir)
+- [Ops Set Definition](ir-ops-set-definition)
+#### Compiler Entry Points
+- [Backend Dialect](compiler-backend-dialect)
+- [Custom Compiler Passes](compiler-custom-compiler-passes)
+- [Memory Planning](compiler-memory-planning)
+#### Contributing
+- [Contributing](contributing)
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Introduction
+:hidden:
+
+intro-overview
+intro-how-it-works
+getting-started-architecture
+concepts
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Usage
+:hidden:
+
+getting-started
+using-executorch-export
+using-executorch-android
+using-executorch-ios
+using-executorch-cpp
+using-executorch-runtime-integration
+using-executorch-troubleshooting
+using-executorch-building-from-source
+using-executorch-faqs
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Examples
+:hidden:
+
+Building an ExecuTorch Android Demo App <https://github.com/pytorch-labs/executorch-examples/tree/main/dl3/android/DeepLabV3Demo#executorch-android-demo-app>
+demo-apps-ios.md
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Backends
+:hidden:
+
+backends-overview
+backends-xnnpack
+backends-coreml
+backends-mps
+backends-vulkan
+backends-arm-ethos-u
+backends-qualcomm
+backends-mediatek
+backends-cadence
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Developer Tools
+:hidden:
+
+devtools-overview
+bundled-io
+etrecord
+etdump
+runtime-profiling
+model-debugging
+model-inspector
+memory-planning-inspection
+delegate-debugging
+devtools-tutorial
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Runtime
+:hidden:
+
+runtime-overview
+extension-module
+extension-tensor
+running-a-model-cpp-tutorial
+runtime-backend-delegate-implementation-and-linking
+runtime-platform-abstraction-layer
+portable-cpp-programming
+pte-file-format
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: API Reference
+:hidden:
+
+export-to-executorch-api-reference
+executorch-runtime-api-reference
+runtime-python-api-reference
+api-life-cycle
+Javadoc <https://pytorch.org/executorch/main/javadoc/>
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Quantization
+:hidden:
+
+quantization-overview
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Kernel Library
+:hidden:
+
+kernel-library-overview
+kernel-library-custom-aten-kernel
+kernel-library-selective-build
+```
+
+```{toctree}
+:glob:
+:maxdepth: 2
+:caption: Working with LLMs
+:hidden:
+
+Llama <llm/llama>
+Llama on Android <llm/llama-demo-android>
+Llama on iOS <llm/llama-demo-ios>
+Llama on Android via Qualcomm backend <llm/build-run-llama3-qualcomm-ai-engine-direct-backend>
+Intro to LLMs in Executorch <llm/getting-started>
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Backend Development
+:hidden:
+
+backend-delegates-integration
+backend-delegates-xnnpack-reference
+backend-delegates-dependencies
+compiler-delegate-and-partitioner
+debug-backend-delegate
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: IR Specification
+:hidden:
+
+ir-exir
+ir-ops-set-definition
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Compiler Entry Points
+:hidden:
+
+compiler-backend-dialect
+compiler-custom-compiler-passes
+compiler-memory-planning
+```
+
+```{toctree}
+:glob:
+:maxdepth: 1
+:caption: Contributing
+:hidden:
+
+contributing
+```
