`Introduction <ddp_series_intro.html>`__ \|\| `What is DDP <ddp_series_theory.html>`__ \|\| `Single-Node
Multi-GPU Training <ddp_series_multigpu.html>`__ \|\| **Fault
Tolerance** \|\| `Multi-Node
training <../intermediate/ddp_series_multinode.html>`__ \|\| `minGPT Training <../intermediate/ddp_series_minGPT.html>`__

Fault-tolerant Distributed Training with ``torchrun``
======================================================

Authors: `Suraj Subramanian <https://github.com/suraj813>`__

.. grid:: 2

   .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
      :margin: 0

      - Launching multi-GPU training jobs with ``torchrun``
      - Saving and loading snapshots of your training job
      - Structuring your training script for graceful restarts

      .. grid:: 1

         .. grid-item::

            :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py>`__

   .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
      :margin: 0

      * High-level `overview <ddp_series_theory.html>`__ of DDP
      * Familiarity with `DDP code <ddp_series_multigpu.html>`__
      * A machine with multiple GPUs (this tutorial uses an AWS p3.8xlarge instance)
      * PyTorch `installed <https://pytorch.org/get-started/locally/>`__ with CUDA

Follow along with the video below or on `youtube <https://www.youtube.com/watch/9kIvQOiwYzg>`__.

.. raw:: html

   <div style="margin-top:10px; margin-bottom:10px;">
     <iframe width="560" height="315" src="https://www.youtube.com/embed/9kIvQOiwYzg" frameborder="0" allow="accelerometer; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
   </div>

In distributed training, a single process failure can disrupt the entire
training job. Because the likelihood of failure is higher when many processes
and machines are involved, making your training script robust to failures is
particularly important. You might also prefer your training job to be
*elastic*, i.e., able to continue as compute resources join or leave the job
dynamically over its course.

PyTorch offers a utility called ``torchrun`` that provides fault-tolerance and
elastic training. When a failure occurs, ``torchrun`` logs the errors and
attempts to automatically restart all the processes from the last saved
“snapshot” of the training job.

The snapshot saves more than just the model state; it can include
details about the number of epochs run, optimizer states or any other
stateful attribute of the training job necessary for its continuity.

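For example, a snapshot can be an ordinary dictionary written with ``torch.save``. The helper below is a sketch, not part of the tutorial's code: the ``MODEL_STATE`` and ``EPOCHS_RUN`` keys match the diffs shown later in this tutorial, while ``OPTIMIZER_STATE`` (and the ``capture_snapshot`` name itself) is an illustrative addition.

.. code:: python

   import torch

   def capture_snapshot(model: torch.nn.Module,
                        optimizer: torch.optim.Optimizer,
                        epoch: int,
                        path: str = "snapshot.pt") -> None:
       # Everything needed to pick up training where it left off.
       snapshot = {
           "MODEL_STATE": model.state_dict(),          # learned weights
           "OPTIMIZER_STATE": optimizer.state_dict(),  # e.g. Adam moment estimates
           "EPOCHS_RUN": epoch,                        # how many epochs have finished
       }
       torch.save(snapshot, path)
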
Why use ``torchrun``
~~~~~~~~~~~~~~~~~~~~~

``torchrun`` handles the minutiae of distributed training so that you
don't need to. For instance:

- You don't need to set environment variables or explicitly pass the ``rank`` and ``world_size``; ``torchrun`` assigns these, along with several other `environment variables <https://pytorch.org/docs/stable/elastic/run.html#environment-variables>`__ (see the sketch after this list).
- No need to call ``mp.spawn`` in your script; you only need a generic ``main()`` entrypoint and launch the script with ``torchrun``. This way, the same script can be run in non-distributed, single-node, and multi-node setups.
- Gracefully restarting training from the last saved training snapshot.

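As a quick illustration of the first two points, a script launched with ``torchrun`` can simply read the environment it is given, and fall back to defaults when launched as a plain Python process. This standalone snippet is a sketch, not part of the tutorial's code:

.. code:: python

   import os

   # torchrun sets RANK and WORLD_SIZE for every process it launches;
   # a plain `python script.py` run does not. Falling back to defaults
   # lets the same entrypoint work in both cases.
   rank = int(os.environ.get("RANK", 0))
   world_size = int(os.environ.get("WORLD_SIZE", 1))
   print(f"rank {rank} of {world_size} "
         f"({'distributed' if world_size > 1 else 'single-process'} run)")
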
Graceful restarts
~~~~~~~~~~~~~~~~~~~

For graceful restarts, you should structure your training script like:

.. code:: python

   def main():
       load_snapshot(snapshot_path)
       initialize()
       train()

   def train():
       for batch in iter(dataset):
           train_step(batch)

           if should_checkpoint:
               save_snapshot(snapshot_path)

If a failure occurs, ``torchrun`` will terminate all the processes and restart them.
Each process entrypoint first loads and initializes the last saved snapshot, and continues training from there.
So at any failure, you only lose the training progress from the last saved snapshot.

In elastic training, whenever there are any membership changes (adding or removing nodes), ``torchrun`` will terminate and spawn processes
on available devices. Having this structure ensures your training job can continue without manual intervention.

Diff: ``multigpu.py`` vs. ``multigpu_torchrun.py``
----------------------------------------------------

The sections below walk through the changes from
`multigpu.py <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py>`__
(used in the previous tutorial) to
`multigpu_torchrun.py <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py>`__.

Process group initialization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- ``torchrun`` assigns ``RANK`` and ``WORLD_SIZE`` automatically, among `other environment variables <https://pytorch.org/docs/stable/elastic/run.html#environment-variables>`__.

.. code:: diff

   - def ddp_setup(rank, world_size):
   + def ddp_setup():
   -     """
   -     Args:
   -         rank: Unique identifier of each process
   -         world_size: Total number of processes
   -     """
   -     os.environ["MASTER_ADDR"] = "localhost"
   -     os.environ["MASTER_PORT"] = "12355"
   -     init_process_group(backend="nccl", rank=rank, world_size=world_size)
   +     init_process_group(backend="nccl")

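One way to flesh out the trimmed-down ``ddp_setup`` is to also pin each process to the GPU matching its ``LOCAL_RANK`` before creating the process group, so that subsequent collectives and ``.to(device)`` calls land on the right device. The sketch below does that; check the linked example script for the exact version used in this series.

.. code:: python

   import os

   import torch
   from torch.distributed import init_process_group

   def ddp_setup():
       # Pin this process to its own GPU so NCCL collectives and later
       # .to(gpu_id) calls use the right device.
       torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
       # rank and world_size are picked up from the env variables set by torchrun.
       init_process_group(backend="nccl")
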
Use Torchrun-provided env variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code:: diff

   - self.gpu_id = gpu_id
   + self.gpu_id = int(os.environ["LOCAL_RANK"])

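``LOCAL_RANK`` is used here rather than ``RANK`` because it is the process index on the current node, and therefore a valid GPU index on that machine; ``RANK`` is the globally unique index across all nodes. On a single machine the two coincide. A sketch of the distinction (variable names are illustrative, not from the tutorial code):

.. code:: python

   import os

   # Both variables are set by torchrun:
   local_rank = int(os.environ["LOCAL_RANK"])  # index on this node -> safe to use as a CUDA device id
   global_rank = int(os.environ["RANK"])       # unique across all nodes -> handy for rank-0-only logic
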
Saving and loading snapshots
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Regularly storing all the relevant information in snapshots allows our
training job to seamlessly resume after an interruption.

.. code:: diff

   + def _save_snapshot(self, epoch):
   +     snapshot = {}
   +     snapshot["MODEL_STATE"] = self.model.module.state_dict()
   +     snapshot["EPOCHS_RUN"] = epoch
   +     torch.save(snapshot, "snapshot.pt")
   +     print(f"Epoch {epoch} | Training snapshot saved at snapshot.pt")

   + def _load_snapshot(self, snapshot_path):
   +     snapshot = torch.load(snapshot_path)
   +     self.model.load_state_dict(snapshot["MODEL_STATE"])
   +     self.epochs_run = snapshot["EPOCHS_RUN"]
   +     print(f"Resuming training from snapshot at Epoch {self.epochs_run}")

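When several processes on one machine read the same snapshot file, it can help to map the saved tensors onto each process's own device instead of the device they were saved from. A sketch of ``_load_snapshot`` with an explicit ``map_location`` (an optional refinement, not required by the diff above):

.. code:: python

   import torch

   class Trainer:
       # ...rest of the class as in the diffs in this tutorial...

       def _load_snapshot(self, snapshot_path):
           # Deserialize directly onto this process's GPU rather than onto
           # the GPU the snapshot was written from (typically rank 0's device).
           loc = f"cuda:{self.gpu_id}"
           snapshot = torch.load(snapshot_path, map_location=loc)
           self.model.load_state_dict(snapshot["MODEL_STATE"])
           self.epochs_run = snapshot["EPOCHS_RUN"]
           print(f"Resuming training from snapshot at Epoch {self.epochs_run}")
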
Loading a snapshot in the Trainer constructor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When restarting an interrupted training job, your script will first try
to load a snapshot to resume training from.

.. code:: diff

   class Trainer:
       def __init__(self, snapshot_path, ...):
           ...
   +       if os.path.exists(snapshot_path):
   +           self._load_snapshot(snapshot_path)
           ...

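Note that ``self.epochs_run`` needs a sensible default for the very first run, when no snapshot file exists yet. A sketch of a constructor that covers both the fresh-start and the resume case (the ``model`` argument and exact attribute layout here are illustrative):

.. code:: python

   import os

   import torch

   class Trainer:
       def __init__(self, model: torch.nn.Module, snapshot_path: str):
           self.gpu_id = int(os.environ["LOCAL_RANK"])
           self.model = model.to(self.gpu_id)
           self.snapshot_path = snapshot_path
           self.epochs_run = 0  # fresh run: start from epoch 0
           if os.path.exists(snapshot_path):
               print("Loading snapshot")
               self._load_snapshot(snapshot_path)  # sets self.epochs_run from the snapshot
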
Resuming training
~~~~~~~~~~~~~~~~~~

Training can resume from the last epoch run, instead of starting all
over from scratch.

.. code:: diff

   def train(self, max_epochs: int):
   -    for epoch in range(max_epochs):
   +    for epoch in range(self.epochs_run, max_epochs):
            self._run_epoch(epoch)

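How much work is lost on a failure depends on how often snapshots are written. Below is a sketch of a training loop that checkpoints every ``save_every`` epochs from one rank only, assuming the Trainer stores the ``save_every`` command-line argument as ``self.save_every``; the exact condition in the example script may differ.

.. code:: python

   class Trainer:
       # ...initialization and snapshot helpers as sketched above...

       def train(self, max_epochs: int):
           for epoch in range(self.epochs_run, max_epochs):
               self._run_epoch(epoch)
               # One writer is enough: all ranks hold identical weights under DDP.
               if self.gpu_id == 0 and epoch % self.save_every == 0:
                   self._save_snapshot(epoch)
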
Running the script
~~~~~~~~~~~~~~~~~~~

Simply call your entrypoint function as you would for a non-multiprocessing script; ``torchrun`` automatically
spawns the processes.

.. code:: diff

   if __name__ == "__main__":
       import sys
       total_epochs = int(sys.argv[1])
       save_every = int(sys.argv[2])
   -   world_size = torch.cuda.device_count()
   -   mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)
   +   main(save_every, total_epochs)

.. code:: diff

   - python multigpu.py 50 10
   + torchrun --standalone --nproc_per_node=4 multigpu_torchrun.py 50 10

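``torchrun`` also exposes launcher-level options related to fault tolerance; for example, ``--max_restarts`` bounds how many times the worker group is restarted after failures. See the torchrun launch options linked below for the full list; the values here are illustrative.

.. code:: bash

   # Retry the worker group up to 3 times before giving up on the job.
   torchrun --standalone --nproc_per_node=4 --max_restarts=3 multigpu_torchrun.py 50 10
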
Further Reading
----------------

- `Multi-Node training with DDP <../intermediate/ddp_series_multinode.html>`__ (next tutorial in this series)
- `Multi-GPU Training with DDP <ddp_series_multigpu.html>`__ (previous tutorial in this series)
- `torchrun <https://pytorch.org/docs/stable/elastic/run.html>`__
- `Torchrun launch options <https://github.com/pytorch/pytorch/blob/bbe803cb35948df77b46a2d38372910c96693dcd/torch/distributed/run.py#L401>`__
- `Migrating from torch.distributed.launch to torchrun <https://pytorch.org/docs/stable/elastic/train_script.html#elastic-train-script>`__