
Commit eae061b

Author: Svetlana Karslioglu
Merge branch 'master' into maskedtensor_tutorial_1
2 parents 0aee70f + 7d8cb43, commit eae061b


56 files changed: +1547 -102 lines

.jenkins/validate_tutorials_built.py (-3)

@@ -22,10 +22,7 @@
     "former_torchies/tensor_tutorial_old",
     "examples_autograd/polynomial_autograd",
     "examples_autograd/polynomial_custom_function",
-    "forward_ad_usage",
     "parametrizations",
-    "reinforcement_q_learning",
-    "text_to_speech_with_torchaudio",
     "mnist_train_nas", # used by ax_multiobjective_nas_tutorial.py
     "fx_conv_bn_fuser",
     "super_resolution_with_onnxruntime",

_static/css/custom.css (+73, new file)

@@ -0,0 +1,73 @@
/* sphinx-design styles for cards/tabs
*/

:root {
    --sd-color-info: #ee4c2c;
    --sd-color-primary: #6c6c6d;
    --sd-color-primary-highlight: #f3f4f7;
    --sd-color-card-border-hover: #ee4c2c;
    --sd-color-card-border: #f3f4f7;
    --sd-color-card-background: #fff;
    --sd-color-card-text: inherit;
    --sd-color-card-header: transparent;
    --sd-color-card-footer: transparent;
    --sd-color-tabs-label-active: hsla(231, 99%, 66%, 1);
    --sd-color-tabs-label-hover: hsla(231, 99%, 66%, 1);
    --sd-color-tabs-label-inactive: hsl(0, 0%, 66%);
    --sd-color-tabs-underline-active: hsla(231, 99%, 66%, 1);
    --sd-color-tabs-underline-hover: rgba(178, 206, 245, 0.62);
    --sd-color-tabs-underline-inactive: transparent;
    --sd-color-tabs-overline: rgb(222, 222, 222);
    --sd-color-tabs-underline: rgb(222, 222, 222);
}

.sd-text-info {
    color: #ee4c2c;
}


.sd-card {
    position: relative;
    background-color: #fff;
    opacity: 1.0;
    border-radius: 0px;
    width: 30%;
    border: none;
    padding-bottom: 0px;
}


.sd-card-img {
    opacity: 0.5;
    width: 200px;
    padding: 0px;
}

.sd-card-img:hover {
    opacity: 1.0;
    background-color: #f3f4f7;
}


.sd-card:after {
    display: block;
    opacity: 1;
    content: '';
    border-bottom: solid 1px #ee4c2c;
    background-color: #fff;
    transform: scaleX(0);
    transition: transform .250s ease-in-out;
    transform-origin: 0% 50%;
}

.sd-card:hover {
    background-color: #fff;
    opacity: 1;
    border-top: 1px solid #f3f4f7;
    border-left: 1px solid #f3f4f7;
    border-right: 1px solid #f3f4f7;
}

.sd-card:hover:after {
    transform: scaleX(1);
}

24 binary image files changed (sizes from 80.9 KB to 2.35 MB; previews not shown in this view).

advanced_source/ddp_pipeline.py (+1 -1)

@@ -52,7 +52,7 @@ def __init__(self, d_model, dropout=0.1, max_len=5000):
         pe[:, 0::2] = torch.sin(position * div_term)
         pe[:, 1::2] = torch.cos(position * div_term)
         pe = pe.unsqueeze(0).transpose(0, 1)
-        self.register_buffer('pe', pe)
+        self.pe = nn.Parameter(pe, requires_grad=False)

     def forward(self, x):
         x = x + self.pe[:x.size(0), :]
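
This change swaps a registered buffer for a frozen nn.Parameter. For context, a minimal sketch (not taken from this commit) of how the two approaches differ: a buffer is saved in the module's state_dict but excluded from parameters(), while a Parameter created with requires_grad=False is listed in parameters() yet never receives gradients; both move with the module on .to(device):

    import torch
    from torch import nn

    class WithBuffer(nn.Module):
        def __init__(self):
            super().__init__()
            # Saved in state_dict, excluded from parameters()
            self.register_buffer('pe', torch.zeros(5, 1, 4))

    class WithFrozenParam(nn.Module):
        def __init__(self):
            super().__init__()
            # Listed in parameters(), but requires_grad=False keeps it fixed
            self.pe = nn.Parameter(torch.zeros(5, 1, 4), requires_grad=False)

    print(len(list(WithBuffer().parameters())))       # 0
    print(len(list(WithFrozenParam().parameters())))  # 1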

beginner_source/basics/autogradqs_tutorial.py (+5 -5)

@@ -203,14 +203,14 @@
 # compute the product:
 #

-inp = torch.eye(5, requires_grad=True)
-out = (inp+1).pow(2)
-out.backward(torch.ones_like(inp), retain_graph=True)
+inp = torch.eye(4, 5, requires_grad=True)
+out = (inp+1).pow(2).t()
+out.backward(torch.ones_like(out), retain_graph=True)
 print(f"First call\n{inp.grad}")
-out.backward(torch.ones_like(inp), retain_graph=True)
+out.backward(torch.ones_like(out), retain_graph=True)
 print(f"\nSecond call\n{inp.grad}")
 inp.grad.zero_()
-out.backward(torch.ones_like(inp), retain_graph=True)
+out.backward(torch.ones_like(out), retain_graph=True)
 print(f"\nCall after zeroing gradients\n{inp.grad}")
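
The seed changes from torch.ones_like(inp) to torch.ones_like(out) because the gradient argument to backward() must have the same shape as the tensor backward() is called on; with a 4x5 input and a transposed output the two shapes no longer coincide. A standalone sketch of the updated computation:

    import torch

    inp = torch.eye(4, 5, requires_grad=True)  # 4x5 input
    out = (inp + 1).pow(2).t()                 # 5x4 output
    out.backward(torch.ones_like(out))         # vector-Jacobian product; seed matches out's shape
    print(inp.grad.shape)                      # torch.Size([4, 5])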

beginner_source/basics/buildmodel_tutorial.py (+1 -1)

@@ -77,7 +77,7 @@ def forward(self, x):
 # along with some `background operations <https://github.com/pytorch/pytorch/blob/270111b7b611d174967ed204776985cefca9c144/torch/nn/modules/module.py#L866>`_.
 # Do not call ``model.forward()`` directly!
 #
-# Calling the model on the input returns a 10-dimensional tensor with raw predicted values for each class.
+# Calling the model on the input returns a 2-dimensional tensor with dim=0 corresponding to each output of 10 raw predicted values for each class, and dim=1 corresponding to the individual values of each output.
 # We get the prediction probabilities by passing it through an instance of the ``nn.Softmax`` module.

 X = torch.rand(1, 28, 28, device=device)
beginner_source/basics/optimization_tutorial.py (+1 -1)

@@ -28,7 +28,7 @@
 from torch import nn
 from torch.utils.data import DataLoader
 from torchvision import datasets
-from torchvision.transforms import ToTensor, Lambda
+from torchvision.transforms import ToTensor

 training_data = datasets.FashionMNIST(
     root="data",

beginner_source/basics/quickstart_tutorial.py (+1 -1)

@@ -93,7 +93,7 @@
 # Define model
 class NeuralNetwork(nn.Module):
     def __init__(self):
-        super(NeuralNetwork, self).__init__()
+        super().__init__()
         self.flatten = nn.Flatten()
         self.linear_relu_stack = nn.Sequential(
             nn.Linear(28*28, 512),
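
The only change here is the zero-argument form of super(), which in Python 3 is equivalent to the explicit call inside a class body. A tiny sketch of the equivalence, not tied to this file:

    class Base:
        def __init__(self):
            self.ready = True

    class Child(Base):
        def __init__(self):
            super().__init__()               # Python 3 zero-argument form
            # super(Child, self).__init__()  # equivalent explicit form

    print(Child().ready)  # True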

beginner_source/blitz/cifar10_tutorial.py (+2 -2)

@@ -105,7 +105,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))

@@ -210,7 +210,7 @@ def forward(self, x):
 # Okay, first step. Let us display an image from the test set to get familiar.

 dataiter = iter(testloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # print images
 imshow(torchvision.utils.make_grid(images))
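
The .next() method is the Python 2 spelling of the iterator protocol; Python 3 iterators implement __next__ and are advanced with the builtin next(), which is why the DataLoader iterator calls are updated. A minimal illustration, not tied to DataLoader:

    it = iter([("image_batch", "label_batch")])
    images, labels = next(it)  # builtin next() works on any iterator
    print(images, labels)
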
New file (+212)

@@ -0,0 +1,212 @@

`Introduction <ddp_series_intro.html>`__ \|\| `What is DDP <ddp_series_theory.html>`__ \|\| `Single-Node
Multi-GPU Training <ddp_series_multigpu.html>`__ \|\| **Fault
Tolerance** \|\| `Multi-Node
training <../intermediate/ddp_series_multinode.html>`__ \|\| `minGPT Training <../intermediate/ddp_series_minGPT.html>`__


Fault-tolerant Distributed Training with ``torchrun``
=====================================================

Authors: `Suraj Subramanian <https://github.com/suraj813>`__

.. grid:: 2

   .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn
      :margin: 0

      - Launching multi-GPU training jobs with ``torchrun``
      - Saving and loading snapshots of your training job
      - Structuring your training script for graceful restarts

      .. grid:: 1

         .. grid-item::

            :octicon:`code-square;1.0em;` View the code used in this tutorial on `GitHub <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py>`__

   .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
      :margin: 0

      * High-level `overview <ddp_series_theory.html>`__ of DDP
      * Familiarity with `DDP code <ddp_series_multigpu.html>`__
      * A machine with multiple GPUs (this tutorial uses an AWS p3.8xlarge instance)
      * PyTorch `installed <https://pytorch.org/get-started/locally/>`__ with CUDA

Follow along with the video below or on `youtube <https://www.youtube.com/watch/9kIvQOiwYzg>`__.

.. raw:: html

   <div style="margin-top:10px; margin-bottom:10px;">
     <iframe width="560" height="315" src="https://www.youtube.com/embed/9kIvQOiwYzg" frameborder="0" allow="accelerometer; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
   </div>

In distributed training, a single process failure can
disrupt the entire training job. Since the susceptibility to failure can be higher here, making your training
script robust is particularly important. You might also prefer your training job to be *elastic*, i.e., able to
continue as compute resources join and leave the job dynamically.

PyTorch offers a utility called ``torchrun`` that provides fault-tolerance and
elastic training. When a failure occurs, ``torchrun`` logs the errors and
attempts to automatically restart all the processes from the last saved
“snapshot” of the training job.

The snapshot saves more than just the model state; it can include
details about the number of epochs run, optimizer states or any other
stateful attribute of the training job necessary for its continuity.

Why use ``torchrun``
~~~~~~~~~~~~~~~~~~~~

``torchrun`` handles the minutiae of distributed training so that you
don't need to. For instance,

- You don't need to set environment variables or explicitly pass the ``rank`` and ``world_size``; ``torchrun`` assigns these along with several other `environment variables <https://pytorch.org/docs/stable/elastic/run.html#environment-variables>`__ (see the sketch after this list).
- No need to call ``mp.spawn`` in your script; you only need a generic ``main()`` entrypoint, and launch the script with ``torchrun``. This way the same script can be run in non-distributed as well as single-node and multinode setups.
- Gracefully restarting training from the last saved training snapshot.
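
A minimal sketch (not part of the tutorial's example code) of reading the variables that ``torchrun`` sets for each worker process:

.. code:: python

   import os

   rank = int(os.environ["RANK"])              # global rank of this process
   local_rank = int(os.environ["LOCAL_RANK"])  # rank on this node; typically used to pick a GPU
   world_size = int(os.environ["WORLD_SIZE"])  # total number of processes in the job
   print(f"worker {rank}/{world_size} using local GPU {local_rank}")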

Graceful restarts
~~~~~~~~~~~~~~~~~

For graceful restarts, you should structure your training script like:

.. code:: python

   def main():
       load_snapshot(snapshot_path)
       initialize()
       train()

   def train():
       for batch in iter(dataset):
           train_step(batch)

           if should_checkpoint:
               save_snapshot(snapshot_path)

If a failure occurs, ``torchrun`` will terminate all the processes and restart them.
Each process entrypoint first loads and initializes the last saved snapshot, and continues training from there.
So at any failure, you only lose the training progress from the last saved snapshot.

In elastic training, whenever there are any membership changes (adding or removing nodes), ``torchrun`` will terminate and spawn processes
on available devices. Having this structure ensures your training job can continue without manual intervention.


Diff for `multigpu.py <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu.py>`__ vs. `multigpu_torchrun.py <https://github.com/pytorch/examples/blob/main/distributed/ddp-tutorial-series/multigpu_torchrun.py>`__
-----------------------------------------------------------

Process group initialization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- ``torchrun`` assigns ``RANK`` and ``WORLD_SIZE`` automatically,
  amongst `other env
  variables <https://pytorch.org/docs/stable/elastic/run.html#environment-variables>`__

.. code:: diff

   - def ddp_setup(rank, world_size):
   + def ddp_setup():
   -     """
   -     Args:
   -         rank: Unique identifier of each process
   -         world_size: Total number of processes
   -     """
   -     os.environ["MASTER_ADDR"] = "localhost"
   -     os.environ["MASTER_PORT"] = "12355"
   -     init_process_group(backend="nccl", rank=rank, world_size=world_size)
   +     init_process_group(backend="nccl")


Use Torchrun-provided env variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code:: diff

   - self.gpu_id = gpu_id
   + self.gpu_id = int(os.environ["LOCAL_RANK"])

Saving and loading snapshots
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Regularly storing all the relevant information in snapshots allows our
training job to seamlessly resume after an interruption.

.. code:: diff

   + def _save_snapshot(self, epoch):
   +     snapshot = {}
   +     snapshot["MODEL_STATE"] = self.model.module.state_dict()
   +     snapshot["EPOCHS_RUN"] = epoch
   +     torch.save(snapshot, "snapshot.pt")
   +     print(f"Epoch {epoch} | Training snapshot saved at snapshot.pt")

   + def _load_snapshot(self, snapshot_path):
   +     snapshot = torch.load(snapshot_path)
   +     self.model.load_state_dict(snapshot["MODEL_STATE"])
   +     self.epochs_run = snapshot["EPOCHS_RUN"]
   +     print(f"Resuming training from snapshot at Epoch {self.epochs_run}")


Loading a snapshot in the Trainer constructor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

When restarting an interrupted training job, your script will first try
to load a snapshot to resume training from.

.. code:: diff

     class Trainer:
         def __init__(self, snapshot_path, ...):
             ...
   +         if os.path.exists(snapshot_path):
   +             self._load_snapshot(snapshot_path)
             ...


Resuming training
~~~~~~~~~~~~~~~~~

Training can resume from the last epoch run, instead of starting all
over from scratch.

.. code:: diff

     def train(self, max_epochs: int):
   -     for epoch in range(max_epochs):
   +     for epoch in range(self.epochs_run, max_epochs):
             self._run_epoch(epoch)


Running the script
~~~~~~~~~~~~~~~~~~

Simply call your entrypoint function as you would for a non-multiprocessing script; ``torchrun`` automatically
spawns the processes.

.. code:: diff

     if __name__ == "__main__":
         import sys
         total_epochs = int(sys.argv[1])
         save_every = int(sys.argv[2])
   -     world_size = torch.cuda.device_count()
   -     mp.spawn(main, args=(world_size, total_epochs, save_every,), nprocs=world_size)
   +     main(save_every, total_epochs)

.. code:: diff

   - python multigpu.py 50 10
   + torchrun --standalone --nproc_per_node=4 multigpu_torchrun.py 50 10

Further Reading
---------------

- `Multi-Node training with DDP <../intermediate/ddp_series_multinode.html>`__ (next tutorial in this series)
- `Multi-GPU Training with DDP <ddp_series_multigpu.html>`__ (previous tutorial in this series)
- `torchrun <https://pytorch.org/docs/stable/elastic/run.html>`__
- `Torchrun launch
  options <https://github.com/pytorch/pytorch/blob/bbe803cb35948df77b46a2d38372910c96693dcd/torch/distributed/run.py#L401>`__
- `Migrating from torch.distributed.launch to
  torchrun <https://pytorch.org/docs/stable/elastic/train_script.html#elastic-train-script>`__
