Skip to content

Commit 36e1ed1

Browse files
H-Huang authored and facebook-github-bot committed
Get PT-D benchmark to support being run internally
Summary: Pytorch Distributed benchmarks have a hard dependency on `submitit` which is not supported internally (we don't use slurm). It outputs this error: `ModuleNotFoundError: No module named 'submitit'` These changes remove that hard dependency and add a scheduler option and allow it to be run internally and sets up the code for future work. The scheduler defaults to slurm so there is no change, but this will eventually be updated to default to local. Reviewed By: davidberard98 Differential Revision: D47682027 fbshipit-source-id: 9197e2ecb58d507236707b0be7a6bb2dea496269
1 parent 1dada69 commit 36e1ed1

File tree

2 files changed

+40
-12
lines changed

2 files changed

+40
-12
lines changed

torchbenchmark/util/distributed/submit.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,28 @@
11
import argparse
22
import importlib
33
import os
4-
import submitit
54
import sys
65
import torch
76
import uuid
87

98
from pathlib import Path
109
from typing import List
1110

11+
try:
12+
import submitit
13+
except ImportError:
14+
submitit = None
1215

1316
def parse_args(args: List[str]=None):
14-
parser = argparse.ArgumentParser(description='Submitit for PyTorch Distributed Benchmark', add_help=False)
17+
parser = argparse.ArgumentParser(description='PyTorch Distributed Benchmark', add_help=False)
18+
19+
parser.add_argument(
20+
"--scheduler",
21+
default="slurm",
22+
type=str,
23+
choices=["local", "slurm"],
24+
help="Where to launch the job on a specific infrastructure"
25+
)
1526

1627
parser.add_argument(
1728
"--ngpus",

userbenchmark/distributed/run.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
from typing import List
2-
import submitit
32
import torch
43
from torchbenchmark.util.distributed.submit import parse_args, get_init_file, TrainerWrapper
54
from ..utils import dump_output
65

76
BM_NAME = "distributed"
87

98
def gen_metrics_from_result(result):
10-
assert isinstance(result, List), "The result of submitit should be a list."
9+
assert isinstance(result, List), "The result should be a list."
1110
metrics = {}
1211
for result_id, r in enumerate(result):
1312
for metric_name in r:
@@ -17,6 +16,31 @@ def gen_metrics_from_result(result):
1716
def run(args: List[str]):
1817
args, model_args = parse_args(args)
1918

19+
if args.scheduler == "slurm":
20+
result = slurm_run(args, model_args)
21+
elif args.scheduler == "local":
22+
result = local_run(args, model_args)
23+
else:
24+
raise ValueError(f"Unsupported scheduler: {args.scheduler}")
25+
26+
version = torch.version.git_version if hasattr(torch.version, "git_version") else "Internal"
27+
28+
# dump the output file
29+
output = {
30+
"name": BM_NAME,
31+
"environ": {"pytorch_git_version": version},
32+
"args": vars(args),
33+
"metrics": gen_metrics_from_result(result),
34+
}
35+
dump_output(BM_NAME, output)
36+
37+
def local_run(args, model_args):
38+
# TODO: Currently this does nothing but support the path for "--scheduler local"
39+
print("Current local run is not implemented, use '--scheduler slurm'. Skipping local run.")
40+
return []
41+
42+
def slurm_run(args, model_args):
43+
import submitit
2044
# Note that the folder will depend on the job_id, to easily track experiments
2145
executor = submitit.AutoExecutor(folder=args.job_dir, cluster=args.cluster, slurm_max_num_timeout=3000)
2246

@@ -45,11 +69,4 @@ def run(args: List[str]):
4569

4670
# waits for completion and returns output
4771
result = job.results()
48-
# dump the output file
49-
output = {
50-
"name": BM_NAME,
51-
"environ": {"pytorch_git_version": torch.version.git_version},
52-
"args": vars(args),
53-
"metrics": gen_metrics_from_result(result),
54-
}
55-
dump_output(BM_NAME, output)
72+
return result

0 commit comments

Comments
 (0)