Skip to content

Commit 36e1ed1

Browse files
H-Huang authored and facebook-github-bot committed
Get PT-D benchmark to support being run internally
Summary: Pytorch Distributed benchmarks have a hard dependency on `submitit` which is not supported internally (we don't use slurm). It outputs this error: `ModuleNotFoundError: No module named 'submitit'` These changes remove that hard dependency and add a scheduler option and allow it to be run internally and sets up the code for future work. The scheduler defaults to slurm so there is no change, but this will eventually be updated to default to local. Reviewed By: davidberard98 Differential Revision: D47682027 fbshipit-source-id: 9197e2ecb58d507236707b0be7a6bb2dea496269
1 parent 1dada69 commit 36e1ed1

File tree

2 files changed

+40
-12
lines changed

2 files changed

+40
-12
lines changed

torchbenchmark/util/distributed/submit.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,28 @@
11
import argparse
22
import importlib
33
import os
4-
import submitit
54
import sys
65
import torch
76
import uuid
87

98
from pathlib import Path
109
from typing import List
1110

11+
try:
12+
import submitit
13+
except ImportError:
14+
submitit = None
1215

1316
def parse_args(args: List[str]=None):
14-
parser = argparse.ArgumentParser(description='Submitit for PyTorch Distributed Benchmark', add_help=False)
17+
parser = argparse.ArgumentParser(description='PyTorch Distributed Benchmark', add_help=False)
18+
19+
parser.add_argument(
20+
"--scheduler",
21+
default="slurm",
22+
type=str,
23+
choices=["local", "slurm"],
24+
help="Where to launch the job on a specific infrastructure"
25+
)
1526

1627
parser.add_argument(
1728
"--ngpus",

userbenchmark/distributed/run.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
from typing import List
2-
import submitit
32
import torch
43
from torchbenchmark.util.distributed.submit import parse_args, get_init_file, TrainerWrapper
54
from ..utils import dump_output
65

76
BM_NAME = "distributed"
87

98
def gen_metrics_from_result(result):
10-
assert isinstance(result, List), "The result of submitit should be a list."
9+
assert isinstance(result, List), "The result should be a list."
1110
metrics = {}
1211
for result_id, r in enumerate(result):
1312
for metric_name in r:
@@ -17,6 +16,31 @@ def gen_metrics_from_result(result):
1716
def run(args: List[str]):
1817
args, model_args = parse_args(args)
1918

19+
if args.scheduler == "slurm":
20+
result = slurm_run(args, model_args)
21+
elif args.scheduler == "local":
22+
result = local_run(args, model_args)
23+
else:
24+
raise ValueError(f"Unsupported scheduler: {args.scheduler}")
25+
26+
version = torch.version.git_version if hasattr(torch.version, "git_version") else "Internal"
27+
28+
# dump the output file
29+
output = {
30+
"name": BM_NAME,
31+
"environ": {"pytorch_git_version": version},
32+
"args": vars(args),
33+
"metrics": gen_metrics_from_result(result),
34+
}
35+
dump_output(BM_NAME, output)
36+
37+
def local_run(args, model_args):
38+
# TODO: Currently this does nothing but support the path for "--scheduler local"
39+
print("Current local run is not implemented, use '--scheduler slurm'. Skipping local run.")
40+
return []
41+
42+
def slurm_run(args, model_args):
43+
import submitit
2044
# Note that the folder will depend on the job_id, to easily track experiments
2145
executor = submitit.AutoExecutor(folder=args.job_dir, cluster=args.cluster, slurm_max_num_timeout=3000)
2246

@@ -45,11 +69,4 @@ def run(args: List[str]):
4569

4670
# waits for completion and returns output
4771
result = job.results()
48-
# dump the output file
49-
output = {
50-
"name": BM_NAME,
51-
"environ": {"pytorch_git_version": torch.version.git_version},
52-
"args": vars(args),
53-
"metrics": gen_metrics_from_result(result),
54-
}
55-
dump_output(BM_NAME, output)
72+
return result

0 commit comments

Comments
 (0)