
Commit 05e1912

d4l3k authored and facebook-github-bot committed
torchx/schedulers: stub out ray scheduler internally
Summary: This fixes autodeps and pyre when dealing with the optional ray_scheduler.

Reviewed By: msaroufim

Differential Revision: D33832719

fbshipit-source-id: d731ca516a6d8e1a1f1e23fe0a2d3aa7ddecd21a
1 parent 9d8c74d commit 05e1912


15 files changed, +918 -258 lines

dev-requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -10,8 +10,8 @@ kfp==1.8.9
 moto==2.2.12
 pyre-extensions==0.0.21
 pytorch-lightning==1.5.6
-ray==1.9.0
 s3fs==2021.10.1
+ray[default]==1.9.2
 torch-model-archiver==0.4.2
 torch==1.10.0
 torchserve==0.4.2

docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ Works With
    schedulers/local
    schedulers/kubernetes
    schedulers/slurm
+   schedulers/ray

 .. _Pipelines:
 .. toctree::

docs/source/schedulers/ray.rst

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+Ray
+=================
+
+.. automodule:: torchx.schedulers.ray_scheduler
+.. currentmodule:: torchx.schedulers.ray_scheduler
+
+.. autoclass:: RayScheduler
+   :members:

scripts/component_integration_tests.py

Lines changed: 7 additions & 0 deletions
@@ -37,6 +37,13 @@ def get_k8s_sched_info(image: str) -> SchedulerInfo:
     return SchedulerInfo(name="kubernetes", image=image, cfg=cfg)


+def get_ray_sched_info(image: str) -> SchedulerInfo:
+    cfg = {
+        "namespace": "torchx-dev",
+    }
+    return SchedulerInfo(name="ray", image=image, cfg=cfg)
+
+
 def get_local_cwd_sched_info(image: str) -> SchedulerInfo:
     return SchedulerInfo(name="local_cwd", image=image)
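Note: get_ray_sched_info mirrors get_k8s_sched_info, differing only in the scheduler name; the namespace stays "torchx-dev". A minimal sketch of how the helper could be exercised, assuming it runs inside this script where SchedulerInfo is already in scope (the image tag is a placeholder, not one used by the test suite):

# Hypothetical usage inside scripts/component_integration_tests.py.
info = get_ray_sched_info("my-registry/torchx:latest")  # placeholder image
assert info.name == "ray"
assert info.cfg == {"namespace": "torchx-dev"}
print(f"running component integration tests on {info.name} with image {info.image}")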

torchx/cli/cmd_log.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@


 def validate(job_identifier: str) -> None:
-    if not re.match(r"^\w+://[^/.]*/[^/.]+(/[^/.]+(/(\d+,?)+)?)?$", job_identifier):
+    if not re.match(r"^\w+://[^/]*/[^/]+(/[^/]+(/(\d+,?)+)?)?$", job_identifier):
         logger.error(
             f"{job_identifier} is not of the form {ID_FORMAT}",
         )
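The only change is dropping "." from the negated character classes: the session, app and role segments of a job identifier may now contain dots (Ray-generated app handles can), while "/" remains the only segment separator. A quick self-contained check of the relaxed pattern; the job identifier below is made up for illustration:

import re

OLD = r"^\w+://[^/.]*/[^/.]+(/[^/.]+(/(\d+,?)+)?)?$"
NEW = r"^\w+://[^/]*/[^/]+(/[^/]+(/(\d+,?)+)?)?$"

job_id = "ray://torchx_session/app.name-1234/trainer/0"  # app id contains a dot
assert re.match(OLD, job_id) is None       # previously rejected
assert re.match(NEW, job_id) is not None   # now accepted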

torchx/components/test/utils_test.py

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,9 @@ def test_sh(self) -> None:
     def test_python(self) -> None:
         self.validate(utils, "python")

+    def test_binary(self) -> None:
+        self.validate(utils, "binary")
+
     def test_touch(self) -> None:
         self.validate(utils, "touch")

torchx/components/utils.py

Lines changed: 30 additions & 0 deletions
@@ -147,6 +147,36 @@ def python(
     )


+def binary(
+    *args: str,
+    entrypoint: str,
+    name: str = "torchx_utils_python",
+    num_replicas: int = 1,
+) -> specs.AppDef:
+    """
+    Test component
+
+    Args:
+        args: arguments passed to the program in sys.argv[1:] (ignored with `--c`)
+        name: name of the job
+        num_replicas: number of copies to run (each on its own container)
+    :return:
+    """
+    return specs.AppDef(
+        name=name,
+        roles=[
+            specs.Role(
+                name="binary",
+                image="<NONE>",
+                entrypoint=entrypoint,
+                num_replicas=num_replicas,
+                resource=specs.Resource(cpu=2, gpu=0, memMB=4096),
+                args=[*args],
+            )
+        ],
+    )
+
+
 def copy(src: str, dst: str, image: str = torchx.IMAGE) -> specs.AppDef:
     """
     copy copies the file from src to dst. src and dst can be any valid fsspec
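The new utils.binary component builds an AppDef that runs a pre-installed binary directly through entrypoint, with image set to "<NONE>" and a fixed 2 CPU / 4096 MB resource (note the name default is still inherited from the python component). A minimal usage sketch; /bin/echo stands in for any binary already present on the host:

from torchx.components.utils import binary

app = binary("hello", "world", entrypoint="/bin/echo", num_replicas=2)
assert app.roles[0].entrypoint == "/bin/echo"
assert app.roles[0].args == ["hello", "world"]
assert app.roles[0].num_replicas == 2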
Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+# An unique identifier for the head node and workers of this cluster.
+cluster_name: gpu-docker
+
+min_workers: 1
+max_workers: 4
+
+# The autoscaler will scale up the cluster faster with higher upscaling speed.
+# E.g., if the task requires adding more nodes then autoscaler will gradually
+# scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
+# This number should be > 0.
+upscaling_speed: 1.0
+
+# This executes all commands on all nodes in the docker container,
+# and opens all the necessary ports to support the Ray cluster.
+# Empty string means disabled.
+docker:
+    image: "rayproject/ray-ml:latest-gpu"
+    # image: rayproject/ray:latest-gpu  # use this one if you don't need ML dependencies, it's faster to pull
+    container_name: "ray_nvidia_docker"  # e.g. ray_docker
+
+
+# If a node is idle for this many minutes, it will be removed.
+idle_timeout_minutes: 5
+
+# Cloud-provider specific configuration.
+provider:
+    type: aws
+    region: us-west-2
+    # Availability zone(s), comma-separated, that nodes may be launched in.
+    # Nodes are currently spread between zones by a round-robin approach,
+    # however this implementation detail should not be relied upon.
+    availability_zone: us-west-2a,us-west-2b
+    security_group:
+        GroupName: dashboard_group
+        IpPermissions:
+            - FromPort: 20002
+              ToPort: 20002
+              IpProtocol: TCP
+              IpRanges:
+                  - CidrIp: 0.0.0.0/0
+
+
+# How Ray will authenticate with newly launched nodes.
+auth:
+    ssh_user: ubuntu
+    # By default Ray creates a new private keypair, but you can also use your own.
+    # If you do so, make sure to also set "KeyName" in the head and worker node
+    # configurations below.
+    # ssh_private_key: /path/to/your/key.pem
+
+# Tell the autoscaler the allowed node types and the resources they provide.
+# The key is the name of the node type, which is just for debugging purposes.
+# The node config specifies the launch config and physical instance type.
+available_node_types:
+    # GPU head node.
+    ray.head.gpu:
+        # worker_image: rayproject/ray:latest-gpu  # use this one if you don't need ML dependencies, it's faster to pull
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: p2.xlarge
+            ImageId: ami-0a2363a9cff180a64  # Deep Learning AMI (Ubuntu) Version 30
+            # You can provision additional disk space with a conf as follows
+            BlockDeviceMappings:
+                - DeviceName: /dev/sda1
+                  Ebs:
+                      VolumeSize: 100
+            # Additional options in the boto docs.
+    # CPU workers.
+    ray.worker.default:
+        # Override global docker setting.
+        # This node type will run a CPU image,
+        # rather than the GPU image specified in the global docker settings.
+        docker:
+            worker_image: "rayproject/ray-ml:latest-cpu"
+        # The minimum number of nodes of this type to launch.
+        # This number should be >= 0.
+        min_workers: 1
+        # The maximum number of workers nodes of this type to launch.
+        # This takes precedence over min_workers.
+        max_workers: 2
+        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
+        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
+        # You can also set custom resources.
+        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
+        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
+        resources: {}
+        # Provider-specific config for this node type, e.g. instance type. By default
+        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
+        # For more documentation on available fields, see:
+        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
+        node_config:
+            InstanceType: m5.large
+            ImageId: ami-0a2363a9cff180a64  # Deep Learning AMI (Ubuntu) Version 30
+            # Run workers on spot by default. Comment this out to use on-demand.
+            InstanceMarketOptions:
+                MarketType: spot
+                # Additional options can be found in the boto docs, e.g.
+                #   SpotOptions:
+                #       MaxPrice: MAX_HOURLY_PRICE
+            # Additional options in the boto docs.
+
+# Specify the node type of the head node (as configured above).
+head_node_type: ray.head.gpu
+
+# Files or directories to copy to the head and worker nodes. The format is a
+# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
+file_mounts: {
+#    "/path1/on/remote/machine": "/path1/on/local/machine",
+#    "/path2/on/remote/machine": "/path2/on/local/machine",
+}
+
+# List of shell commands to run to set up nodes.
+# NOTE: rayproject/ray:latest has ray latest bundled
+setup_commands: []
+# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
+# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
+
+# Custom commands that will be run on the head node after common setup.
+head_setup_commands:
+    - pip install boto3==1.4.8  # 1.4.8 adds InstanceMarketOptions
+
+# Custom commands that will be run on worker nodes after common setup.
+worker_setup_commands: []
+
+# Command to start ray on the head node. You don't need to change this.
+head_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --dashboard-port 20002 --dashboard-host=0.0.0.0 --include-dashboard True --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+
+# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands:
+    - ray stop
+    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
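This new Ray autoscaler config (its path in the repo is not captured above) provisions an AWS cluster with a GPU head node, CPU spot workers, and the Ray dashboard exposed on port 20002 through the dashboard_group security group. As a hedged sketch: once a cluster has been brought up from this file with Ray's cluster launcher (ray up) and ray[default] is installed, it can be sanity-checked from the head node like so:

import ray

# Connect to the cluster started by head_start_ray_commands above;
# "auto" resolves the local GCS address on the head node.
ray.init(address="auto")

# The config above allows 1-4 workers plus the head node.
print(ray.cluster_resources())  # aggregate CPU/GPU/memory across nodes
print(len(ray.nodes()))         # number of nodes currently in the cluster
ray.shutdown()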

torchx/schedulers/__init__.py

Lines changed: 19 additions & 1 deletion
@@ -5,7 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Dict
+from typing import Dict, Optional

 import torchx.schedulers.docker_scheduler as docker_scheduler
 import torchx.schedulers.kubernetes_scheduler as kubernetes_scheduler

@@ -22,20 +22,38 @@ def __call__(self, session_name: str, **kwargs: object) -> Scheduler:
         ...


+def try_get_ray_scheduler() -> Optional[SchedulerFactory]:
+    try:
+        from torchx.schedulers.ray_scheduler import _has_ray  # @manual
+
+        if _has_ray:
+            import torchx.schedulers.ray_scheduler as ray_scheduler  # @manual
+
+            return ray_scheduler.create_scheduler
+
+    except ImportError:  # pragma: no cover
+        return None
+
+
 def get_scheduler_factories() -> Dict[str, SchedulerFactory]:
     """
     get_scheduler_factories returns all the available schedulers names and the
     method to instantiate them.

     The first scheduler in the dictionary is used as the default scheduler.
     """
+
     default_schedulers: Dict[str, SchedulerFactory] = {
         "local_docker": docker_scheduler.create_scheduler,
         "local_cwd": local_scheduler.create_cwd_scheduler,
         "slurm": slurm_scheduler.create_scheduler,
         "kubernetes": kubernetes_scheduler.create_scheduler,
     }

+    ray_scheduler_creator = try_get_ray_scheduler()
+    if ray_scheduler_creator:
+        default_schedulers["ray"] = ray_scheduler_creator
+
     return load_group(
         "torchx.schedulers",
         default=default_schedulers,
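try_get_ray_scheduler is what stubs the scheduler out internally: the "ray" entry is registered only when torchx.schedulers.ray_scheduler imports cleanly and its _has_ray flag is true; otherwise the factory dict is left as before, so autodeps and pyre never have to resolve the optional module. A small sketch of the resulting behavior (the session name is illustrative):

from torchx.schedulers import get_scheduler_factories, try_get_ray_scheduler

factories = get_scheduler_factories()
# "local_docker" stays first in the dict, so it remains the default scheduler.
print(list(factories))  # ['local_docker', 'local_cwd', 'slurm', 'kubernetes', ...] plus 'ray' when installed

if try_get_ray_scheduler() is None:
    print("ray extras not installed; the 'ray' entry is simply absent")
else:
    scheduler = factories["ray"]("example_session")  # factories are called with a session name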

torchx/schedulers/ray/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.

torchx/schedulers/ray/ray_common.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass, field
+from typing import Dict
+
+
+@dataclass
+class RayActor:
+    """Describes an actor (a.k.a. role in TorchX terms).
+
+    Attributes:
+        name:
+            The name of the actor.
+        command:
+            The command that the actor should run as a subprocess.
+        env:
+            The environment variables to set before executing the command.
+        num_replicas:
+            The number of replicas (i.e. Ray actors) to run.
+        num_cpus:
+            The number of CPUs to allocate.
+        num_gpus:
+            The number of GPUs to allocate.
+    """
+
+    name: str
+    command: str
+    env: Dict[str, str] = field(default_factory=dict)
+    num_replicas: int = 1
+    num_cpus: int = 1
+    num_gpus: int = 0
+    # TODO: memory_size, max_retries, retry_policy
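RayActor is the plain dataclass shared between the TorchX side and the Ray side of the scheduler; one instance corresponds roughly to one TorchX role. A minimal construction sketch with illustrative values (a real instance would be derived from a specs.Role):

from torchx.schedulers.ray.ray_common import RayActor

trainer = RayActor(
    name="trainer",
    command="python -m my_app.train --epochs 10",  # illustrative command
    env={"LOGLEVEL": "INFO"},
    num_replicas=2,  # two Ray actors
    num_cpus=4,
    num_gpus=1,
)
print(trainer)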
