# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

min_workers: 1
max_workers: 4

# The autoscaler will scale up the cluster faster with higher upscaling speed.
# E.g., if a task requires more nodes, the autoscaler will gradually
# scale up the cluster in chunks of upscaling_speed * currently_running_nodes.
# This number should be > 0.
upscaling_speed: 1.0
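# For example, with upscaling_speed 1.0 and 2 nodes currently running, the
# autoscaler may add up to 1.0 * 2 = 2 more nodes in one scaling round
# (a worked instance of the formula above, not a strict guarantee).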

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Leave this section empty to disable running inside Docker.
docker:
    image: "rayproject/ray-ml:latest-gpu"
    # image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies; it's faster to pull
    container_name: "ray_nvidia_docker" # e.g. ray_docker
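    # If you need extra `docker run` flags (e.g. a larger shared-memory size
    # for the object store), the docker section also accepts a run_options
    # list; the value below is illustrative, not a default:
    # run_options:
    #     - --shm-size=8g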


# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
    type: aws
    region: us-west-2
    # Availability zone(s), comma-separated, that nodes may be launched in.
    # Nodes are currently spread between zones by a round-robin approach;
    # this implementation detail should not be relied upon.
    availability_zone: us-west-2a,us-west-2b
    security_group:
        GroupName: dashboard_group
        IpPermissions:
            - FromPort: 20002
              ToPort: 20002
              IpProtocol: TCP
              IpRanges:
                  - CidrIp: 0.0.0.0/0
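                  # 0.0.0.0/0 exposes the dashboard port to the entire
                  # internet. Where possible, restrict it to your own
                  # network; the CIDR below is an illustrative placeholder:
                  # - CidrIp: 203.0.113.0/24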


# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
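# A minimal sketch of using your own key, assuming an EC2 key pair named
# "my-ray-key" (both the key pair name and the path are placeholders):
# ssh_private_key: ~/.ssh/my-ray-key.pem
# ...and in each node_config under available_node_types:
#     KeyName: my-ray-key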

# Tell the autoscaler the allowed node types and the resources they provide.
# The key is the name of the node type, which is just for debugging purposes.
# The node config specifies the launch config and physical instance type.
available_node_types:
    # GPU head node.
    ray.head.gpu:
        # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies; it's faster to pull
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: p2.xlarge
            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
            # You can provision additional disk space with a configuration like the following:
            BlockDeviceMappings:
                - DeviceName: /dev/sda1
                  Ebs:
                      VolumeSize: 100
            # Additional options in the boto docs.
    # CPU workers.
    ray.worker.default:
        # Override global docker setting.
        # This node type will run a CPU image,
        # rather than the GPU image specified in the global docker settings.
        docker:
            worker_image: "rayproject/ray-ml:latest-cpu"
        # The minimum number of nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 1
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 2
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
        # For example, to mark a node type as having 1 CPU, 1 GPU, and 5 units of a resource called "custom", set
        # resources: {"CPU": 1, "GPU": 1, "custom": 5}
        resources: {}
        # Provider-specific config for this node type, e.g. instance type. By default
        # Ray will auto-configure unspecified fields such as SubnetId and KeyName.
        # For more documentation on available fields, see:
        # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
        node_config:
            InstanceType: m5.large
            ImageId: ami-0a2363a9cff180a64 # Deep Learning AMI (Ubuntu) Version 30
            # Run workers on spot by default. Comment this out to use on-demand.
            InstanceMarketOptions:
                MarketType: spot
                # Additional options can be found in the boto docs, e.g.
                #   SpotOptions:
                #       MaxPrice: MAX_HOURLY_PRICE
    # Additional options in the boto docs.

# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
#    "/path1/on/remote/machine": "/path1/on/local/machine",
#    "/path2/on/remote/machine": "/path2/on/local/machine",
}

# List of shell commands to run to set up nodes.
# NOTE: rayproject/ray:latest already bundles the latest Ray release,
# so no extra setup commands are needed here.
setup_commands: []
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp36-cp36m-manylinux2014_x86_64.whl
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
    - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --dashboard-port 20002 --dashboard-host=0.0.0.0 --include-dashboard True --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
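# With --dashboard-host=0.0.0.0 and --dashboard-port 20002 above, the Ray
# dashboard is served on port 20002 of the head node, which is why the
# security group earlier in this file opens exactly that port.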

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
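
# A typical workflow with this config, assuming it is saved as
# gpu-docker.yaml (the filename is an assumption):
#   ray up gpu-docker.yaml      # launch or update the cluster
#   ray attach gpu-docker.yaml  # open an SSH session to the head node
#   ray down gpu-docker.yaml    # tear the cluster down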