Closed
Description
I am trying to run the following script:
jobscript
#!/bin/bash
# Slurm batch script for the Julia demo: requests 2 nodes for one minute and
# sends stdout to log.out and stderr to log.err.
#SBATCH --job-name=julia-demo
#SBATCH --time=00:01:00
#SBATCH --nodes=2
#SBATCH --output=log.out
#SBATCH --error=log.err
# Run from the scratch working directory that holds script.jl.
cd $SCRATCH/temp
# Path to the Julia 1.7.0-rc2 binary installed under $SCRATCH.
julia=$SCRATCH/julia/julia-1.7.0-rc2/bin/julia
# Launch the driver script through srun, i.e. as a Slurm job step.
# NOTE(review): srun starts one copy of the command per task, so script.jl
# (and its addprocs(SlurmManager()) call) runs once on each node here.
# SlurmClusterManager presumably expects the driver to be started once with
# plain `$julia script.jl` and to launch the workers itself -- confirm against
# the package README.
srun $julia script.jl
script.jl
using Distributed, SlurmClusterManager
addprocs(SlurmManager())  # add worker processes via the Slurm cluster manager
# Redundant: Distributed is already loaded by the first `using` line.
using Distributed
# Show the ids of the worker processes that were added.
@show workers()
# Define id() on every process; it returns (process id, hostname).
@everywhere id() = (myid(), gethostname())
# Collect id() from this process first, then from each worker in turn.
ids = [id(), [@fetchfrom i id() for i in workers()]...]
# Print one (process id, hostname) entry per line.
println.(ids)
# Remove every worker process before the script exits.
rmprocs.(workers())
This results in the following error:
$ cat log.err
ERROR: LoadError: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] addprocs_locked(ERROR: manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed LoadError: /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:504
[3] addprocs_locked
@ /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:454 [inlined]
[4] addprocs(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:447
[5] addprocs(manager::SlurmManager)
@ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:441
[6] top-level scope
@ /scratch/jb6888/temp/script.jl:2
nested task error: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
@ SlurmClusterManager /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:75
[3] (::Distributed.var"#39#42"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
@ Distributed ./task.jl:411
nested task error: could not parse 9196#10.0.3.111
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:33
[2] macro expansion
@ /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:60 [inlined]
[3] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
@ SlurmClusterManager ./task.jl:411
in expression starting at /scratch/jb6888/temp/script.jl:2
addprocs_locked(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:504
[3] addprocs_locked
@ /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:454 [inlined]
[4] addprocs(manager::SlurmManager; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:447
[5] addprocs(manager::SlurmManager)
@ Distributed /scratch/jb6888/julia/julia-1.7.0-rc2/share/julia/stdlib/v1.7/Distributed/src/cluster.jl:441
[6] top-level scope
@ /scratch/jb6888/temp/script.jl:2
nested task error: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
@ SlurmClusterManager /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:75
[3] (::Distributed.var"#39#42"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
@ Distributed ./task.jl:411
nested task error: could not parse 9340#10.0.3.111
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:33
[2] macro expansion
@ /scratch/jb6888/.julia/packages/SlurmClusterManager/63gkG/src/slurmmanager.jl:60 [inlined]
[3] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
@ SlurmClusterManager ./task.jl:411
in expression starting at /scratch/jb6888/temp/script.jl:2
srun: error: compute-25-12: task 1: Exited with exit code 1
srun: Terminating job step 2742997.0
slurmstepd: *** STEP 2742997.0 ON compute-25-11 CANCELLED AT 2021-10-28T10:59:29 ***
srun: Job step aborted: Waiting up to 17 seconds for job step to finish.
srun: error: compute-25-11: task 0: Killed
It's possible that I'm misunderstanding the instructions for launching jobs; I would appreciate some help with this.
Metadata
Metadata
Assignees
Labels
No labels