Closed
Description
versioninfo():
Julia Version 1.10.0
Commit 3120989f39b (2023-12-25 18:01 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 48 × Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz
WORD_SIZE: 64
LIBM: libopenlibm
LLVM: libLLVM-15.0.7 (ORCJIT, cascadelake)
Threads: 1 on 48 virtual cores
Environment:
JULIA_PKG_SERVER = https://mirrors.pku.edu.cn/julia
Minimal reproducible example:
#!/bin/bash
#SBATCH -N 2
#SBATCH --ntasks-per-node=48
#SBATCH -J test
#SBATCH --cpus-per-task=1
#SBATCH -t 0-10:00
#SBATCH -p cpu
#SBATCH --output=slurm/slurm-%x-%j.out
#SBATCH --error=slurm/slurm-%x-%j.out
#=
export SLURM_ORIGINAL_COMMAND=$(scontrol show job $SLURM_JOBID | grep "^ Command=" | head -n 1 | cut -d "=" -f 2-)
exec julia -t$SLURM_CPUS_PER_TASK --project=. --color=yes --startup-file=no "${BASH_SOURCE[0]}" "$@"
=#
using Distributed
using SlurmClusterManager
addprocs(SlurmManager(),exeflags=["--project=.", "-t$(ENV["SLURM_CPUS_PER_TASK"])", "--color=yes","--startup-file=no"])
Error message:
ERROR: LoadError: TaskFailedException
nested task error: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:352 [inlined]
[2] addprocs_locked(manager::SlurmManager; kwargs::@Kwargs{exeflags::Vector{String}})
@ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:507
[3] addprocs_locked
@ ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:456 [inlined]
[4] addprocs(manager::SlurmManager; kwargs::@Kwargs{exeflags::Vector{String}})
@ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:450
[5] addprocs
@ ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:443 [inlined]
[6] (::var"#1#2")()
@ Main /var/spool/slurm/d/job87901/slurm_script:16
nested task error: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:352 [inlined]
[2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
@ SlurmClusterManager ~/.julia/packages/SlurmClusterManager/R0zin/src/slurmmanager.jl:79
[3] (::Distributed.var"#43#46"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
@ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:488
nested task error: launch_timeout exceeded
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:931
[2] wait()
@ Base ./task.jl:995
[3] wait(c::Base.GenericCondition{Base.Threads.SpinLock}; first::Bool)
@ Base ./condition.jl:130
[4] wait
@ Base ./condition.jl:125 [inlined]
[5] readuntil(x::Base.PipeEndpoint, c::UInt8; keep::Bool)
@ Base ./stream.jl:1014
[6] readuntil(io::Base.Process, arg::UInt8; kw::@Kwargs{keep::Bool})
@ Base ./io.jl:443
[7] readuntil
@ ./io.jl:443 [inlined]
[8] readline(s::Base.Process; keep::Bool)
@ Base ./io.jl:561
[9] readline(s::Base.Process)
@ Base ./io.jl:560
[10] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
@ SlurmClusterManager ~/.julia/packages/SlurmClusterManager/R0zin/src/slurmmanager.jl:62
Stacktrace:
[1] sync_end(c::Channel{Any})
@ Base ./task.jl:448
[2] macro expansion
@ task.jl:480 [inlined]
[3] top-level scope
@ /var/spool/slurm/d/job87901/slurm_script:478
in expression starting at /var/spool/slurm/d/job87901/slurm_script:16
Error launching Slurm job:
Metadata
Metadata
Assignees
Labels
No labels