Skip to content

Failed to addprocs #15

Closed
Closed
@Lightup1

Description

@Lightup1

versioninfo():

Julia Version 1.10.0
Commit 3120989f39b (2023-12-25 18:01 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 48 × Intel(R) Xeon(R) Gold 6248R CPU @ 3.00GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, cascadelake)
  Threads: 1 on 48 virtual cores
Environment:
  JULIA_PKG_SERVER = https://mirrors.pku.edu.cn/julia

Minimal reproducible example:

#!/bin/bash
#SBATCH -N 2
#SBATCH --ntasks-per-node=48
#SBATCH -J test
#SBATCH --cpus-per-task=1     
#SBATCH -t 0-10:00          
#SBATCH -p cpu
#SBATCH --output=slurm/slurm-%x-%j.out
#SBATCH --error=slurm/slurm-%x-%j.out
#=
export SLURM_ORIGINAL_COMMAND=$(scontrol show job $SLURM_JOBID | grep "^   Command=" | head -n 1 | cut -d "=" -f 2-)
exec julia -t$SLURM_CPUS_PER_TASK --project=. --color=yes --startup-file=no "${BASH_SOURCE[0]}" "$@"
=#
using Distributed
using SlurmClusterManager
addprocs(SlurmManager(),exeflags=["--project=.", "-t$(ENV["SLURM_CPUS_PER_TASK"])", "--color=yes","--startup-file=no"])

Error message:

ERROR: LoadError: TaskFailedException

    nested task error: TaskFailedException
    Stacktrace:
     [1] wait
       @ ./task.jl:352 [inlined]
     [2] addprocs_locked(manager::SlurmManager; kwargs::@Kwargs{exeflags::Vector{String}})
       @ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:507
     [3] addprocs_locked
       @ ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:456 [inlined]
     [4] addprocs(manager::SlurmManager; kwargs::@Kwargs{exeflags::Vector{String}})
       @ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:450
     [5] addprocs
       @ ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:443 [inlined]
     [6] (::var"#1#2")()
       @ Main /var/spool/slurm/d/job87901/slurm_script:16
    
        nested task error: TaskFailedException
        Stacktrace:
         [1] wait
           @ ./task.jl:352 [inlined]
         [2] launch(manager::SlurmManager, params::Dict{Symbol, Any}, instances_arr::Vector{WorkerConfig}, c::Condition)
           @ SlurmClusterManager ~/.julia/packages/SlurmClusterManager/R0zin/src/slurmmanager.jl:79
         [3] (::Distributed.var"#43#46"{SlurmManager, Condition, Vector{WorkerConfig}, Dict{Symbol, Any}})()
           @ Distributed ~/local/julia-1.10.0/share/julia/stdlib/v1.10/Distributed/src/cluster.jl:488
        
            nested task error: launch_timeout exceeded
            Stacktrace:
              [1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
                @ Base ./task.jl:931
              [2] wait()
                @ Base ./task.jl:995
              [3] wait(c::Base.GenericCondition{Base.Threads.SpinLock}; first::Bool)
                @ Base ./condition.jl:130
              [4] wait
                @ Base ./condition.jl:125 [inlined]
              [5] readuntil(x::Base.PipeEndpoint, c::UInt8; keep::Bool)
                @ Base ./stream.jl:1014
              [6] readuntil(io::Base.Process, arg::UInt8; kw::@Kwargs{keep::Bool})
                @ Base ./io.jl:443
              [7] readuntil
                @ ./io.jl:443 [inlined]
              [8] readline(s::Base.Process; keep::Bool)
                @ Base ./io.jl:561
              [9] readline(s::Base.Process)
                @ Base ./io.jl:560
             [10] (::SlurmClusterManager.var"#3#8"{SlurmManager, Vector{WorkerConfig}, Condition})()
                @ SlurmClusterManager ~/.julia/packages/SlurmClusterManager/R0zin/src/slurmmanager.jl:62
Stacktrace:
 [1] sync_end(c::Channel{Any})
   @ Base ./task.jl:448
 [2] macro expansion
   @ task.jl:480 [inlined]
 [3] top-level scope
   @ /var/spool/slurm/d/job87901/slurm_script:478
in expression starting at /var/spool/slurm/d/job87901/slurm_script:16
Error launching Slurm job:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions