Subjobs were all scheduled on one node, no matter how many nodes were allocated

Hello everyone. I’m new to PBS and still learning it. I have found that no matter how many nodes are allocated, the subjobs in a job array are always scheduled to run on a single node. I believe this is incorrect, because the PBS documentation says: "By default PBS simultaneously runs as many subjobs from a job array as possible." However, I can’t find anything wrong in my environment. Please help!

Here’s my PBS job script:

#!/bin/bash

#PBS -N hostname
#PBS -j oe

hostname

Here’s the command line of qsub

qsub -l select=3 -l place=scatter:excl -J 1-3  ./hostname.sh

The detailed job info is:

$ qstat -xf 43[]
Job Id: 43[].ip-AC1C0005
    Job_Name = hostname
    Job_Owner = azureuser@ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cl
        oudapp.net
    job_state = F
    queue = workq
    server = ip-AC1C0005
    Checkpoint = u
    ctime = Tue Nov 16 13:20:18 2021
    Error_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.ne
        t:/shared/home/azureuser/demo/hostname.e43.^array_index^
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Tue Nov 16 13:20:19 2021
    Output_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et:/shared/home/azureuser/demo/hostname.o43.^array_index^
    Priority = 0
    qtime = Tue Nov 16 13:20:18 2021
    Rerunable = True
    Resource_List.ncpus = 3
    Resource_List.nodect = 3
    Resource_List.place = scatter:excl
    Resource_List.select = 3
    Resource_List.ungrouped = false
    stime = Tue Nov 16 13:20:18 2021
    substate = 92
    Variable_List = PBS_O_HOME=/shared/home/azureuser,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=azureuser,
        PBS_O_PATH=/shared/home/azureuser/.local/bin:/shared/home/azureuser/bi
        n:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/s
        bin:/root/bin:/opt/cycle/jetpack/bin:/opt/pbs/bin,
        PBS_O_MAIL=/var/spool/mail/azureuser,PBS_O_SHELL=/bin/bash,
        PBS_O_WORKDIR=/shared/home/azureuser/demo,PBS_O_SYSTEM=Linux,
        PBS_O_QUEUE=workq,
        PBS_O_HOST=ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp
        .net
    comment = Job Array Began at Tue Nov 16 at 13:20 and finished
    etime = Tue Nov 16 13:20:18 2021
    Stageout_status = 1
    Exit_status = 0
    Submit_arguments = -l select=3 -l place=scatter:excl -J 1-3 ./hostname.sh
    array = True
    array_state_count = Queued:0 Running:0 Exiting:0 Expired:0
    array_indices_submitted = 1-3
    array_indices_remaining = -
    history_timestamp = 1637068819
    project = _pbs_project_default
    Submit_Host = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et

And here’s the job output:

$ cat hostname.o43.*
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008

You can see they’re all on host ip-AC1C0008, while I have 3 hosts.

$ pbsnodes -av
ip-AC1C0008
     Mom = ip-ac1c0008.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
     ntype = PBS
     state = free
     pcpus = 2
     resources_available.arch = linux
     resources_available.ccnodeid = 10eecad1-c2f2-4bb8-a502-81e9f05568eb
     resources_available.disk = 20gb
     resources_available.group_id = Standard_F2s_v2_pg0
     resources_available.host = ip-ac1c0008
     resources_available.mem = 4gb
     resources_available.ncpus = 1
     resources_available.ngpus = 0
     resources_available.nodearray = execute
     resources_available.slot_type = execute
     resources_available.ungrouped = false
     resources_available.vm_size = Standard_F2s_v2
     resources_available.vnode = ip-AC1C0008
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Tue Nov 16 13:20:19 2021
     last_used_time = Tue Nov 16 13:20:19 2021

ip-AC1C000B
     Mom = ip-ac1c000b.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
     ntype = PBS
     state = free
     pcpus = 2
     resources_available.arch = linux
     resources_available.ccnodeid = be53f01e-c682-4681-ac8d-402783e109f1
     resources_available.disk = 20gb
     resources_available.group_id = Standard_F2s_v2_pg0
     resources_available.host = ip-ac1c000b
     resources_available.mem = 4gb
     resources_available.ncpus = 1
     resources_available.ngpus = 0
     resources_available.nodearray = execute
     resources_available.slot_type = execute
     resources_available.ungrouped = false
     resources_available.vm_size = Standard_F2s_v2
     resources_available.vnode = ip-AC1C000B
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Tue Nov 16 13:20:19 2021
     last_used_time = Tue Nov 16 13:20:19 2021

ip-AC1C000D
     Mom = ip-ac1c000d.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
     ntype = PBS
     state = free
     pcpus = 2
     resources_available.arch = linux
     resources_available.ccnodeid = ababa327-c405-4d29-b5f1-11a75c38946e
     resources_available.disk = 20gb
     resources_available.group_id = Standard_F2s_v2_pg0
     resources_available.host = ip-ac1c000d
     resources_available.mem = 4gb
     resources_available.ncpus = 1
     resources_available.ngpus = 0
     resources_available.nodearray = execute
     resources_available.slot_type = execute
     resources_available.ungrouped = false
     resources_available.vm_size = Standard_F2s_v2
     resources_available.vnode = ip-AC1C000D
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Tue Nov 16 13:20:19 2021
     last_used_time = Tue Nov 16 13:20:19 2021

And here’s my server config:

$ qmgr -c 'p s'
#
# Create resources and set their properties.
#
#
# Create and define resource slot_type
#
create resource slot_type
set resource slot_type type = string
set resource slot_type flag = h
#
# Create and define resource instance_id
#
create resource instance_id
set resource instance_id type = string
set resource instance_id flag = h
#
# Create and define resource vm_size
#
create resource vm_size
set resource vm_size type = string
set resource vm_size flag = h
#
# Create and define resource nodearray
#
create resource nodearray
set resource nodearray type = string
set resource nodearray flag = h
#
# Create and define resource disk
#
create resource disk
set resource disk type = size
set resource disk flag = hn
#
# Create and define resource ngpus
#
create resource ngpus
set resource ngpus type = long
set resource ngpus flag = hn
#
# Create and define resource group_id
#
create resource group_id
set resource group_id type = string
set resource group_id flag = h
#
# Create and define resource ungrouped
#
create resource ungrouped
set resource ungrouped type = string
set resource ungrouped flag = h
#
# Create and define resource ccnodeid
#
create resource ccnodeid
set resource ccnodeid type = string
set resource ccnodeid flag = h
#
# Create queues and set their attributes.
#
#
# Create and define queue workq
#
create queue workq
set queue workq queue_type = Execution
set queue workq resources_default.place = scatter:excl
set queue workq resources_default.ungrouped = false
set queue workq enabled = True
set queue workq started = True
#
# Create and define queue htcq
#
create queue htcq
set queue htcq queue_type = Execution
set queue htcq resources_default.place = free
set queue htcq resources_default.ungrouped = true
set queue htcq enabled = True
set queue htcq started = True
#
# Set server attributes.
#
set server scheduling = True
set server managers = root@*
set server default_queue = workq
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 15
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server node_group_enable = True
set server node_group_key = group_id
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999

What could be wrong? Please help!

Your request, -l select=3 -l place=scatter:excl, says you want each sub-job to use 3 chunks, each on a different host. You have only 3 execution hosts, each with only one CPU, so only one sub-job can run at a time, using all the nodes.

Specify select=1 and see if that gets what you want.

For testing purposes, you might add a sleep 10 to your script so you can see all the sub-jobs running at the same time.

Hi @dtalcott, thank you for your answer! It pointed me in the right direction. However, I still failed with

qsub -l select=1 -l place=scatter:excl -J 1-3 ./hostname.sh

but I succeeded with

qsub -J 1-3 ./hostname.sh

Comparing the details of the two jobs from qstat -xf, I found the former (failed)

Job Id: 47[].ip-AC1C0005
    Job_Name = hostname
    Job_Owner = azureuser@ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cl
        oudapp.net
    job_state = F
    queue = workq
    server = ip-AC1C0005
    Checkpoint = u
    ctime = Wed Nov 17 03:01:47 2021
    Error_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.ne
        t:/shared/home/azureuser/demo/hostname.e47.^array_index^
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Wed Nov 17 03:02:18 2021
    Output_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et:/shared/home/azureuser/demo/hostname.o47.^array_index^
    Priority = 0
    qtime = Wed Nov 17 03:01:47 2021
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.place = scatter:excl
    Resource_List.select = 1
    Resource_List.ungrouped = false
    stime = Wed Nov 17 03:01:47 2021
    substate = 92
    Variable_List = PBS_O_HOME=/shared/home/azureuser,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=azureuser,
        PBS_O_PATH=/shared/home/azureuser/.local/bin:/shared/home/azureuser/bi
        n:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/s
        bin:/root/bin:/opt/cycle/jetpack/bin:/opt/pbs/bin,
        PBS_O_MAIL=/var/spool/mail/azureuser,PBS_O_SHELL=/bin/bash,
        PBS_O_WORKDIR=/shared/home/azureuser/demo,PBS_O_SYSTEM=Linux,
        PBS_O_QUEUE=workq,
        PBS_O_HOST=ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp
        .net
    comment = Job Array Began at Wed Nov 17 at 03:01 and finished
    etime = Wed Nov 17 03:01:47 2021
    Stageout_status = 1
    Exit_status = 0
    Submit_arguments = -l select=1 -l place=scatter:excl -J 1-3 ./hostname.sh
    array = True
    array_state_count = Queued:0 Running:0 Exiting:0 Expired:0
    array_indices_submitted = 1-3
    array_indices_remaining = -
    history_timestamp = 1637118138
    project = _pbs_project_default
    Submit_Host = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et

and the latter (successful)

Job Id: 49[].ip-AC1C0005
    Job_Name = hostname
    Job_Owner = azureuser@ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cl
        oudapp.net
    job_state = F
    queue = workq
    server = ip-AC1C0005
    Checkpoint = u
    ctime = Wed Nov 17 03:06:20 2021
    Error_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.ne
        t:/shared/home/azureuser/demo/hostname.e49.^array_index^
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Wed Nov 17 03:06:31 2021
    Output_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et:/shared/home/azureuser/demo/hostname.o49.^array_index^
    Priority = 0
    qtime = Wed Nov 17 03:06:20 2021
    Rerunable = True
    Resource_List.ncpus = 1
    Resource_List.nodect = 1
    Resource_List.place = pack
    Resource_List.select = 1:ncpus=1:ungrouped=false
    Resource_List.ungrouped = false
    stime = Wed Nov 17 03:06:21 2021
    substate = 92
    Variable_List = PBS_O_HOME=/shared/home/azureuser,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=azureuser,
        PBS_O_PATH=/shared/home/azureuser/.local/bin:/shared/home/azureuser/bi
        n:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/s
        bin:/root/bin:/opt/cycle/jetpack/bin:/opt/pbs/bin,
        PBS_O_MAIL=/var/spool/mail/azureuser,PBS_O_SHELL=/bin/bash,
        PBS_O_WORKDIR=/shared/home/azureuser/demo,PBS_O_SYSTEM=Linux,
        PBS_O_QUEUE=workq,
        PBS_O_HOST=ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp
        .net
    comment = Job Array Began at Wed Nov 17 at 03:06 and finished
    etime = Wed Nov 17 03:06:20 2021
    Stageout_status = 1
    Exit_status = 0
    Submit_arguments = -J 1-3 ./hostname.sh
    array = True
    array_state_count = Queued:0 Running:0 Exiting:0 Expired:0
    array_indices_submitted = 1-3
    array_indices_remaining = -
    pset = group_id=Standard_F2s_v2_pg0
    history_timestamp = 1637118391
    project = _pbs_project_default
    Submit_Host = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
        et

I think the key difference between them is

    Resource_List.place = scatter:excl

VS

    Resource_List.place = pack

So it seems to me that -l place=scatter prevents subjobs from running simultaneously, even though I have -l select=1. Then I tried

qsub -l select=1 -l place=pack -J 1-3 ./hostname.sh

and it also succeeded.