Hello everyone. I’m new to PBS and still learning it. I’ve found that no matter how many nodes are allocated, the subjobs in a job array are always scheduled to run on a single node. I believe this is incorrect, because the PBS documentation says: "By default PBS simultaneously runs as many subjobs from a job array as possible." However, I can’t find anything wrong in my environment. Please help!
Here’s my PBS job script:
#!/bin/bash
#PBS -N hostname
#PBS -j oe
hostname
Here’s the qsub command line I used:
qsub -l select=3 -l place=scatter:excl -J 1-3 ./hostname.sh
Here are the job details reported by qstat:
$ qstat -xf 43[]
Job Id: 43[].ip-AC1C0005
Job_Name = hostname
Job_Owner = azureuser@ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cl
oudapp.net
job_state = F
queue = workq
server = ip-AC1C0005
Checkpoint = u
ctime = Tue Nov 16 13:20:18 2021
Error_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.ne
t:/shared/home/azureuser/demo/hostname.e43.^array_index^
Hold_Types = n
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Tue Nov 16 13:20:19 2021
Output_Path = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
et:/shared/home/azureuser/demo/hostname.o43.^array_index^
Priority = 0
qtime = Tue Nov 16 13:20:18 2021
Rerunable = True
Resource_List.ncpus = 3
Resource_List.nodect = 3
Resource_List.place = scatter:excl
Resource_List.select = 3
Resource_List.ungrouped = false
stime = Tue Nov 16 13:20:18 2021
substate = 92
Variable_List = PBS_O_HOME=/shared/home/azureuser,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=azureuser,
PBS_O_PATH=/shared/home/azureuser/.local/bin:/shared/home/azureuser/bi
n:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/s
bin:/root/bin:/opt/cycle/jetpack/bin:/opt/pbs/bin,
PBS_O_MAIL=/var/spool/mail/azureuser,PBS_O_SHELL=/bin/bash,
PBS_O_WORKDIR=/shared/home/azureuser/demo,PBS_O_SYSTEM=Linux,
PBS_O_QUEUE=workq,
PBS_O_HOST=ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp
.net
comment = Job Array Began at Tue Nov 16 at 13:20 and finished
etime = Tue Nov 16 13:20:18 2021
Stageout_status = 1
Exit_status = 0
Submit_arguments = -l select=3 -l place=scatter:excl -J 1-3 ./hostname.sh
array = True
array_state_count = Queued:0 Running:0 Exiting:0 Expired:0
array_indices_submitted = 1-3
array_indices_remaining = -
history_timestamp = 1637068819
project = _pbs_project_default
Submit_Host = ip-ac1c0005.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.n
et
And here’s the job output:
$ cat hostname.o43.*
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008
stty: 'standard input': Inappropriate ioctl for device
ip-AC1C0008
You can see that all three subjobs ran on host ip-AC1C0008, even though I have 3 hosts:
$ pbsnodes -av
ip-AC1C0008
Mom = ip-ac1c0008.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
ntype = PBS
state = free
pcpus = 2
resources_available.arch = linux
resources_available.ccnodeid = 10eecad1-c2f2-4bb8-a502-81e9f05568eb
resources_available.disk = 20gb
resources_available.group_id = Standard_F2s_v2_pg0
resources_available.host = ip-ac1c0008
resources_available.mem = 4gb
resources_available.ncpus = 1
resources_available.ngpus = 0
resources_available.nodearray = execute
resources_available.slot_type = execute
resources_available.ungrouped = false
resources_available.vm_size = Standard_F2s_v2
resources_available.vnode = ip-AC1C0008
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue Nov 16 13:20:19 2021
last_used_time = Tue Nov 16 13:20:19 2021
ip-AC1C000B
Mom = ip-ac1c000b.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
ntype = PBS
state = free
pcpus = 2
resources_available.arch = linux
resources_available.ccnodeid = be53f01e-c682-4681-ac8d-402783e109f1
resources_available.disk = 20gb
resources_available.group_id = Standard_F2s_v2_pg0
resources_available.host = ip-ac1c000b
resources_available.mem = 4gb
resources_available.ncpus = 1
resources_available.ngpus = 0
resources_available.nodearray = execute
resources_available.slot_type = execute
resources_available.ungrouped = false
resources_available.vm_size = Standard_F2s_v2
resources_available.vnode = ip-AC1C000B
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue Nov 16 13:20:19 2021
last_used_time = Tue Nov 16 13:20:19 2021
ip-AC1C000D
Mom = ip-ac1c000d.3ffddhocb5vevods4vo0bz0jcf.ix.internal.cloudapp.net
ntype = PBS
state = free
pcpus = 2
resources_available.arch = linux
resources_available.ccnodeid = ababa327-c405-4d29-b5f1-11a75c38946e
resources_available.disk = 20gb
resources_available.group_id = Standard_F2s_v2_pg0
resources_available.host = ip-ac1c000d
resources_available.mem = 4gb
resources_available.ncpus = 1
resources_available.ngpus = 0
resources_available.nodearray = execute
resources_available.slot_type = execute
resources_available.ungrouped = false
resources_available.vm_size = Standard_F2s_v2
resources_available.vnode = ip-AC1C000D
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue Nov 16 13:20:19 2021
last_used_time = Tue Nov 16 13:20:19 2021
And here’s my server config:
$ qmgr -c 'p s'
#
# Create resources and set their properties.
#
#
# Create and define resource slot_type
#
create resource slot_type
set resource slot_type type = string
set resource slot_type flag = h
#
# Create and define resource instance_id
#
create resource instance_id
set resource instance_id type = string
set resource instance_id flag = h
#
# Create and define resource vm_size
#
create resource vm_size
set resource vm_size type = string
set resource vm_size flag = h
#
# Create and define resource nodearray
#
create resource nodearray
set resource nodearray type = string
set resource nodearray flag = h
#
# Create and define resource disk
#
create resource disk
set resource disk type = size
set resource disk flag = hn
#
# Create and define resource ngpus
#
create resource ngpus
set resource ngpus type = long
set resource ngpus flag = hn
#
# Create and define resource group_id
#
create resource group_id
set resource group_id type = string
set resource group_id flag = h
#
# Create and define resource ungrouped
#
create resource ungrouped
set resource ungrouped type = string
set resource ungrouped flag = h
#
# Create and define resource ccnodeid
#
create resource ccnodeid
set resource ccnodeid type = string
set resource ccnodeid flag = h
#
# Create queues and set their attributes.
#
#
# Create and define queue workq
#
create queue workq
set queue workq queue_type = Execution
set queue workq resources_default.place = scatter:excl
set queue workq resources_default.ungrouped = false
set queue workq enabled = True
set queue workq started = True
#
# Create and define queue htcq
#
create queue htcq
set queue htcq queue_type = Execution
set queue htcq resources_default.place = free
set queue htcq resources_default.ungrouped = true
set queue htcq enabled = True
set queue htcq started = True
#
# Set server attributes.
#
set server scheduling = True
set server managers = root@*
set server default_queue = workq
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 15
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server node_group_enable = True
set server node_group_key = group_id
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
What could be wrong? Please help!