When I submit a job to a cluster of 4 nodes and 1 head node, the job gets allocated to only one node. Is there any specific configuration that I could have left out?
[hpcuser@mgt1 ~]$ qsub -I -l select=2:ncpus=12;mpiprocs=12
qsub: waiting for job 7.mgt1 to start
qsub: job 7.mgt1 ready
[hpcuser@mgt1 ~]$ qsub -I -l select=2:ncpus=12:mpiprocs=12
[hpcuser@mgt1 ~]$ qstat -an
mgt1:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
1.mgt1 hpcuser workq STDIN 25701 2 20 -- -- R 01:26
node01/0*10+node01/1*10
7.mgt1 hpcuser workq STDIN 26470 2 24 -- -- R 00:00
node01/2*12+node01/3*12
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
The qstat -f output for job ID 1.mgt1 is shown below:
[root@mgt1 test]# qstat -f
Job Id: 1.mgt1
Job_Name = STDIN
Job_Owner = hpcuser@mgt1
resources_used.cpupercent = 98
resources_used.cput = 00:45:26
resources_used.mem = 48141760kb
resources_used.ncpus = 20
resources_used.vmem = 48904932kb
resources_used.walltime = 01:23:10
job_state = R
queue = workq
server = mgt1
Checkpoint = u
ctime = Thu Mar 21 18:34:41 2019
Error_Path = /dev/pts/0
exec_host = node01/0*10+node01/1*10
exec_vnode = (node01:ncpus=10)+(node01:ncpus=10)
Hold_Types = n
interactive = True
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Thu Mar 21 19:57:50 2019
Output_Path = /dev/pts/0
Priority = 0
qtime = Thu Mar 21 18:34:41 2019
Rerunable = False
Resource_List.ncpus = 20
Resource_List.nodect = 2
Resource_List.place = free
Resource_List.select = 2:ncpus=10
schedselect = 2:ncpus=10
stime = Thu Mar 21 18:34:41 2019
session_id = 25701
jobdir = /nfsshare/home/hpcuser
substate = 42
Variable_List = PBS_O_HOME=/nfsshare/home/hpcuser,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=hpcuser,
PBS_O_PATH=/opt/xcat/bin:/opt/xcat/sbin:/opt/xcat/share/xcat/tools:/op
t/hpc/ferret/ferret-7.4.4-RHEL7-64/bin:/opt/xcat/bin:/opt/xcat/sbin:/op
t/xcat/share/xcat/tools:/usr/local/bin:/usr/local/sbin:/usr/bin:/usr/sb
in:/bin:/sbin:/opt/pbs/bin:/nfsshare/home/hpcuser/.local/bin:/nfsshare/
home/hpcuser/bin:/opt/pbs/bin,PBS_O_MAIL=/var/spool/mail/hpcuser,
PBS_O_SHELL=/bin/bash,PBS_O_WORKDIR=/nfsshare/home/hpcuser,
PBS_O_SYSTEM=Linux,PBS_O_QUEUE=workq,PBS_O_HOST=mgt1
euser = hpcuser
egroup = hpcuser
hashname = 1.mgt1
queue_rank = 1553173481022
queue_type = E
comment = Job run at Thu Mar 21 at 18:34 on (node01:ncpus=10)+(node01:ncpus
=10)
etime = Thu Mar 21 18:34:41 2019
run_count = 1
Submit_arguments = -I -X -l select=2:ncpus=10
project = _pbs_project_default
forward_x11_cookie = MIT-MAGIC-COOKIE-1:39c4016e0ca635f02ef3b35c73e44524:0
forward_x11_port = True
run_version = 1
+++++++++++++++++++++++++++++++++++++++++++++++++++
The pbsnodes -a output shows the node details as below:
[root@mgt1 ~]# pbsnodes -a
node01
Mom = node01
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node01
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node01
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019
node02
Mom = node02
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node02
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node02
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019
node03
Mom = node03
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 48
resources_available.arch = linux
resources_available.host = node03
resources_available.mem = 65184884kb
resources_available.ncpus = 48
resources_available.vnode = node03
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019
node04
Mom = node04
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 24
resources_available.arch = linux
resources_available.host = node04
resources_available.mem = 65185004kb
resources_available.ncpus = 24
resources_available.vnode = node04
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Mar 21 13:43:54 2019