Hello,
We have a PBS cluster with one routing queue and two execution queues.
The scheduler is configured with "round_robin: true ALL" in /var/spool/pbs/sched_priv/sched_config, and both execution queues have the same priority:
set queue xeon1700s Priority = 100
set queue xeon1800s Priority = 100
Contents of /var/spool/pbs/sched_priv/sched_config:
backfill_prime: false ALL
by_queue: True non_prime
by_queue: True prime
dedicated_prefix: ded
fairshare_decay_factor: 0.5
fairshare_decay_time: 24:00:00
fairshare_entity: euser
fair_share: false ALL
fairshare_usage_res: cput
help_starving_jobs: true ALL
max_starve: 24:00:00
node_sort_key: "sort_priority HIGH" ALL
nonprimetime_prefix: np_
preemptive_sched: true ALL
prime_exempt_anytime_queues: false
primetime_prefix: p_
provision_policy: "aggressive_provision"
resources: "ncpus, mem, arch, host, vnode, aoe, eoe, Qlist, acfd_fluent_solver_lic, acfd_cfx_solver_lic, acfd_par_proc_lic"
round_robin: True all
smp_cluster_dist: pack
However, jobs only run in the xeon1800s execution queue. PBS leaves jobs in the Q state when the xeon1800s queue is full, and no jobs are routed to the xeon1700s execution queue. Please help us find the configuration issue. We have OpenPBS 20 installed in this environment.
The goal is to round-robin jobs across the xeon1700s and xeon1800s queues.
$ qstat -Q
Queue Max Tot Ena Str Que Run Hld Wat Trn Ext Type
---------------- ----- ----- --- --- ----- ----- ----- ----- ----- ----- ----
workq 0 0 yes yes 0 0 0 0 0 0 Exe*
xeon1800s 0 1 yes yes 0 1 0 0 0 0 Exe*
xeon1800w 0 0 yes yes 0 0 0 0 0 0 Exe*
xeon1700w 0 0 yes yes 0 0 0 0 0 0 Exe*
xeon1700s 0 0 yes yes 0 0 0 0 0 0 Exe*
xeon1600 0 0 yes yes 0 0 0 0 0 0 Rou*
$ qstat
Job id Name User Time Use S Queue
---------------- ---------------- ---------------- -------- - -----
216.lssd530-hs05 mpi-test.sh andrei 00:00:00 R xeon1800s
217.lssd530-hs05 mpi-test.sh andrei 00:00:00 R xeon1800s
218.lssd530-hs05 mpi-test.sh andrei 00:00:00 R xeon1800s
219.lssd530-hs05 mpi-test.sh andrei 00:00:00 R xeon1800s
220.lssd530-hs05 mpi-test.sh andrei 00:00:00 R xeon1800s
221.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
222.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
223.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
224.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
225.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
226.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
227.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
228.lssd530-hs05 mpi-test.sh andrei 0 Q xeon1800s
create queue xeon1600
set queue xeon1600 queue_type = Route
set queue xeon1600 route_destinations = xeon1800s
set queue xeon1600 route_destinations += xeon1700s
set queue xeon1600 enabled = True
set queue xeon1600 started = True
create queue xeon1700s
set queue xeon1700s queue_type = Execution
set queue xeon1700s Priority = 100
set queue xeon1700s resources_max.ncpus = 80
set queue xeon1700s resources_max.Qlist = xeon1700s
set queue xeon1700s resources_min.ncpus = 1
set queue xeon1700s resources_min.Qlist = xeon1700s
set queue xeon1700s resources_default.Qlist = xeon1700s
set queue xeon1700s default_chunk.Qlist = xeon1700s
set queue xeon1700s max_user_run = 9999
set queue xeon1700s enabled = True
set queue xeon1700s started = True
create queue xeon1800s
set queue xeon1800s queue_type = Execution
set queue xeon1800s Priority = 100
set queue xeon1800s resources_max.ncpus = 80
set queue xeon1800s resources_max.Qlist = xeon1800s
set queue xeon1800s resources_min.ncpus = 1
set queue xeon1800s resources_min.Qlist = xeon1800s
set queue xeon1800s resources_default.Qlist = xeon1800s
set queue xeon1800s default_chunk.Qlist = xeon1800s
set queue xeon1800s max_user_run = 9999
set queue xeon1800s enabled = True
set queue xeon1800s started = True
# Set server attributes.
#
set server scheduling = True
set server default_queue = xeon1600
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
$ pbsnodes -a
lssd530-cs10
Mom = lssd530-cs10
ntype = PBS
state = free
pcpus = 104
resources_available.arch = linux
resources_available.host = lssd530-cs10
resources_available.mem = 395571708kb
resources_available.ncpus = 104
resources_available.Qlist = xeon1700w,xeon1700s
resources_available.vnode = lssd530-cs10
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
queue = xeon1700s
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Feb 4 02:52:08 2021
last_used_time = Thu Feb 4 02:50:14 2021
lssd530-cs12
Mom = lssd530-cs12
ntype = PBS
state = free
pcpus = 104
jobs = 207.lssd530-hs05/0, 207.lssd530-hs05/1, 207.lssd530-hs05/2, 207.lssd530-hs05/3, 207.lssd530-hs05/4, 208.lssd530-hs05/5, 208.lssd530-hs05/6, 208.lssd530-hs05/7, 208.lssd530-hs05/8, 208.lssd530-hs05/9, 209.lssd530-hs05/10, 209.lssd530-hs05/11, 209.lssd530-hs05/12, 209.lssd530-hs05/13, 209.lssd530-hs05/14, 210.lssd530-hs05/15, 210.lssd530-hs05/16, 210.lssd530-hs05/17, 210.lssd530-hs05/18, 210.lssd530-hs05/19, 211.lssd530-hs05/20, 211.lssd530-hs05/21, 211.lssd530-hs05/22, 211.lssd530-hs05/23, 211.lssd530-hs05/24, 212.lssd530-hs05/25, 212.lssd530-hs05/26, 212.lssd530-hs05/27, 212.lssd530-hs05/28, 212.lssd530-hs05/29, 213.lssd530-hs05/30, 213.lssd530-hs05/31, 213.lssd530-hs05/32, 213.lssd530-hs05/33, 213.lssd530-hs05/34, 214.lssd530-hs05/35, 214.lssd530-hs05/36, 214.lssd530-hs05/37, 214.lssd530-hs05/38, 214.lssd530-hs05/39, 215.lssd530-hs05/40, 215.lssd530-hs05/41, 215.lssd530-hs05/42, 215.lssd530-hs05/43, 215.lssd530-hs05/44
resources_available.arch = linux
resources_available.host = lssd530-cs12
resources_available.mem = 395571708kb
resources_available.ncpus = 104
resources_available.Qlist = xeon1800w,xeon1800s
resources_available.vnode = lssd530-cs12
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 45
resources_assigned.vmem = 0kb
queue = xeon1800s
resv_enable = True
sharing = default_shared
last_state_change_time = Fri Feb 5 12:29:23 2021
last_used_time = Fri Feb 5 13:18:56 2021
lssd530-cs13
Mom = lssd530-cs13
ntype = PBS
state = free
pcpus = 104
resources_available.arch = linux
resources_available.host = lssd530-cs13
resources_available.mem = 395571708kb
resources_available.ncpus = 104
resources_available.Qlist = xeon1800w,xeon1800s
resources_available.vnode = lssd530-cs13
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
queue = xeon1800s
resv_enable = True
sharing = default_shared
last_state_change_time = Fri Feb 5 12:29:38 2021
last_used_time = Fri Feb 5 13:12:49 2021
lssd530-cs09
Mom = lssd530-cs09
ntype = PBS
state = free
pcpus = 52
resources_available.arch = linux
resources_available.host = lssd530-cs09
resources_available.mem = 65744424kb
resources_available.ncpus = 104
resources_available.Qlist = xeon1700w,xeon1700s
resources_available.vnode = lssd530-cs09
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
queue = xeon1700s
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Feb 4 02:52:08 2021
last_used_time = Thu Feb 4 02:50:14 2021