PBS load-balancing issues across 4 execution queues

Hello,

We have a PBS cluster with a route queue and 4 execution queues. We set load_balancing: true ALL in /var/spool/pbs/sched_priv/sched_config, but jobs only run in 2 of the execution queues (xeon1700s/w). No jobs are running in the xeon1800s/w execution queues. Please help us identify the configuration issue. We have PBS Pro 14.0 installed in the environment.

The goal is to load-balance jobs across xeon1700s/xeon1800s and xeon1700w/xeon1800w based on how many CPUs are requested per job.

Route Queue - xeon1600
create queue xeon1600
set queue xeon1600 queue_type = Route
set queue xeon1600 route_destinations = "xeon1700s,xeon1800s,xeon1700w,xeon1800w"
set queue xeon1600 enabled = True
set queue xeon1600 started = True

create queue xeon1700s
set queue xeon1700s queue_type = Execution
set queue xeon1700s resources_max.ncpus = 80
set queue xeon1700s resources_min.ncpus = 1
set queue xeon1700s enabled = True
set queue xeon1700s started = True

create queue xeon1700w
set queue xeon1700w queue_type = Execution
set queue xeon1700w resources_max.ncpus = 4200
set queue xeon1700w resources_min.ncpus = 800
set queue xeon1700w enabled = True
set queue xeon1700w started = True

create queue xeon1800s
set queue xeon1800s queue_type = Execution
set queue xeon1800s resources_min.ncpus = 1
set queue xeon1800s resources_max.ncpus = 80
set queue xeon1800s enabled = True
set queue xeon1800s started = True

create queue xeon1800w
set queue xeon1800w queue_type = Execution
set queue xeon1800w resources_min.ncpus = 800
set queue xeon1800w resources_max.ncpus = 6280
set queue xeon1800w enabled = True
set queue xeon1800w started = True

qmgr config output file

Create resources and set their properties.

Create and define resource qlist

create resource qlist
set resource qlist type = string_array
set resource qlist flag = h

Create and define resource acfd_fluent_solver_lic

create resource acfd_fluent_solver_lic
set resource acfd_fluent_solver_lic type = long

Create and define resource acfd_cfx_solver_lic

create resource acfd_cfx_solver_lic
set resource acfd_cfx_solver_lic type = long

Create and define resource acfd_par_proc_lic

create resource acfd_par_proc_lic
set resource acfd_par_proc_lic type = long

Create queues and set their attributes.

Create and define queue xeon1300g

create queue xeon1300g
set queue xeon1300g queue_type = Execution
set queue xeon1300g resources_max.qlist = xeon1300g
set queue xeon1300g resources_min.qlist = xeon1300g
set queue xeon1300g resources_default.qlist = xeon1300g
set queue xeon1300g default_chunk.qlist = xeon1300g
set queue xeon1300g enabled = True
set queue xeon1300g started = True

Create and define queue xeon1400g

create queue xeon1400g
set queue xeon1400g queue_type = Execution
set queue xeon1400g resources_max.qlist = xeon1400g
set queue xeon1400g resources_min.qlist = xeon1400g
set queue xeon1400g resources_default.qlist = xeon1400g
set queue xeon1400g default_chunk.qlist = xeon1400g
set queue xeon1400g enabled = True
set queue xeon1400g started = True

Create and define queue xeon1300

create queue xeon1300
set queue xeon1300 queue_type = Execution
set queue xeon1300 resources_max.qlist = xeon1300
set queue xeon1300 resources_min.qlist = xeon1300
set queue xeon1300 resources_default.qlist = xeon1300
set queue xeon1300 default_chunk.qlist = xeon1300
set queue xeon1300 enabled = True
set queue xeon1300 started = True

Create and define queue xeon10g11g

create queue xeon10g11g
set queue xeon10g11g queue_type = Execution
set queue xeon10g11g resources_max.qlist = xeon10g11g
set queue xeon10g11g resources_min.qlist = xeon10g11g
set queue xeon10g11g resources_default.qlist = xeon10g11g
set queue xeon10g11g default_chunk.qlist = xeon10g11g
set queue xeon10g11g enabled = True
set queue xeon10g11g started = True

Create and define queue xeon1500

create queue xeon1500
set queue xeon1500 queue_type = Execution
set queue xeon1500 resources_max.qlist = xeon1500
set queue xeon1500 resources_min.qlist = xeon1500
set queue xeon1500 resources_default.qlist = xeon1500
set queue xeon1500 default_chunk.qlist = xeon1500
set queue xeon1500 enabled = True
set queue xeon1500 started = True

Create and define queue xeon1400

create queue xeon1400
set queue xeon1400 queue_type = Execution
set queue xeon1400 resources_max.qlist = xeon1400
set queue xeon1400 resources_min.qlist = xeon1400
set queue xeon1400 resources_default.qlist = xeon1400
set queue xeon1400 default_chunk.qlist = xeon1400
set queue xeon1400 enabled = True
set queue xeon1400 started = True

Create and define queue xeon10pri

create queue xeon10pri
set queue xeon10pri queue_type = Execution
set queue xeon10pri resources_max.qlist = xeon10pri
set queue xeon10pri resources_min.qlist = xeon10pri
set queue xeon10pri resources_default.qlist = xeon10pri
set queue xeon10pri default_chunk.qlist = xeon10pri
set queue xeon10pri enabled = True
set queue xeon10pri started = True

Create and define queue xeon900

create queue xeon900
set queue xeon900 queue_type = Execution
set queue xeon900 resources_max.qlist = xeon900
set queue xeon900 resources_min.qlist = xeon900
set queue xeon900 resources_default.qlist = xeon900
set queue xeon900 default_chunk.qlist = xeon900
set queue xeon900 enabled = True
set queue xeon900 started = True

Create and define queue xeon1700

create queue xeon1700
set queue xeon1700 queue_type = Execution
set queue xeon1700 resources_max.qlist = xeon1700
set queue xeon1700 resources_min.qlist = xeon1700
set queue xeon1700 resources_default.qlist = xeon1700
set queue xeon1700 default_chunk.qlist = xeon1700
set queue xeon1700 enabled = False
set queue xeon1700 started = False

Create and define queue xeon1200g-1400g

create queue xeon1200g-1400g
set queue xeon1200g-1400g queue_type = Execution
set queue xeon1200g-1400g resources_max.qlist = xeon1200g-1400g
set queue xeon1200g-1400g resources_min.qlist = xeon1200g-1400g
set queue xeon1200g-1400g resources_default.qlist = xeon1200g-1400g
set queue xeon1200g-1400g default_chunk.qlist = xeon1200g-1400g
set queue xeon1200g-1400g enabled = True
set queue xeon1200g-1400g started = True

Create and define queue xeon1800

create queue xeon1800
set queue xeon1800 queue_type = Execution
set queue xeon1800 resources_max.qlist = xeon1800
set queue xeon1800 resources_min.qlist = xeon1800
set queue xeon1800 resources_default.qlist = xeon1800
set queue xeon1800 default_chunk.qlist = xeon1800
set queue xeon1800 enabled = False
set queue xeon1800 started = False

Create and define queue xeon1000

create queue xeon1000
set queue xeon1000 queue_type = Execution
set queue xeon1000 resources_max.qlist = xeon1000
set queue xeon1000 resources_min.qlist = xeon1000
set queue xeon1000 resources_default.qlist = xeon1000
set queue xeon1000 default_chunk.qlist = xeon1000
set queue xeon1000 enabled = True
set queue xeon1000 started = True

Create and define queue xeon1100

create queue xeon1100
set queue xeon1100 queue_type = Execution
set queue xeon1100 resources_max.qlist = xeon1100
set queue xeon1100 resources_min.qlist = xeon1100
set queue xeon1100 resources_default.qlist = xeon1100
set queue xeon1100 default_chunk.qlist = xeon1100
set queue xeon1100 enabled = True
set queue xeon1100 started = True

Create and define queue xeon1100g

create queue xeon1100g
set queue xeon1100g queue_type = Execution
set queue xeon1100g resources_max.qlist = xeon1100g
set queue xeon1100g resources_min.qlist = xeon1100g
set queue xeon1100g resources_default.qlist = xeon1100g
set queue xeon1100g default_chunk.qlist = xeon1100g
set queue xeon1100g enabled = True
set queue xeon1100g started = True

Create and define queue xeon1200

create queue xeon1200
set queue xeon1200 queue_type = Execution
set queue xeon1200 resources_max.qlist = xeon1200
set queue xeon1200 resources_min.qlist = xeon1200
set queue xeon1200 resources_default.qlist = xeon1200
set queue xeon1200 default_chunk.qlist = xeon1200
set queue xeon1200 enabled = True
set queue xeon1200 started = True

Create and define queue xeon1200g

create queue xeon1200g
set queue xeon1200g queue_type = Execution
set queue xeon1200g resources_max.qlist = xeon1200g
set queue xeon1200g resources_min.qlist = xeon1200g
set queue xeon1200g resources_default.qlist = xeon1200g
set queue xeon1200g default_chunk.qlist = xeon1200g
set queue xeon1200g enabled = True
set queue xeon1200g started = True

Create and define queue xeon1000g

create queue xeon1000g
set queue xeon1000g queue_type = Execution
set queue xeon1000g resources_max.qlist = xeon1000g
set queue xeon1000g resources_min.qlist = xeon1000g
set queue xeon1000g resources_default.qlist = xeon1000g
set queue xeon1000g default_chunk.qlist = xeon1000g
set queue xeon1000g enabled = True
set queue xeon1000g started = True

Create and define queue xeon1700s

create queue xeon1700s
set queue xeon1700s queue_type = Execution
set queue xeon1700s resources_max.ncpus = 80
set queue xeon1700s resources_max.qlist = xeon1700s
set queue xeon1700s resources_min.ncpus = 1
set queue xeon1700s resources_min.qlist = xeon1700s
set queue xeon1700s resources_default.qlist = xeon1700s
set queue xeon1700s default_chunk.qlist = xeon1700s
set queue xeon1700s enabled = True
set queue xeon1700s started = True

Create and define queue xeon1700w

create queue xeon1700w
set queue xeon1700w queue_type = Execution
set queue xeon1700w resources_max.ncpus = 6280
set queue xeon1700w resources_max.qlist = xeon1700w
set queue xeon1700w resources_min.ncpus = 81
set queue xeon1700w resources_min.qlist = xeon1700w
set queue xeon1700w resources_default.qlist = xeon1700w
set queue xeon1700w default_chunk.qlist = xeon1700w
set queue xeon1700w enabled = True
set queue xeon1700w started = True

Create and define queue xeon1800s

create queue xeon1800s
set queue xeon1800s queue_type = Execution
set queue xeon1800s resources_max.ncpus = 80
set queue xeon1800s resources_max.qlist = xeon1800s
set queue xeon1800s resources_min.ncpus = 1
set queue xeon1800s resources_min.qlist = xeon1800s
set queue xeon1800s resources_default.qlist = xeon1800s
set queue xeon1800s default_chunk.qlist = xeon1800s
set queue xeon1800s max_user_run = 5
set queue xeon1800s enabled = True
set queue xeon1800s started = True

Create and define queue xeon1800w

create queue xeon1800w
set queue xeon1800w queue_type = Execution
set queue xeon1800w resources_max.qlist = xeon1800w
set queue xeon1800w resources_min.ncpus = 81
set queue xeon1800w resources_min.qlist = xeon1800w
set queue xeon1800w resources_default.qlist = xeon1800w
set queue xeon1800w default_chunk.qlist = xeon1800w
set queue xeon1800w max_user_run = 5
set queue xeon1800w enabled = True
set queue xeon1800w started = True

Create and define queue xeon1600

create queue xeon1600
set queue xeon1600 queue_type = Route
set queue xeon1600 route_destinations = xeon1700s
set queue xeon1600 route_destinations += xeon1800s
set queue xeon1600 route_destinations += xeon1700w
set queue xeon1600 route_destinations += xeon1800w
set queue xeon1600 enabled = True
set queue xeon1600 started = True

Set server attributes.

set server scheduling = True
set server default_queue = xeon1600
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server license_count = Avail_Global:1000000 Avail_Local:1000000 Used:0 High_Use:0 Avail_Sockets:1000000 Unused_Sockets:1000000
set server eligible_time_enable = False
set server max_concurrent_provision = 5

Thank you for the above details.

Routing queues are used only to route a request to specific queues based on what the user has requested, and sometimes to keep users from submitting directly to execution queues. Routing queues do not provide load balancing. load_balancing is based on the load of the compute nodes (many mom_priv/config parameters related to load are considered). You might want to look at Peer Scheduling; that might help.

Otherwise, please read the below

Please check the PBS Pro Administrator guide at this link: https://www.altair.com/pdfs/pbsworks/PBSAdminGuide2020.1.pdf

I think you could implement the below:

FYI: I know you have mentioned that your setup is on PBS Pro 14.0

The load_balancing scheduler parameter (deprecated as of version 2020.1) controls a behavior wherein a scheduler won’t place a job where the anticipated load would exceed $max_load. For example if a machine has a load of 1.25, is running a 1-CPU job, and has 2 CPUs, PBS won’t place another 1-CPU job there.

Hello,

Okay, I see. The load_balancing parameter won’t help us achieve the goal of distributing the workload evenly across 4 slurm execution queues.

  • Can we use peer scheduling in a single slurm cluster, or do we have to separate the resources into multiple clusters?
  • Is there a way to evenly distribute the workload across 4 execution queues using vnodes? Do we “create resource aveload type=” for each execution queue?

Best regards,

Andrei

I hope you meant PBS Pro OSS. The following should meet your requirement:

  • peer scheduling
  • sorting vnodes as per the load average (as per the 4.9.50.3)

Thank you

Hi, yes, sorry, PBS Pro OSS. Thank you for the recommendations. I will try to implement vnodes and peer scheduling on our test system.

1 Like

Hi,

It looks like peer scheduling leverages multiple clusters/execution systems when resources are available, but this is not true load balancing. How can we achieve true load balancing across 4 execution queues when jobs are submitted to the default route queue?

“Peer to Peer” scheduling: A site can have multiple PBS
Pro clusters (each cluster has its server, scheduler and one
or more execution systems). A scheduler in any given
cluster can be configured to move jobs from other clusters
to its cluster when the resources required by the job are
available locally

Thanks in advance

According to the guide, the routing queue should try destinations in round-robin fashion, in the order listed, but jobs are only being routed to the xeon1700s/w queues.

#Create and define queue xeon1600
create queue xeon1600
set queue xeon1600 queue_type = Route
set queue xeon1600 route_destinations = xeon1700s
set queue xeon1600 route_destinations += xeon1800s
set queue xeon1600 route_destinations += xeon1700w
set queue xeon1600 route_destinations += xeon1800w
set queue xeon1600 enabled = True
set queue xeon1600 started = True

2.3.6 Routing Queues A routing queue is used only to route jobs; jobs cannot run from a routing queue. A routing queue has the following properties:
• Can route to multiple destinations
• Tries destinations in round-robin fashion, in the order listed
• Can route to execution queues
• Can route to other routing queues
• Can route to queues in other complexes (at other servers)

Destinations can be specified in the following ways:
route_destinations = Q1
route_destinations = Q1@Server1
route_destinations = "Q1, Q2@Server1, Q3@Server2"
route_destinations += Q1
route_destinations += "Q4, Q5@Server3"

Could you please share your qsub statements along with your assessment of which queue should have received it based on the above routing/execution queue configuration.

Qsub statement

### Job Name
#PBS -N mpi_job
### Project code
#PBS -A MPI_Test
#PBS -l walltime=00:06:16
#PBS -q xeon1600
### Merge output and error files
#PBS -j oe
#PBS -k eod
### Select 4 chunks, each with 10 CPUs and 16 MPI processes
#PBS -lselect=4:ncpus=10:mpiprocs=16:switch=10gBE
### Send email on abort, begin and end
#PBS -m abe
### Specify mail recipient
##PBS -M john.fowler@xd-lab.net 
export I_MPI_FABRICS=ofi:tcp 
export I_MPI_STATS=ipm
echo "Start" 
cd ~/tpbs
date
ls 
date
echo "MPIRUN"
# mpirun -n 2 -iface eno1 -verbose IMB-P2P
# mpirun  -iface eno1 -verbose IMB-P2P
mpirun -l -v -n 160  IMB-MPI1
echo "Sleep" 
sleep 300
echo "End" 

We have round_robin: True all set and expect jobs to be distributed round-robin across the 2 execution queues.

/var/spool/pbs/sched_priv/sched_config
backfill_prime:	false	ALL
by_queue: True		non_prime
by_queue: True		prime
dedicated_prefix: ded
fairshare_decay_factor: 0.5
fairshare_decay_time: 24:00:00
fairshare_entity: euser
fair_share: false	ALL
fairshare_usage_res: cput
help_starving_jobs:	true	ALL
max_starve: 24:00:00
node_sort_key: "sort_priority HIGH"	ALL
nonprimetime_prefix: np_
preemptive_sched: true	ALL
prime_exempt_anytime_queues:	false
primetime_prefix: p_
provision_policy: "aggressive_provision"
resources: "ncpus, mem, arch, host, vnode, aoe, eoe, Qlist, acfd_fluent_solver_lic, acfd_cfx_solver_lic, acfd_par_proc_lic"
round_robin: True	all
smp_cluster_dist: pack


$ qstat
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
197.lssd530-hs05  mpi-test.sh      andrei            00:00:00 R xeon1800s       
198.lssd530-hs05  mpi-test.sh      andrei            00:00:00 R xeon1800s       
199.lssd530-hs05  mpi-test.sh      andrei            00:00:00 R xeon1800s       
200.lssd530-hs05  mpi-test.sh      andrei            00:00:00 R xeon1800s       
201.lssd530-hs05  mpi-test.sh      andrei            00:00:00 R xeon1800s       
202.lssd530-hs05  mpi-test.sh      andrei                   0 Q xeon1800s       
203.lssd530-hs05  t.sh             john                     0 Q xeon1800s       
204.lssd530-hs05  t.sh             john                     0 Q xeon1800s       
205.lssd530-hs05  t.sh             john                     0 Q xeon1800s       
206.lssd530-hs05  t.sh             john                     0 Q xeon1800s 

$ pbsnodes -a
lssd530-cs10
     Mom = lssd530-cs10
     ntype = PBS
     state = free
     pcpus = 104
     resources_available.arch = linux
     resources_available.host = lssd530-cs10
     resources_available.mem = 395571708kb
     resources_available.ncpus = 104
     resources_available.Qlist = xeon1700w,xeon1700s
     resources_available.vnode = lssd530-cs10
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     queue = xeon1700s
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Thu Feb  4 02:52:08 2021
     last_used_time = Thu Feb  4 02:50:14 2021

lssd530-cs12
     Mom = lssd530-cs12
     ntype = PBS
     state = free
     pcpus = 104
     jobs = 202.lssd530-hs05/0, 202.lssd530-hs05/1, 202.lssd530-hs05/2, 202.lssd530-hs05/3, 202.lssd530-hs05/4, 202.lssd530-hs05/5, 202.lssd530-hs05/6, 202.lssd530-hs05/7, 202.lssd530-hs05/8, 202.lssd530-hs05/9, 202.lssd530-hs05/10, 202.lssd530-hs05/11, 202.lssd530-hs05/12, 202.lssd530-hs05/13, 202.lssd530-hs05/14, 202.lssd530-hs05/15, 202.lssd530-hs05/16, 202.lssd530-hs05/17, 202.lssd530-hs05/18, 202.lssd530-hs05/19, 202.lssd530-hs05/20, 202.lssd530-hs05/21, 202.lssd530-hs05/22, 202.lssd530-hs05/23, 202.lssd530-hs05/24, 202.lssd530-hs05/25, 202.lssd530-hs05/26, 202.lssd530-hs05/27, 202.lssd530-hs05/28, 202.lssd530-hs05/29, 202.lssd530-hs05/30, 202.lssd530-hs05/31, 202.lssd530-hs05/32, 202.lssd530-hs05/33, 202.lssd530-hs05/34, 202.lssd530-hs05/35, 202.lssd530-hs05/36, 202.lssd530-hs05/37, 202.lssd530-hs05/38, 202.lssd530-hs05/39, 203.lssd530-hs05/40, 203.lssd530-hs05/41, 203.lssd530-hs05/42, 203.lssd530-hs05/43, 203.lssd530-hs05/44, 204.lssd530-hs05/45, 204.lssd530-hs05/46, 204.lssd530-hs05/47, 204.lssd530-hs05/48, 204.lssd530-hs05/49, 205.lssd530-hs05/50, 205.lssd530-hs05/51, 205.lssd530-hs05/52, 205.lssd530-hs05/53, 205.lssd530-hs05/54, 206.lssd530-hs05/55, 206.lssd530-hs05/56, 206.lssd530-hs05/57, 206.lssd530-hs05/58, 206.lssd530-hs05/59
     resources_available.arch = linux
     resources_available.host = lssd530-cs12
     resources_available.mem = 395571708kb
     resources_available.ncpus = 104
     resources_available.Qlist = xeon1800w,xeon1800s
     resources_available.vnode = lssd530-cs12
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 60
     resources_assigned.vmem = 0kb
     queue = xeon1800s
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Fri Feb  5 12:29:23 2021
     last_used_time = Fri Feb  5 13:12:49 2021

lssd530-cs13
     Mom = lssd530-cs13
     ntype = PBS
     state = free
     pcpus = 104
     resources_available.arch = linux
     resources_available.host = lssd530-cs13
     resources_available.mem = 395571708kb
     resources_available.ncpus = 104
     resources_available.Qlist = xeon1800w,xeon1800s
     resources_available.vnode = lssd530-cs13
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     queue = xeon1800s
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Fri Feb  5 12:29:38 2021
     last_used_time = Fri Feb  5 13:12:49 2021

lssd530-cs09
     Mom = lssd530-cs09
     ntype = PBS
     state = free
     pcpus = 52
     resources_available.arch = linux
     resources_available.host = lssd530-cs09
     resources_available.mem = 65744424kb
     resources_available.ncpus = 104
     resources_available.Qlist = xeon1700w,xeon1700s
     resources_available.vnode = lssd530-cs09
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     queue = xeon1700s
     resv_enable = True
     sharing = default_shared
     last_state_change_time = Thu Feb  4 02:52:08 2021
     last_used_time = Thu Feb  4 02:50:14 2021