How to scatter jobs over vnodes?

[~]pbsnodes -aSj
                                      mem          ncpus  nmics  ngpus
vnode       state  njobs  run  susp   f/t          f/t    f/t    f/t    jobs
compute-00  free   0      0    0      376gb/376gb  72/72  0/0    0/0    --
compute-01  free   0      0    0      376gb/376gb  72/72  0/0    0/0    --
compute-02  free   6      6    0      236gb/376gb  24/72  0/0    0/0    22695,22697,22698,22699,22700,22989
compute-03  free   0      0    0      376gb/376gb  72/72  0/0    0/0    --

[~]# qmgr -c 'p s'

Create queues and set their attributes.

Create and define queue workq

create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True

Create and define queue default

create queue default
set queue default queue_type = Execution
set queue default Priority = 50
set queue default resources_max.mem = 24000mb
set queue default resources_max.ncpus = 16
set queue default resources_max.nodes = 4
set queue default resources_max.walltime = 96:00:00
set queue default resources_default.ncpus = 1
set queue default resources_default.walltime = 24:00:00
set queue default enabled = True
set queue default started = True

Set server attributes.

set server scheduling = True
set server acl_roots = rcole
set server default_queue = default
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server default_qsub_arguments = -V
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server job_history_duration = 7200:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999

Thank you for these details.

Your queue "default" is mapped to the node compute-00; please check the output of pbsnodes compute-00. Hence, any job submitted to the "default" queue is placed on compute-00. Also, in your qmgr -c 'p s' output, the default_queue attribute is set to "default".
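You can see that mapping directly on the node, for example (node name taken from your output above):

pbsnodes compute-00 | grep queue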

Solution:

  1. submit jobs to workq, or
  2. as the root user, run qmgr -c "unset node compute-00 queue"

Hope this resolves your issue.

So are you saying that I should remove that setting for all compute nodes?
qmgr -c "unset node compute-00 queue" (and likewise for 01, 02, 03)
I have that set for all compute nodes.
[root@hmrihpcp02 ~]# pbsnodes -a | grep -E 'queue|Mom'
Mom = compute-00.local.cluster
queue = default
Mom = compute-01.local.cluster
queue = default
Mom = compute-02.local.cluster
queue = default
Mom = compute-03.local.cluster
queue = default

Yes, you have to unset the node-to-queue mapping on all the nodes. If it is set on all the nodes, then even jobs submitted to workq will not run on them, because those nodes are dedicated to running jobs from the default queue.

Unset it on all the nodes and submit jobs to either queue; then it should work.
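For example, as root (node names taken from your pbsnodes output; adjust as needed):

# remove the queue association from each node
for n in compute-00 compute-01 compute-02 compute-03; do
    qmgr -c "unset node $n queue"
done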

Recommendation:
Please unset it on all the nodes and implement Qlists instead (search the PBS Admin Guide or this community for the Qlist discussions); a rough sketch follows.
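A minimal Qlist sketch, assuming you keep the queue names above and choose "Qlist" as the custom resource name (run as root; see the Admin Guide for the full procedure):

# create a host-level custom string_array resource, and add "Qlist" to the
# "resources:" line in sched_config
qmgr -c "create resource Qlist type=string_array, flag=h"
# tag each node with the queue(s) allowed to run on it, e.g. for compute-00
qmgr -c "set node compute-00 resources_available.Qlist = workq"
# make jobs in a queue request that tag by default
qmgr -c "set queue workq default_chunk.Qlist = workq"
# then HUP or restart the scheduler so it picks up the new resource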

Still the same result after the 'unset' on all nodes. I didn't do Qlists yet, but I have the following config and it still packs each node before moving to the next one.

Config Details:

[]# pbsnodes -a
compute-00
Mom = compute-00.local.cluster
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 72
resources_available.arch = linux
resources_available.host = compute-00
resources_available.mem = 394618508kb
resources_available.ncpus = 72
resources_available.vnode = compute-00
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue May 19 07:26:20 2020
last_used_time = Tue May 19 07:29:29 2020

compute-01
Mom = compute-01.local.cluster
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 72
resources_available.arch = linux
resources_available.host = compute-01
resources_available.mem = 394618508kb
resources_available.ncpus = 72
resources_available.vnode = compute-01
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue May 19 07:26:12 2020
last_used_time = Tue May 19 07:28:49 2020

compute-02
Mom = compute-02.local.cluster
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 72
resources_available.arch = linux
resources_available.host = compute-02
resources_available.mem = 394618508kb
resources_available.ncpus = 72
resources_available.vnode = compute-02
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue May 19 07:24:30 2020
last_used_time = Tue May 19 07:28:14 2020

compute-03
Mom = compute-03.local.cluster
Port = 15002
pbs_version = 19.1.1
ntype = PBS
state = free
pcpus = 72
resources_available.arch = linux
resources_available.host = compute-03
resources_available.mem = 394618508kb
resources_available.ncpus = 72
resources_available.vnode = compute-03
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Tue May 19 07:24:30 2020
last_used_time = Tue May 19 07:14:41 2020

[]# qmgr
Max open servers: 49
Qmgr: p s

Create queues and set their attributes.

Create and define queue workq

create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True

Create and define queue default

create queue default
set queue default queue_type = Execution
set queue default Priority = 50
set queue default resources_max.mem = 24000mb
set queue default resources_max.ncpus = 16
set queue default resources_max.nodes = 4
set queue default resources_max.walltime = 96:00:00
set queue default resources_default.ncpus = 1
set queue default resources_default.walltime = 24:00:00
set queue default enabled = True
set queue default started = True

Set server attributes.

set server scheduling = True
set server acl_roots = rcole
set server default_queue = default
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server default_qsub_arguments = -V
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server job_history_duration = 7200:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999

[]# cat pbs_sched_config
round_robin: False all
by_queue: True prime
by_queue: True non_prime
strict_ordering: false ALL
help_starving_jobs: true ALL
max_starve: 24:00:00
backfill_prime: false ALL
prime_exempt_anytime_queues: false
primetime_prefix: p_
nonprimetime_prefix: np_
node_sort_key: "ncpus HIGH unused" all
provision_policy: "aggressive_provision"
sort_queues: true ALL
resources: "ncpus, mem, arch, host, vnode, aoe, eoe"
load_balancing: true ALL
#smp_cluster_dist: lowest_load
#smp_cluster_dist: round_robin
fair_share: true ALL
unknown_shares: 10
fairshare_usage_res: cput
fairshare_entity: euser
fairshare_decay_time: 24:00:00
fairshare_decay_factor: 0.5
preemptive_sched: true ALL
preempt_queue_prio: 150
preempt_prio: "express_queue, normal_jobs"
preempt_order: "SCR"
preempt_sort: min_time_since_start
dedicated_prefix: ded
log_filter: 3328

Thank you @rcole

Please back up your sched_config file and replace it with the content below:

round_robin: False all
by_queue: False prime
by_queue: False non_prime
strict_ordering: true ALL
help_starving_jobs: false ALL
max_starve: 24:00:00
backfill_prime: false ALL
prime_exempt_anytime_queues: false
primetime_prefix: p_
nonprimetime_prefix: np_
node_sort_key: "ncpus LOW assigned" ALL
provision_policy: "avoid_provision"
sort_queues: false ALL
resources: "ncpus, mem, arch, host, vnode, aoe, eoe"
load_balancing: false ALL
smp_cluster_dist: pack
fair_share: false ALL
unknown_shares: 10
fairshare_usage_res: ncpus*walltime
fairshare_entity: euser
fairshare_decay_time: 24:00:00
fairshare_decay_factor: 0.5
preemptive_sched: false ALL
preempt_queue_prio: 150
preempt_prio: "express_queue, normal_jobs"
preempt_order: "SCR"
preempt_sort: min_time_since_start
dedicated_prefix: ded
log_filter: 0

  • restart the PBS services, or
  • send kill -HUP to the scheduler process so it re-reads sched_config (see the sketch below)
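A minimal sketch of the backup/replace/reload steps, assuming a default installation with PBS_HOME=/var/spool/pbs and a systemd-managed service:

# back up the current scheduler config
cp /var/spool/pbs/sched_priv/sched_config /var/spool/pbs/sched_priv/sched_config.bak
# ...replace sched_config with the content above, then either restart PBS:
systemctl restart pbs
# or signal the scheduler to re-read its config without a full restart:
kill -HUP $(pgrep -x pbs_sched)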

Test1:

> for i in {1..4};do qsub -q workq -l select=1:ncpus=1 -l place=excl -- /bin/sleep 30 ; done
> share the output of: qstat -answ1

Test2: once Test1 jobs have completed

> for i in {1..4};do qsub -q workq -l select=1:ncpus=18 -- /bin/sleep 30 ; done
> share the output of: qstat -answ1

Test3: once Test2 jobs have completed

> for i in {1..4};do qsub -q workq -l select=1:ncpus=1 -- /bin/sleep 30 ; done
> share the output of: qstat -answ1

Test1:
[tmhrtc@hmrihpcp02 ~] for i in {1..4};do qsub -q workq -l select=1:ncpus=1 -l place=excl -- /bin/sleep 30 ; done
23633.hmrihpcp02
23634.hmrihpcp02
23635.hmrihpcp02
23636.hmrihpcp02
[tmhrtc@hmrihpcp02 ~] qstat -answ1
hmrihpcp02:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


23480.hmrihpcp02 mresswl default MCoV-Run35_FC67 176222 1 8 24gb 24:00 R 07:42:12 compute-02/0*8
Job run at Tue May 19 at 08:51 on (compute-02:ncpus=8:mem=24576000kb)
23633.hmrihpcp02 tmhrtc workq STDIN 341734 1 1 -- -- R 00:00:10 compute-00/0
Job run at Tue May 19 at 16:34 on (compute-00:ncpus=1)
23634.hmrihpcp02 tmhrtc workq STDIN 58557 1 1 -- -- R 00:00:10 compute-01/0
Job run at Tue May 19 at 16:34 on (compute-01:ncpus=1)
23635.hmrihpcp02 tmhrtc workq STDIN 99011 1 1 -- -- R 00:00:10 compute-03/0
Job run at Tue May 19 at 16:34 on (compute-03:ncpus=1)
23636.hmrihpcp02 tmhrtc workq STDIN -- 1 1 -- -- Q -- --
Not Running: No available resources on nodes
[tmhrtc@hmrihpcp02 ~]$

Test2:
[tmhrtc@hmrihpcp02 ~] for i in {1..4};do qsub -q workq -l select=1:ncpus=18 -- /bin/sleep 30 ; done
23637.hmrihpcp02
23638.hmrihpcp02
23639.hmrihpcp02
23640.hmrihpcp02
[tmhrtc@hmrihpcp02 ~] qstat -answ1

hmrihpcp02:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


23480.hmrihpcp02 mresswl default MCoV-Run35_FC67 176222 1 8 24gb 24:00 R 07:44:12 compute-02/0*8
Job run at Tue May 19 at 08:51 on (compute-02:ncpus=8:mem=24576000kb)
23637.hmrihpcp02 tmhrtc workq STDIN 341810 1 18 -- -- R 00:00:00 compute-00/0*18
Job run at Tue May 19 at 16:36 on (compute-00:ncpus=18)
23638.hmrihpcp02 tmhrtc workq STDIN 341811 1 18 -- -- R 00:00:00 compute-00/1*18
Job run at Tue May 19 at 16:36 on (compute-00:ncpus=18)
23639.hmrihpcp02 tmhrtc workq STDIN 341812 1 18 -- -- R 00:00:00 compute-00/2*18
Job run at Tue May 19 at 16:36 on (compute-00:ncpus=18)
23640.hmrihpcp02 tmhrtc workq STDIN 341813 1 18 -- -- R 00:00:00 compute-00/3*18
Job run at Tue May 19 at 16:36 on (compute-00:ncpus=18)

Test3:
[tmhrtc@hmrihpcp02 ~] for i in {1..4};do qsub -q workq -l select=1:ncpus=1 -- /bin/sleep 30 ; done
23641.hmrihpcp02
23642.hmrihpcp02
23643.hmrihpcp02
23644.hmrihpcp02
[tmhrtc@hmrihpcp02 ~] qstat -answ1

hmrihpcp02:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


23480.hmrihpcp02 mresswl default MCoV-Run35_FC67 176222 1 8 24gb 24:00 R 07:46:12 compute-02/0*8
Job run at Tue May 19 at 08:51 on (compute-02:ncpus=8:mem=24576000kb)
23641.hmrihpcp02 tmhrtc workq STDIN 341848 1 1 -- -- R 00:00:00 compute-00/0
Job run at Tue May 19 at 16:37 on (compute-00:ncpus=1)
23642.hmrihpcp02 tmhrtc workq STDIN 341849 1 1 -- -- R 00:00:00 compute-00/1
Job run at Tue May 19 at 16:37 on (compute-00:ncpus=1)
23643.hmrihpcp02 tmhrtc workq STDIN 341850 1 1 -- -- R 00:00:00 compute-00/2
Job run at Tue May 19 at 16:37 on (compute-00:ncpus=1)
23644.hmrihpcp02 tmhrtc workq STDIN 341851 1 1 -- -- R 00:00:00 compute-00/3
Job run at Tue May 19 at 16:37 on (compute-00:ncpus=1)

Test1 worked the way I wanted it to.

Thank you @rcole for these tests.

Test1: the jobs ask for a node exclusively (-l place=excl), so no other job can run on the same node.
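As an aside, if the goal is to spread the chunks of a single multi-chunk job across nodes, place=scatter handles that case within one job, for example:

qsub -q workq -l select=4:ncpus=1 -l place=scatter -- /bin/sleep 30

Across independent single-chunk jobs, the spreading comes from the node_sort_key setting above.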

Did you kill -HUP the scheduler or restart the PBS services after updating sched_config?

Honestly, in my tests I do see the Test2 and Test3 jobs being spread across all the nodes, respecting node_sort_key: "ncpus LOW assigned" ALL.

I will keep an eye on this

Should I unset "sharing = default_shared"?

No, that is the default; you do not need to unset or change it. My test environment has the same configuration and works without any issues, respecting the node sort key.

As a next step, you could delete all the nodes, create them again, and test:

Qmgr: d n @default
Qmgr: c n nodename
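
Spelled out (run as root), and assuming the four node hostnames from earlier in the thread, that would be roughly:

# delete all nodes on the default server, then recreate them one by one
qmgr -c "delete node @default"
for n in compute-00 compute-01 compute-02 compute-03; do
    qmgr -c "create node $n"
done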