tomoki
June 26, 2019, 8:57am
1
[test@ohpc137pbsib-sms ~]$ qsub --version
pbs_version = 19.1.1
[test@ohpc137pbsib-sms ~]$
As I understand it, job 99 should be requeued (status Q) under the following setting:
preempt_order:"R 80 R 50 S"
[root@ohpc137pbsib-sms ~]# cat /var/spool/pbs/sched_priv/sched_config |grep ^preempt
preemptive_sched: true ALL
preempt_queue_prio: 150
preempt_prio: "express_queue, normal_jobs"
preempt_order:"R 80 R 50 S"
preempt_sort: min_time_since_start
[root@ohpc137pbsib-sms ~]#
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 yes.sh
99.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[root@ohpc137pbsib-sms ~]# qalter -l soft_walltime=00:03:00 99
[root@ohpc137pbsib-sms ~]#
[test@ohpc137pbsib-sms ~]$ qstat -f 99|grep walltime
resources_used.walltime = 00:00:27
Resource_List.soft_walltime = 00:03:00
[test@ohpc137pbsib-sms ~]$
The percentage of time remaining on this job is more than 80%.
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 yes.sh
100.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ qstat -an
ohpc137pbsib-sms:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
99.ohpc137pbsib test workq yes 112334 3 72 -- -- R 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
100.ohpc137pbsi test workq yes -- 3 72 -- -- Q --
--
[test@ohpc137pbsib-sms ~]$
[root@ohpc137pbsib-sms ~]# qrun 100
[test@ohpc137pbsib-sms ~]$ qstat -an
ohpc137pbsib-sms:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
99.ohpc137pbsib test workq yes 112334 3 72 -- -- S 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
100.ohpc137pbsi test workq yes 112418 3 72 -- -- R 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
[test@ohpc137pbsib-sms ~]$
adarsh
June 26, 2019, 2:57pm
2
Please create another queue called "highpriority" as below:
qmgr -c "create queue highpriority queue_type=e,started=t,enabled=t"
qmgr -c "set queue highpriority Priority=160"
Then run a similar test to the one above, but submit the second job to the highpriority queue.
In your case, you submitted the second job to workq, which has the same priority. How would job 100 get a higher priority than job 99? Hence, there was no suspension until you ran qrun 100, which forces job 100 to run by suspending job 99, as it has exceeded its soft limit.
tomoki
June 27, 2019, 12:18am
3
Dear adarsh
Thanks for your advice.
Unfortunately, the result was not what I expected.
[root@ohpc137pbsib-sms ~]# cat /var/spool/pbs/sched_priv/sched_config |grep ^preempt
preemptive_sched: true ALL
preempt_queue_prio: 150
preempt_prio: "express_queue, normal_jobs"
preempt_order:"R 80 R 50 S"
preempt_sort: min_time_since_start
[root@ohpc137pbsib-sms ~]#
[root@ohpc137pbsib-sms ~]# qmgr -c "p q expQ"
#
# Create queues and set their attributes.
#
#
# Create and define queue expQ
#
create queue expQ
set queue expQ queue_type = Execution
set queue expQ Priority = 160
set queue expQ enabled = True
set queue expQ started = True
[root@ohpc137pbsib-sms ~]#
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 yes.sh
102.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[root@ohpc137pbsib-sms ~]# qalter -l soft_walltime=00:03:00 102
[test@ohpc137pbsib-sms ~]$ qstat -f 102|grep walltime
resources_used.walltime = 00:00:10
Resource_List.soft_walltime = 00:03:00
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 -q expQ yes.sh
103.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ qstat -an
ohpc137pbsib-sms:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
102.ohpc137pbsi test workq yes 37662 3 72 -- -- S 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
103.ohpc137pbsi test expQ yes 37717 3 72 -- -- R 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
[test@ohpc137pbsib-sms ~]$
I then changed preempt_order to "R".
With that setting, job 106 was requeued (see below).
I do not know why job 102 was not requeued with the original preempt_order setting.
[root@ohpc137pbsib-sms ~]# cat /var/spool/pbs/sched_priv/sched_config |grep ^preempt_order
preempt_order: "R"
[root@ohpc137pbsib-sms ~]#
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 yes.sh
106.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[root@ohpc137pbsib-sms ~]# qalter -l soft_walltime=00:03:00 106
[test@ohpc137pbsib-sms ~]$ qstat -f 106|grep walltime
resources_used.walltime = 00:00:10
Resource_List.soft_walltime = 00:03:00
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ qsub -l select=3:ncpus=24 -q expQ yes.sh
107.ohpc137pbsib-sms
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ qstat -an
ohpc137pbsib-sms:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
106.ohpc137pbsi test workq yes 38419 3 72 -- -- Q --
--
107.ohpc137pbsi test expQ yes 38481 3 72 -- -- R 00:00
ohpc137pbsib-c001/0*24+ohpc137pbsib-c002/0*24+ohpc137pbsib-c003/0*24
[test@ohpc137pbsib-sms ~]$
[test@ohpc137pbsib-sms ~]$ pbsnodes -a
ohpc137pbsib-c001
Mom = ohpc137pbsib-c001.localdomain
ntype = PBS
state = free
pcpus = 24
resources_available.arch = linux
resources_available.host = ohpc137pbsib-c001
resources_available.mem = 196814204kb
resources_available.ncpus = 24
resources_available.vnode = ohpc137pbsib-c001
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Jun 27 09:08:19 2019
last_used_time = Thu Jun 27 09:08:19 2019
ohpc137pbsib-c002
Mom = ohpc137pbsib-c002.localdomain
ntype = PBS
state = free
pcpus = 24
resources_available.arch = linux
resources_available.host = ohpc137pbsib-c002
resources_available.mem = 196814200kb
resources_available.ncpus = 24
resources_available.vnode = ohpc137pbsib-c002
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Jun 27 09:08:19 2019
last_used_time = Thu Jun 27 09:08:19 2019
ohpc137pbsib-c003
Mom = ohpc137pbsib-c003.localdomain
ntype = PBS
state = free
pcpus = 24
resources_available.arch = linux
resources_available.host = ohpc137pbsib-c003
resources_available.mem = 196814208kb
resources_available.ncpus = 24
resources_available.vnode = ohpc137pbsib-c003
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Thu Jun 27 09:08:19 2019
last_used_time = Thu Jun 27 09:08:19 2019
[test@ohpc137pbsib-sms ~]$