Hi,
I am testing vmem limit enforcement, following section 5.14.3.3, "Configuring Per-job Resource Limit Enforcement at Vnodes".
However, the job is not killed forcibly when the vmem limit is exceeded.
I had expected the following message in the job output:
=>> PBS: job killed: vmem xxxxxxxx exceeded limit yyyyyy.
Instead, the job ended normally.
Why is the job not killed by the vmem limit?
-
Running program (each process allocates about 1gb of memory).
[test@sl02-sms ~]$ cat mallc.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>

#define ALLOC_SIZE 4

int main(void)
{
    int sleep_time = 120;
    char *mem = NULL;
    char *memadd[ALLOC_SIZE];
    int i, j;
    size_t size = 512 * 512 * 1024; /* 256mb per chunk */

    /* wait before allocating */
    while (sleep_time) {
        sleep_time = sleep(sleep_time);
    }
    for (i = 0; i < ALLOC_SIZE; i++) {
        mem = (char *)malloc(size);
        if (!mem) {
            printf("could not allocate memory\n");
            return -1;
        }
        printf("memory allocated size=%zu\n", malloc_usable_size(mem));
        /* touch every byte so the pages are actually used */
        for (j = 0; j < size; j++) {
            mem[j] = 0x00;
        }
        memadd[i] = mem;
        sleep(10);
    }
    /* sleep_time is already 0 here, so this loop exits immediately */
    while (sleep_time) {
        sleep_time = sleep(sleep_time);
    }
    for (i = 0; i < ALLOC_SIZE; i++) {
        free(memadd[i]);
    }
    return 0;
}
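For reference, the binary was built with a plain gcc invocation (no special flags; adjust for your toolchain):
[test@sl02-sms ~]$ gcc -o mallc mallc.c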
-
Running script (5 processes, roughly 1gb each, about 5gb in total).
[test@sl02-sms ~]$ cat mallc.sh
#!/bin/bash
#PBS -N test
#PBS -j oe

echo "resource limit information"
ulimit -a
echo "1 process running"
./mallc &
echo "2 process running"
./mallc &
echo "3 process running"
./mallc &
echo "4 process running"
./mallc &
echo "5 process running"
./mallc
[test@sl02-sms ~]$
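As a sanity check that an address-space ulimit of this kind does stop the allocator, I also ran the binary by hand under a 512mb cap, outside of PBS (bash's ulimit -v sets RLIMIT_AS in kbytes; note the program sleeps 120 seconds before it starts allocating):
[test@sl02-sms ~]$ bash -c 'ulimit -v 524288; ./mallc'
Under that cap I expect malloc to start returning NULL once the second 256mb chunk would push the address space past 512mb.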
-
Running job.
[test@sl02-sms ~]$ qsub -l pvmem=2gb,vmem=4gb mallc.sh
1443.sl02-sms
[test@sl02-sms ~]$
!!! The job was not killed by the vmem limit. !!!
[test@sl02-sms ~]$ cat test.o1443
resource limit information
core file size (blocks, -c) 0
data seg size (kbytes, -d) unlimited
scheduling priority (-e) 0
file size (blocks, -f) unlimited
pending signals (-i) 63668
max locked memory (kbytes, -l) unlimited
max memory size (kbytes, -m) unlimited
open files (-n) 1024
pipe size (512 bytes, -p) 8
POSIX message queues (bytes, -q) 819200
real-time priority (-r) 0
stack size (kbytes, -s) 16384
cpu time (seconds, -t) unlimited
max user processes (-u) 63668
virtual memory (kbytes, -v) 2097152
file locks (-x) unlimited
1 process running
2 process running
3 process running
4 process running
5 process running
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
memory allocated size=268439536
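For scale: each process allocates 4 chunks, and 20 "memory allocated" lines appear, so all five processes completed their allocations. Rough totals from the reported chunk size:
[test@sl02-sms ~]$ echo $((268439536 * 4 / 1024 / 1024))mb per process, $((268439536 * 20 / 1024 / 1024))mb for the job
1024mb per process, 5120mb for the job
That is under the pvmem=2gb per-process limit but over the vmem=4gb job limit, which is why I expected the job to be killed.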
[test@sl02-sms ~]$ qstat -f
Job Id: 1443.sl02-sms
Job_Name = test
Job_Owner = test@sl02-sms
resources_used.cpupercent = 0
resources_used.cput = 00:00:00
resources_used.mem = 5124kb
resources_used.ncpus = 1
resources_used.vmem = 249844kb
resources_used.walltime = 00:00:12
job_state = R
queue = workq
server = sl02-sms
Checkpoint = u
ctime = Fri Jan 18 11:31:05 2019
Error_Path = sl02-sms:/home/test/test.e1443
exec_host = sl02-c001/0
exec_vnode = (sl02-c001:ncpus=1)
Hold_Types = n
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Fri Jan 18 11:31:05 2019
Output_Path = sl02-sms:/home/test/test.o1443
Priority = 0
qtime = Fri Jan 18 11:31:05 2019
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.place = pack
Resource_List.pvmem = 2gb
Resource_List.select = 1:ncpus=1:vmem=4gb
Resource_List.vmem = 4gb
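The snapshot above was taken about 12 seconds in, presumably while the processes were still in their initial 120-second sleep, hence the low resources_used.vmem. I polled the usage periodically with something like:
[test@sl02-sms ~]$ qstat -f 1443 | grep resources_used
but the job ran to completion regardless.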
[root@sl02-c001 ~]# ps -aef | grep mallc | grep -v grep
test 25586 25585 0 11:25 ? 00:00:00 ./mallc
test 25587 25585 0 11:25 ? 00:00:00 ./mallc
test 25588 25585 0 11:25 ? 00:00:00 ./mallc
test 25589 25585 0 11:25 ? 00:00:00 ./mallc
test 25590 25585 0 11:25 ? 00:00:00 ./mallc
[root@sl02-c001 ~]#
My environment is as follows.
[test@sl02-sms ~]$ qsub --version
pbs_version = 14.1.2
[root@sl02-sms ~]# grep ^resources /var/spool/pbs/sched_priv/sched_config
resources: "ncpus, mem, arch, host, vnode, netwins, aoe"
[root@sl02-sms ~]#
[test@sl02-sms ~]$ pbsnodes -av
sl02-c001
Mom = sl02-c001.localdomain
ntype = PBS
state = free
pcpus = 12
resources_available.arch = linux
resources_available.host = sl02-c001
resources_available.mem = 16363572kb
resources_available.ncpus = 12
resources_available.vnode = sl02-c001
resources_assigned.accelerator_memory = 0kb
resources_assigned.dyna-license = 0
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.netwins = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
sl02-c002
Mom = sl02-c002.localdomain
ntype = PBS
state = free
pcpus = 12
resources_available.arch = linux
resources_available.host = sl02-c002
resources_available.mem = 16363572kb
resources_available.ncpus = 12
resources_available.vnode = sl02-c002
resources_assigned.accelerator_memory = 0kb
resources_assigned.dyna-license = 0
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.netwins = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
[test@sl02-sms ~]$
[root@sl02-c001 ~]# cat /var/spool/pbs/mom_priv/config
$clienthost sl02-sms
$usecp *:/home /home
$restrict_user_maxsysid 999
$enforce mem
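For completeness, after editing mom_priv/config I made pbs_mom re-read it by sending it a HUP (on my install the pid is in mom_priv/mom.lock):
[root@sl02-c001 ~]# kill -HUP $(cat /var/spool/pbs/mom_priv/mom.lock)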