Hi,
I have configured cgroups with the devices subsystem enabled for GPU detection on my new node, which has 3 GPU cards. PBS detects only 3 ngpus on that node. Is this correct? Is there any way to make PBS count the GPU cores of each graphics card separately? For example, if one graphics card has 3000 GPU cores, could PBS report 9000 ngpus available for use across the three cards?
Cgroup configuration file:
{
"cgroup_prefix" : "pbs_jobs",
"exclude_hosts" : [],
"exclude_vntypes" : ["no_cgroups"],
"run_only_on_hosts" : [],
"periodic_resc_update" : true,
"vnode_per_numa_node" : "vntype in: gpu_node*",
"online_offlined_nodes" : true,
"use_hyperthreads" : true,
"ncpus_are_cores" : false,
"discover_gpus" : true,
"manage_rlimit_as" : true,
"cgroup" : {
"cpuacct" : {
"enabled" : true,
"exclude_hosts" : [],
"exclude_vntypes" : []
},
"cpuset" : {
"enabled" : true,
"exclude_cpus" : [],
"exclude_hosts" : [],
"exclude_vntypes" : [],
"mem_fences" : false,
"mem_hardwall" : false,
"memory_spread_page" : false
},
"devices" : {
"enabled" : "vntype in: gpu_node*",
"exclude_hosts" : [],
"exclude_vntypes" : [],
"allow" : [
"b *:* rwm",
"c *:* rwm",
["nvidiactl", "rwm", "*"]
]
},
"memory" : {
"enabled" : true,
"exclude_hosts" : [],
"exclude_vntypes" : [],
"soft_limit" : false,
"enforce_default" : true,
"exclhost_ignore_default" : false,
"default" : "256MB",
"reserve_percent" : 0,
"reserve_amount" : "1GB"
},
"memsw" : {
"enabled" : false,
"exclude_hosts" : [],
"exclude_vntypes" : [],
"enforce_default" : true,
"exclhost_ignore_default" : false,
"default" : "0B",
"reserve_percent" : 0,
"reserve_amount" : "0B"
}
}
}
Output of pbsnodes -aSjv:
                                                        mem      ncpus   nmics   ngpus
vnode           state            njobs   run   susp     f/t       f/t     f/t     f/t   jobs
--------------- --------------- ------ ----- ------ ----------- ------- ------- ------- -------
node1           free                 0     0      0     0 b/0 b     0/0     0/0     0/0 --
node1[0]        free                 0     0      0 100gb/100gb   32/32     0/0     2/2 --
node1[1]        free                 0     0      0 100gb/100gb   32/32     0/0     1/1 --