Hook;pbs_python;Server and MoM vnode names may not be consistent

Hello,

We are seeing the following error on one of our compute nodes (node0128) and are not sure why it might be seeing a different node (node0064). We have other nodes with similar images and settings and we have yet to experience this issue with those nodes.

Excerpt from mom_log:

10/06/2020 10:50:02;0002;pbs_mom;Svr;pbs_mom;Hello from server at 10.30.255.254:15001

10/06/2020 10:50:02;0001;pbs_mom;Job;602377.bright01-thx;Job discarded at request of Server

10/06/2020 10:50:02;0008;pbs_mom;Job;602377.bright01-thx;kill_job

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;WARNING: job seems to have no resources assigned to this host.

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;Server and MoM vnode names may not be consistent.

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;Pattern for expected vnode name(s) is node0128[[\d]+]

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;Job exec_vnode is (node0064[1]:mem=27262976kb:ncpus=8:ngpus=2:mic_cores=0)

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;You may have forgotten to set PBS_MOM_NODE_NAME to the desired matching entry in the exec_vnode string

10/06/2020 10:50:02;0010;pbs_python;Hook;pbs_python;Job will fail or be configured with default ncpus/mem

And here is what our PBS hook looks like:

{
"cgroup_prefix":"pbspro",
"enabled":"vntype in: reg_node,gpu_node,lm_node",
"periodic_resc_update":true,
"vnode_per_numa_node":"vntype in: gpu_node",
"online_offlined_nodes":"vntype in: gpu_node,reg_node,lm_node",
"use_hyperthreads": true,
"ncpus_are_cores": true,
"cgroup":{
"cpuacct":{
"enabled":true,
"exclude_hosts":[

     ]
  },
  "cpuset":{
     "enabled":true,
     "exclude_hosts":[

     ],
     "exclude_vntypes":[

     ],
     "memory_spread_page":false,
     "mem_hardwall":true,
     "mem_fences":true
  },
  "devices":{
     "enabled":true,
     "exclude_hosts":[

     ],
     "exclude_vntypes":[

     ],
     "allow":[
        	"b *:* rwm",
        	"c *:* rwm",
  	"c 195:* m",
  "c 136:* rwm",
  ["infiniband/rdma_cm", "rwm"],
  ["fuse", "rwm"],
  ["net/tun", "rwm"],
  ["tty", "rwm"],
  ["ptmx", "rwm"],
  ["console", "rwm"],
  ["null", "rwm"],
  ["zero", "rwm"],
  ["full", "rwm"],
  ["random", "rwm"],
  ["urandom", "rwm"],
  ["cpu/0/cpuid", "rwm", "*"],
  ["nvidia-modeset", "rwm"],
  ["nvidia-uvm", "rwm"],
  ["nvidia-uvm-tools", "rwm"],
  ["nvidiactl", "rwm"],
        	["mic/scif", "rwm"]
     ]
  },
  "hugetlb":{
     "enabled":false,
     "default":"1MB",
     "exclude_hosts":[

     ],
     "exclude_vntypes":[

     ]
  },
  "memory":{
     "enabled":true,
     "default":"256MB",
     "reserve_amount":"10GB",

"reserve_percent":"0",
"exclude_hosts":[

     ],
     "exclude_vntypes":[

     ]
  },
  "memsw":{
     "enabled":true,
     "default":"256MB",

"reserve_amount":"10GB",
"reserve_percent":"0",
"exclude_hosts":[

     ],
     "exclude_vntypes":[

     ]
  }

}
}

Here’s what qmgr settings look like for the node0128:

create node node0128 Mom=node0128.thunder.ccast
set node node0128 state = offline
set node node0128 resources_available.arch = linux
set node node0128 resources_available.host = node0128
set node node0128 resources_available.hpmem = 0b
set node node0128 resources_available.mem = 0b
set node node0128 resources_available.ncpus = 0
set node node0128 resources_available.ngpus = 0
set node node0128 resources_available.plist = cascadelake
set node node0128 resources_available.qlist = condo07
set node node0128 resources_available.qlist += preemptible
set node node0128 resources_available.vmem = 0b
set node node0128 resources_available.vnode = node0128
set node node0128 resources_available.vntype = gpu_node
set node node0128 comment = offlined by hook 'pbs_cgroups_dev_a' due to hook error
set node node0128 resv_enable = True
set node node0128 in_multivnode_host = 1

We appreciate your thoughts and directions as we investigate.

-Siji