Hi,
I am having trouble debugging the connection to my compute nodes.
I tried everything suggested in similar issues, but I can’t locate the problem.
Here are the symptoms.
First, pbs_hostn and pbsnodes both show the nodes:
$pbs_hostn -v vt001
primary name: vt001 (from gethostbyname())
aliases: -none-
address length: 4 bytes
address: 192.168.20.1 (18131136 dec) name: vt001
$pbsnodes -a
vt001
Mom = vt001
ntype = PBS
state = free
pcpus = 96
resources_available.arch = linux
resources_available.host = vt001
resources_available.mem = 394907120kb
resources_available.ncpus = 96
resources_available.vnode = vt001
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
..
However, when I use pbsnodes -l -s vt001, it fails:
$pbsnodes -l -s vt001
Connection refused
pbsnodes: cannot connect to server vt001, error=111
I tried to use strace to locate the problem, but I didn’t know how to proceed:
$strace pbsnodes -l -s vt001
......
fstat(3, {st_mode=S_IFREG|0644, st_size=19183, ...}) = 0
read(3, "# Network services, Internet sty"..., 4096) = 4096
read(3, "1/tcp\t\tClearcase\nclearcase\t371/u"..., 4096) = 4096
read(3, "\t1812/tcp\nradius\t\t1812/udp\nradiu"..., 4096) = 4096
read(3, "\t# users & groups database\nafs3-"..., 4096) = 4096
read(3, "/tcp\t\t\t# french minitel\nxtelw\t\t1"..., 4096) = 2799
read(3, "", 4096) = 0
close(3) = 0
openat(AT_FDCWD, "/etc/pbs.conf", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=177, ...}) = 0
read(3, "PBS_SERVER=feynman\nPBS_START_SER"..., 4096) = 177
read(3, "", 4096) = 0
close(3) = 0
openat(AT_FDCWD, "/etc/pbs.conf", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=177, ...}) = 0
read(3, "PBS_SERVER=feynman\nPBS_START_SER"..., 4096) = 177
read(3, "", 4096) = 0
close(3) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 4
connect(4, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
close(4) = 0
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 4
connect(4, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
close(4) = 0
stat("/etc/resolv.conf", {st_mode=S_IFREG|0644, st_size=590, ...}) = 0
openat(AT_FDCWD, "/etc/host.conf", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=92, ...}) = 0
read(4, "# The \"order\" line is only used "..., 4096) = 92
read(4, "", 4096) = 0
close(4) = 0
futex(0x7f6ed0ac4ba4, FUTEX_WAKE_PRIVATE, 2147483647) = 0
openat(AT_FDCWD, "/etc/resolv.conf", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=590, ...}) = 0
read(4, "# This file is managed by man:sy"..., 4096) = 590
read(4, "", 4096) = 0
close(4) = 0
uname({sysname="Linux", nodename="feynman", ...}) = 0
openat(AT_FDCWD, "/etc/hosts", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=517, ...}) = 0
read(4, "127.0.0.1\tlocalhost\n127.0.1.1\tub"..., 4096) = 517
read(4, "", 4096) = 0
close(4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("192.168.20.1")}, 16) = -1 ECONNREFUSED (Connection refused)
close(3) = 0
dup(2) = 3
fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
fstat(3, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 4), ...}) = 0
write(3, "Connection refused\n", 19Connection refused
) = 19
close(3) = 0
write(2, "pbsnodes: cannot connect to serv"..., 52pbsnodes: cannot connect to server vt001, error=111
) = 52
exit_group(1) = ?
+++ exited with 1 +++
On the node, I checked whether pbs_mom is running with:
$ /etc/init.d/pbs status
pbs_mom is pid 2332
I then checked the open (listening) ports on both the server and the node:
admin@server:$sudo lsof -i -P -n | grep LISTEN
pbs_comm 26877 root 13u IPv4 9665789 0t0 TCP *:17001 (LISTEN)
pbs_sched 26892 root 4u IPv4 9663067 0t0 TCP *:15004 (LISTEN)
pbs_serve 27095 root 8u IPv4 9649094 0t0 TCP *:15001 (LISTEN)
admin@vt001:$sudo lsof -i -P -n | grep LISTEN
pbs_mom 82765 root 5u IPv4 3442317 0t0 TCP *:15002 (LISTEN)
pbs_mom 82765 root 6u IPv4 3442318 0t0 TCP *:15003 (LISTEN)
Many thanks