Hi,
Whenever we start the master node, the PBS service starts automatically, but after some time it fails with the error below.
[root@master ~]# pbsnodes -aSj
Connection refused
pbsnodes: cannot connect to server master.calligotech.com, error=111
My server logs:
[root@master ~]# cat /var/spool/pbs/server_logs/20200727
07/27/2020 12:20:19;0002;Server@master;Svr;Log;Log opened
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;pbs_version=19.1.1
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;pbs_build=mach=N/A:security=N/A:configure_args=N/A
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;hostname=master.calligotech.com;pbs_leaf_name=N/A;pbs_mom_node_name=N/A
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv4 interface lo: localhost
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv4 interface eno1: master
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv4 interface eno2: master
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv4 interface virbr0: master.calligotech.com
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv6 interface lo: master.calligotech.com
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv6 interface eno1: master.calligotech.com
07/27/2020 12:20:19;0002;Server@master;Svr;Server@master;ipv6 interface eno2: master.calligotech.com
07/27/2020 12:20:19;0006;Server@master;Fil;Server@master;Version 19.1.1, started, initialization type = 1
07/27/2020 12:20:21;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:20:21;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:20:27;0002;Server@master;Svr;Server@master;connected to PBS dataservice@master.calligotech.com
07/27/2020 12:20:27;0086;Server@master;Svr;pbs_python_ext_quick_start_interpreter;--> Python Interpreter quick started, compiled with version:'2.7.5 (default, Aug 4 2017, 00:39:18)
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]' <--
07/27/2020 12:20:27;0086;Server@master;Svr;pbs_python_ext_quick_start_interpreter;--> Inserted Altair PBS Python modules dir '/share/apps/platform/pbs/lib/python/altair' <--
07/27/2020 12:20:27;0002;Server@master;n/a;setup_env;read environment from /var/spool/pbs/pbs_environment
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Main Thread);TPP leaf node names = 192.168.0.50:15001,127.0.0.1:15001,10.1.50.1:15001,192.168.0.50:15001,192.168.122.1:15001
07/27/2020 12:20:28;0002;Server@master;Svr;Server@master;Server pid = 3021 ready; using ports Server:15001 Scheduler:15004 MOM:15002 RM:15003
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Thread 0);Thread ready
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Thread 0);Registering address 192.168.0.50:15001 to pbs_comm
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Thread 0);Registering address 10.1.50.1:15001 to pbs_comm
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Thread 0);Registering address 192.168.122.1:15001 to pbs_comm
07/27/2020 12:20:28;0c06;Server@master;TPP;Server@master(Thread 0);Connected to pbs_comm master.calligotech.com:17001
07/27/2020 12:50:19;0002;Server@master;Svr;Server@master;Stopping PBS dataservice
07/27/2020 12:50:24;0002;Server@master;Svr;Server@master;Server shutdown completed
07/27/2020 12:50:24;0002;Server@master;Svr;Log;Log closed
07/27/2020 12:53:44;0002;Server@master;Svr;Log;Log opened
07/27/2020 12:53:44;0002;Server@master;Svr;Server@master;pbs_version=19.1.1
07/27/2020 12:53:44;0002;Server@master;Svr;Server@master;pbs_build=mach=N/A:security=N/A:configure_args=N/A
07/27/2020 12:53:44;0002;Server@master;Svr;Server@master;hostname=master.calligotech.com;pbs_leaf_name=N/A;pbs_mom_node_name=N/A
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv4 interface lo: localhost
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv4 interface eno1: master
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv4 interface eno2: master
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv4 interface virbr0: master.calligotech.com
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv6 interface lo: localhost
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv6 interface eno1: master.calligotech.com
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;ipv6 interface eno2: master.calligotech.com
07/27/2020 12:53:45;0006;Server@master;Fil;Server@master;Version 19.1.1, started, initialization type = 1
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:53:45;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:53:57;0002;Server@master;Svr;Server@master;PBS dataservice not running:[Connection: failed: could not connect to server: Connection refused
Is the server running on host "192.168.0.50" and accepting
TCP/IP connections on port 15007?]
07/27/2020 12:53:58;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:54:00;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:54:00;0006;Server@master;Svr;Server@master;Failed to start PBS dataservice
07/27/2020 12:54:01;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:54:05;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:54:05;0006;Server@master;Svr;Server@master;Failed to start PBS dataservice
07/27/2020 12:54:05;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:54:10;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:54:22;0002;Server@master;Svr;Server@master;PBS dataservice not running:[Connection: failed: could not connect to server: Connection refused
Is the server running on host "192.168.0.50" and accepting
TCP/IP connections on port 15007?]
07/27/2020 12:54:23;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:54:30;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:54:30;0006;Server@master;Svr;Server@master;Failed to start PBS dataservice
07/27/2020 12:54:31;0002;Server@master;Svr;Server@master;pbs_status_db exit code 1
07/27/2020 12:54:39;0002;Server@master;Svr;Server@master;Starting PBS dataservice
07/27/2020 12:54:51;0002;Server@master;Svr;Server@master;PBS dataservice not running:[Connection: failed: could not connect to server: Connection refused
Is the server running on host "192.168.0.50" and accepting
TCP/IP connections on port 15007?]
As you suggested, I edited the /etc/pbs.conf file and added:
PBS_SERVER_HOST_NAME=master.calligotech.com
The output of strace pbsnodes -aSj is below:
mmap(NULL, 105051, PROT_READ, MAP_PRIVATE, 4, 0) = 0x7f9345488000
close(4) = 0
open("/lib64/libnss_sss.so.2", O_RDONLY|O_CLOEXEC) = 4
read(4, "\177ELF\2\1\1\0\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\360\25\0\0\0\0\0\0"..., 832) = 832
fstat(4, {st_mode=S_IFREG|0755, st_size=37096, ...}) = 0
mmap(NULL, 2131056, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 4, 0) = 0x7f9344268000
mprotect(0x7f9344270000, 2093056, PROT_NONE) = 0
mmap(0x7f934446f000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 4, 0x7000) = 0x7f934446f000
close(4) = 0
mprotect(0x7f934446f000, 4096, PROT_READ) = 0
munmap(0x7f9345488000, 105051) = 0
fstat(-1, 0x7ffdc24018c0) = -1 EBADF (Bad file descriptor)
socket(AF_LOCAL, SOCK_STREAM, 0) = 4
fcntl(4, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(4, F_GETFD) = 0
fcntl(4, F_SETFD, FD_CLOEXEC) = 0
connect(4, {sa_family=AF_LOCAL, sun_path="/var/lib/sss/pipes/nss"}, 110) = -1 ENOENT (No such file or directory)
close(4) = 0
close(3) = 0
munmap(0x7f93454c0000, 4096) = 0
socket(AF_LOCAL, SOCK_STREAM, 0) = 3
fcntl(3, F_GETFL) = 0x2 (flags O_RDWR)
fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
fcntl(3, F_GETFD) = 0
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
connect(3, {sa_family=AF_LOCAL, sun_path="/var/lib/sss/pipes/nss"}, 110) = -1 ENOENT (No such file or directory)
close(3) = 0
open("/etc/pbs.conf", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=204, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
read(3, "PBS_EXEC=/share/apps/platform/pb"..., 4096) = 204
read(3, "", 4096) = 0
close(3) = 0
munmap(0x7f93454c0000, 4096) = 0
open("/etc/pbs.conf", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=204, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
read(3, "PBS_EXEC=/share/apps/platform/pb"..., 4096) = 204
read(3, "", 4096) = 0
close(3) = 0
munmap(0x7f93454c0000, 4096) = 0
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 4
connect(4, {sa_family=AF_LOCAL, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
close(4) = 0
socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 4
connect(4, {sa_family=AF_LOCAL, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
close(4) = 0
open("/etc/host.conf", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=9, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
read(4, "multi on\n", 4096) = 9
read(4, "", 4096) = 0
close(4) = 0
munmap(0x7f93454c0000, 4096) = 0
futex(0x7f9344c47a30, FUTEX_WAKE_PRIVATE, 2147483647) = 0
open("/etc/resolv.conf", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=73, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
read(4, "search local calligotech.com\nnam"..., 4096) = 73
read(4, "", 4096) = 0
close(4) = 0
munmap(0x7f93454c0000, 4096) = 0
open("/etc/hosts", O_RDONLY|O_CLOEXEC) = 4
fstat(4, {st_mode=S_IFREG|0644, st_size=286, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
read(4, "# Added by rocks report host #\n#"..., 4096) = 286
read(4, "", 4096) = 0
close(4) = 0
munmap(0x7f93454c0000, 4096) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(15001), sin_addr=inet_addr("192.168.0.50")}, 16) = -1 ECONNREFUSED (Connection refused)
close(3) = 0
dup(2) = 3
fcntl(3, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE)
fstat(3, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f93454c0000
write(3, "Connection refused\n", 19Connection refused
) = 19
close(3) = 0
munmap(0x7f93454c0000, 4096) = 0
write(2, "pbsnodes: cannot connect to serv"..., 69pbsnodes: cannot connect to server master.calligotech.com, error=111
) = 69
exit_group(1) = ?
+++ exited with 1 +++
[root@master ~]#
I am still getting the same issue; please help me out.
[root@master ~]# pbsnodes -aSj
Connection refused
pbsnodes: cannot connect to server master.calligotech.com, error=111
[root@master ~]# service pbs restart
Restarting PBS
Stopping PBS
Killing Server.
PBS server - was pid: 1340
PBS mom - was pid: 30622
PBS sched - was pid: 30635
PBS comm - was pid: 30594
Waiting for shutdown to complete
Starting PBS
/share/apps/platform/pbs/sbin/pbs_comm ready (pid=28045), Proxy Name:master.calligotech.com:17001, Threads:4
PBS comm
PBS mom
Creating usage database for fairshare.
PBS sched
Connecting to PBS dataservice..Failed to start PBS dataservice:[2020-07-27 14:14:05 ISTFATAL: could not create lock file "/var/run/postgresql/.s.PGSQL.15007.lock": No such file or directory]
.Failed to start PBS dataservice
.Failed to start PBS dataservice
..Failed to start PBS dataservice
continuing in background.
PBS server
touch: cannot touch '/var/lock/subsys/pbs': No such file or directory
[root@master ~]#
Why does it sometimes work properly and sometimes not? Please help me out.
Regards,
Zain