Qstat: cannot connect to server amaster (errno=113)

Hi,
I have set up a PBS server and configured a PBS client, but when I try to execute qstat on the client, I get the error message below.

[root@aclient ~]# qstat -a
No route to host
qstat: cannot connect to server amaster (errno=113)

Pbs Server Info:

[root@amaster ~]# pbsnodes -l -s aclient
No route to host
pbsnodes: cannot connect to server aclient, error=113
[root@amaster ~]# pbsnodes -a
aclient
Mom = aclient
Port = 15002
pbs_version = unavailable
ntype = PBS
state = state-unknown,down
pcpus = 1
resources_available.host = aclient
resources_available.ncpus = 1
resources_available.vnode = aclient
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
comment = node down: communication closed
resv_enable = True
sharing = default_shared
last_state_change_time = Wed Dec 11 12:18:45 2019

[root@amaster ~]# netstat -tunap | grep pbs
tcp 0 0 0.0.0.0:15004 0.0.0.0:* LISTEN 31046/pbs_sched
tcp 0 0 0.0.0.0:17001 0.0.0.0:* LISTEN 31031/pbs_comm
tcp 0 0 0.0.0.0:15001 0.0.0.0:* LISTEN 31355/pbs_server.bi
tcp 0 0 192.168.1.222:17001 192.168.1.222:710 ESTABLISHED 31031/pbs_comm
tcp 0 0 192.168.1.222:17001 192.168.1.222:867 ESTABLISHED 31031/pbs_comm
tcp 0 0 192.168.1.222:43146 192.168.1.222:15007 ESTABLISHED 31355/pbs_server.bi
tcp 0 0 192.168.1.222:710 192.168.1.222:17001 ESTABLISHED 31046/pbs_sched
tcp 0 0 192.168.1.222:867 192.168.1.222:17001 ESTABLISHED 31355/pbs_server.bi
[root@amaster ~]# cat /etc/pbs.conf
PBS_EXEC=/opt/pbs
PBS_SERVER=amaster
PBS_START_SERVER=1
PBS_START_SCHED=1
PBS_START_COMM=1
PBS_START_MOM=0
PBS_HOME=/var/spool/pbs
PBS_CORE_LIMIT=unlimited
PBS_SCP=/bin/scp

[root@amaster ~]# cat /var/spool/pbs/server_priv/nodes
aclient np=1
[root@amaster ~]# cat /var/spool/pbs/mom_priv/config
$clienthost amaster

Client node info:

[root@aclient ~]# qstat -a
No route to host
qstat: cannot connect to server amaster (errno=113)
[root@aclient ~]# pbsnodes -a
No route to host
pbsnodes: cannot connect to server amaster, error=113
[root@aclient ~]# netstat -tunap | grep pbs
tcp 0 0 0.0.0.0:15003 0.0.0.0:* LISTEN 5674/pbs_mom
tcp 0 0 0.0.0.0:15002 0.0.0.0:* LISTEN 5674/pbs_mom
[root@aclient ~]# cat /etc/pbs.conf
PBS_EXEC=/opt/pbs
PBS_SERVER=amaster
PBS_START_SERVER=0
PBS_START_SCHED=0
PBS_START_COMM=0
PBS_START_MOM=1
PBS_HOME=/var/spool/pbs
PBS_CORE_LIMIT=unlimited
PBS_SCP=/bin/scp
[root@aclient ~]# cat /var/spool/pbs/mom_priv/config
$clienthost amaster
$logevent 0x1ff
$restrict_user_maxsysid 999

[root@aclient ~]#

Please guide me to resolve this issue.

Regards,
Zain

Please check

  1. your DNS and /etc/hosts for the name resolution of “amaster”
  2. firewall settings ( 15001 to 15009 and 17001 should be open between server/computenodes)
  3. SELinux disabled and system rebooted
  4. check and share the output of this command: pbs_hostn -v amaster
  5. /etc/hosts should be the same both on the server and compute nodes

thanks for your response,
SELinux was enabled on the compute node. I have disabled it and it is working now.

But when i submit job from pbsdata user on the pbs server node, i am getting below

  1. finally it shows Error state
  2. i am not able to view location of error file
    Error_Path = raidsm.calligotech.com:/home/pbsdata/STDIN.e12
  3. how to store log file like STDIN.e and STDIN.o

[pbsdata@raidsm ~]$ qsub -l select=1:ncpus=2:mem=100mb:host=raidsn -- echo "echo calligo"
12.raidsm.calligotech.com
[pbsdata@raidsm ~]$ qstat -ans

raidsm.calligotech.com:
Req’d Req’d Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


12.raidsm.calli pbsdata workq STDIN 2435 1 2 100mb – E 00:00
raidsn/0*2
Job run at Fri Dec 13 at 15:51 on (raidsn:ncpus=2:mem=102400kb)
[pbsdata@raidsm ~]$ qstat -fx 12.raidsm.calligotech.com
Job Id: 12.raidsm.calligotech.com
Job_Name = STDIN
Job_Owner = pbsdata@raidsm.calligotech.com
resources_used.cpupercent = 0
resources_used.cput = 00:00:00
resources_used.mem = 0kb
resources_used.ncpus = 2
resources_used.vmem = 0kb
resources_used.walltime = 00:00:00
job_state = E
queue = workq
server = raidsm.calligotech.com
Checkpoint = u
ctime = Fri Dec 13 15:51:25 2019
Error_Path = raidsm.calligotech.com:/home/pbsdata/STDIN.e12
exec_host = raidsn/0*2
exec_vnode = (raidsn:ncpus=2:mem=102400kb)
Hold_Types = n
Join_Path = n
Keep_Files = n
Mail_Points = a
mtime = Fri Dec 13 15:51:25 2019
Output_Path = raidsm.calligotech.com:/home/pbsdata/STDIN.o12
Priority = 0
qtime = Fri Dec 13 15:51:25 2019
Rerunable = True
Resource_List.mem = 100mb
Resource_List.ncpus = 2
Resource_List.nodect = 1
Resource_List.place = free
Resource_List.select = 1:ncpus=2:mem=100mb:host=raidsn
stime = Fri Dec 13 15:51:25 2019
session_id = 2435
jobdir = /home/pbsdata
substate = 51
Variable_List = PBS_O_HOME=/home/pbsdata,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=pbsdata,
PBS_O_PATH=/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/sha
re/apps/platform/pbs/bin:/home/pbsdata/.local/bin:/home/pbsdata/bin,
PBS_O_MAIL=/var/spool/mail/pbsdata,PBS_O_SHELL=/bin/bash,
PBS_O_WORKDIR=/home/pbsdata,PBS_O_SYSTEM=Linux,PBS_O_QUEUE=workq,
PBS_O_HOST=raidsm.calligotech.com
comment = Job run at Fri Dec 13 at 15:51 on (raidsn:ncpus=2:mem=102400kb)
etime = Fri Dec 13 15:51:25 2019
run_count = 1
Exit_status = 254
Submit_arguments = -l select=1:ncpus=2:mem=100mb:host=raidsn -- echo echo c
alligo
executable = <jsdl-hpcpa:Executable>echo</jsdl-hpcpa:Executable>
argument_list = <jsdl-hpcpa:Argument>echo calligo</jsdl-hpcpa:Argument>
project = _pbs_project_default

[pbsdata@raidsm ~]$

Qmgr : print server is below
[root@raidsm ~]# qmgr -c "p s"

# Create queues and set their attributes.

# Create and define queue workq

create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True

# Set server attributes.

set server scheduling = True
set server default_queue = workq
set server log_events = 511
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server job_history_duration = 720:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
[root@raidsm ~]#
[root@raidsm ~]# qmgr -c "print node @default"

# Create nodes and set their properties.

# Create and define node raidsn

create node raidsn Mom=raidsn.calligotech.com
set node raidsn state = free
set node raidsn resources_available.arch = linux
set node raidsn resources_available.host = raidsn
set node raidsn resources_available.mem = 5986384kb
set node raidsn resources_available.ncpus = 4
set node raidsn resources_available.vnode = raidsn
set node raidsn resv_enable = True

Please guide me to resolve this issue.

Thank you and Nice one !

If the stdout and stderr files are not seen in the job submission directory, this is because you have an issue with passwordless SSH/SCP.
Please check the MoM logs ( $PBS_HOME/mom_logs/YYYYMMDD ) on the compute node (raidsn) where this job ran; they will give you the information.

Please try this command:
qsub -l select=1:ncpus=2:mem=100mb:host=raidsn -- /usr/bin/echo calligo

Hi,
I have setup passwordless ssh from pbs server to computenode ie. raidsm to raidsn
but I am still not able to see the stderr and stdout files.
And also my job status is “E”
Please find the logs below…

less /var/spool/pbs/mom_logs/20191213

12/13/2019 18:24:36;0008;pbs_mom;Job;28.raidsm.calligotech.com;Started, pid = 1919
12/13/2019 18:24:36;0800;pbs_mom;n/a;mom_get_sample;nprocs: 214, cantstat: 0, nomem: 0, skipped: 0, cached: 0
12/13/2019 18:24:36;0080;pbs_mom;Job;28.raidsm.calligotech.com;task 00000001 terminated
12/13/2019 18:24:36;0800;pbs_mom;n/a;mom_get_sample;nprocs: 213, cantstat: 0, nomem: 0, skipped: 0, cached: 0
12/13/2019 18:24:36;0008;pbs_mom;Job;28.raidsm.calligotech.com;Terminated
12/13/2019 18:24:36;0100;pbs_mom;Job;28.raidsm.calligotech.com;task 00000001 cput= 0:00:00
12/13/2019 18:24:36;0008;pbs_mom;Job;28.raidsm.calligotech.com;kill_job
12/13/2019 18:24:36;0100;pbs_mom;Job;28.raidsm.calligotech.com;raidsn cput= 0:00:00 mem=0kb
12/13/2019 18:24:36;0800;pbs_mom;n/a;mom_get_sample;nprocs: 214, cantstat: 0, nomem: 0, skipped: 0, cached: 0
12/13/2019 18:24:36;0008;pbs_mom;Job;28.raidsm.calligotech.com;no active tasks
12/13/2019 18:24:36;0100;pbs_mom;Job;28.raidsm.calligotech.com;Obit sent
12/13/2019 18:24:36;0100;pbs_mom;Req;;Type 54 request received from root@192.168.1.224:15001, sock=3
12/13/2019 18:24:36;0080;pbs_mom;Job;28.raidsm.calligotech.com;copy file request received
12/13/2019 18:24:37;0080;pbs_mom;Fil;sys_copy;command: /bin/scp -Brvp /var/spool/pbs/spool/28.raidsm.calligotech.com.OU pbsdata@raidsm.calligotech.com:/home/pbsdata/STDIN.o28 status=1, try=1
12/13/2019 18:24:47;0800;pbs_mom;n/a;mom_get_sample;nprocs: 216, cantstat: 0, nomem: 0, skipped: 0, cached: 0
12/13/2019 18:25:03;0800;pbs_mom;n/a;mom_get_sample;nprocs: 215, cantstat: 0, nomem: 0, skipped: 0, cached: 0
12/13/2019 18:25:08;0080;pbs_mom;Fil;sys_copy;command: /share/apps/platform/pbs/sbin/pbs_rcp -rp /var/spool/pbs/spool/28.raidsm.calligotech.com.OU pbsdata@raidsm.calligotech.com:/home/pbsdata/STDIN.o28 status=1, try=2
12/13/2019 18:25:19;0080;pbs_mom;Fil;sys_copy;command: /bin/scp -Brvp /var/spool/pbs/spool/28.raidsm.calligotech.com.OU pbsdata@raidsm.calligotech.com:/home:

[pbsdata@raidsm ~]$ qsub -l select=1:ncpus=2:mem=100mb:host=raidsn -- /usr/bin/echo calligo
28.raidsm.calligotech.com
You have new mail in /var/spool/mail/pbsdata

[pbsdata@raidsm ~]$ qstat -ans

raidsm.calligotech.com:
Req’d Req’d Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


28.raidsm.calli pbsdata workq STDIN 1919 1 2 100mb – E 00:00
raidsn/0*2
Job run at Fri Dec 13 at 18:24 on (raidsn:ncpus=2:mem=102400kb)
[pbsdata@raidsm ~]$

Please help me on this.

Regards,
Zain

Zain

Your scp is failing from compute node to server ( stage out )

you need to setup password-less ssh/scp between

  • server to compute node
  • compute node to server
  • compute node to compute node ( MPI / MPP jobs)

Please make sure you have these three lines, in this order, in /etc/pbs.conf on the server and on the compute nodes (run the `which` command, e.g. `which scp`, to find the correct path to each binary and update the lines accordingly), and then restart the PBS services.

PBS_RCP=/bin/false
PBS_SCP=/usr/bin/scp
PBS_RSHCOMMAND=/usr/bin/ssh

Thank you