Hello,
I am very new to OpenPBS.
I have just installed OpenPBS on two computers: one server node (magi01) and one compute node (magi02).
The problem is that after a job is submitted, it is put on hold and never runs.
[magi01@magi01 ~]$ qstat -a
magi01:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
--------------- -------- -------- ---------- ------ --- --- ------ ----- - -----
4.magi01 magi01 workq hivePytho* -- 1 1 -- 00:15 H --
5.magi01 magi01 workq STDIN -- 1 1 -- -- H --
CURRENT STATUS
SELinux is disabled
Firewall is disabled
/etc/hosts is configured as below
server node
27.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.68.114 magi01
192.168.68.117 magi02
compute node
127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost localhost.localdomain localhost6 localhost6.localdomain6
192.168.68.114 magi01
192.168.68.117 magi02
Status for the compute node
[magi01@magi01 ~]$ pbsnodes -av
magi02
Mom = magi02
ntype = PBS
state = free
pcpus = 128
resources_available.arch = linux
resources_available.host = magi02
resources_available.mem = 263739016kb
resources_available.ncpus = 128
resources_available.vnode = magi02
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
last_state_change_time = Wed Nov 11 09:22:17 2020
I’m not sure how to check whether the ports are open or closed,
but the results of nmap -p- are below.
Ports 15001, 15004, 15007, and 17001 are open, but I’m not sure about the other ports.
[magi01@magi01 ~]$ nmap -p- magi01
Starting Nmap 7.70 ( https://nmap.org ) at 2020-11-11 09:26 CST
Nmap scan report for magi01 (192.168.68.114)
Host is up (0.000044s latency).
Not shown: 65528 closed ports
PORT STATE SERVICE
22/tcp open ssh
111/tcp open rpcbind
5355/tcp open llmnr
15001/tcp open unknown
15004/tcp open unknown
15007/tcp open unknown
17001/tcp open unknown
By running qstat -answ1,
I found that it reports "job held, too many failed attempts to run".
[magi01@magi01 ~]$ qstat -answ1
magi01:
Req'd Req'd Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time
------------------------------ --------------- --------------- --------------- -------- ---- ----- ------ ----- - -----
4.magi01 magi01 workq hivePythonExam* -- 1 1 -- 00:15 H -- --
job held, too many failed attempts to run
5.magi01 magi01 workq STDIN -- 1 1 -- -- H -- --
job held, too many failed attempts to run
Output of qstat -f
[root@magi01 magi01]# qstat -f
Job Id: 4.magi01
Job_Name = hivePythonExample
Job_Owner = magi01@magi01
job_state = H
queue = workq
server = magi01
Checkpoint = u
ctime = Wed Nov 11 09:12:23 2020
Error_Path = magi01:/home/magi01/Desktop/hivePythonExample.e4
Hold_Types = s
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Wed Nov 11 09:12:25 2020
Output_Path = magi01:/home/magi01/Desktop/hivePythonExample.out
Priority = 0
qtime = Wed Nov 11 09:12:23 2020
Rerunable = True
Resource_List.ncpus = 1
Resource_List.nodect = 1
Resource_List.place = pack
Resource_List.pmem = 1gb
Resource_List.select = 1:ncpus=1
Resource_List.walltime = 00:15:00
schedselect = 1:ncpus=1
stime = Wed Nov 11 09:12:24 2020
substate = 20
Variable_List = PBS_O_HOME=/home/magi01,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=magi01,
PBS_O_PATH=/home/magi01/.local/bin:/home/magi01/bin:/usr/local/bin:/us
r/bin:/usr/local/sbin:/usr/sbin:/opt/pbs/bin,
PBS_O_MAIL=/var/spool/mail/magi01,PBS_O_SHELL=/bin/bash,
PBS_O_WORKDIR=/home/magi01/Desktop,PBS_O_SYSTEM=Linux,
PBS_O_QUEUE=workq,PBS_O_HOST=magi01
euser = magi01
egroup = magi01
hashname = 4.magi01
queue_rank = 1605057143058
queue_type = E
comment = job held, too many failed attempts to run
etime = Wed Nov 11 09:12:23 2020
run_count = 21
Exit_status = -10
Submit_arguments = t.pbs
project = _pbs_project_default
run_version = 21
Submit_Host = magi01
Any help will be really appreciated,
Regards,
David