Hi friends,
I have installed OpenPBS v20. All 6 nodes are connected and jobs can be submitted, but the jobs stay stuck in the “Q” state forever and never start running.
pbsnodes -av
mybay02
Mom = mybay02
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.host = mybay02
resources_available.mem = 527405960kb
resources_available.ncpus = 64
resources_available.vnode = mybay02
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:37 2022
mybay03
Mom = mybay03
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.host = mybay03
resources_available.mem = 527537272kb
resources_available.ncpus = 64
resources_available.vnode = mybay03
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:37 2022
mybay04
Mom = mybay04
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.host = mybay04
resources_available.mem = 527405960kb
resources_available.ncpus = 64
resources_available.vnode = mybay04
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:37 2022
mybay05
Mom = mybay05
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.host = mybay05
resources_available.mem = 527537272kb
resources_available.ncpus = 64
resources_available.vnode = mybay05
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:37 2022
myview01
Mom = myview01
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 96
resources_available.arch = linux
resources_available.host = myview01
resources_available.mem = 1056013684kb
resources_available.ncpus = 96
resources_available.vnode = myview01
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:37 2022
mybay
Mom = mybay
Port = 15002
pbs_version = 20.0.0
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.host = mybay
resources_available.mem = 527405960kb
resources_available.ncpus = 64
resources_available.vnode = mybay
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Mon Dec 5 16:15:36 2022
[root@mybay ~]# qstat
Job id Name User Time Use S Queue
---------------- ---------------- ---------------- -------- - -----
1011.mybay testing user1 0 Q small
[root@mybay ~]# tracejob 1011
Job: 1011.mybay
12/06/2022 08:51:00 S enqueuing into small, state Q hop 1
12/06/2022 08:51:00 S Job Queued at request of user1@mybay, owner = user1@mybay, job name = testing, queue = small
12/06/2022 08:51:00 A user=user1 group=user1 project=_pbs_project_default jobname=testing queue=small ctime=1670287860 qtime=1670287860 etime=1670287860
Resource_List.mem=1gb Resource_List.mpiprocs=2 Resource_List.ncpus=2 Resource_List.nodect=1 Resource_List.nodes=mybay:ppn=2
Resource_List.place=scatter Resource_List.select=1:ncpus=2:host=mybay:mpiprocs=2 Resource_List.walltime=00:10:00
I also noticed that the queue rank (queue_rank = 1670287860350734) is extremely large and the priority is 0:
[root@mybay ~]# qstat -f 1011
Job Id: 1011.mybay
Job_Name = testing
Job_Owner = user1@mybay
job_state = Q
queue = small
server = mybay
Checkpoint = u
ctime = Tue Dec 6 08:51:00 2022
Error_Path = mybay:/home/user1/Templates/testing.e1011
Hold_Types = n
Join_Path = oe
Keep_Files = n
Mail_Points = a
mtime = Tue Dec 6 08:51:00 2022
Output_Path = mybay:/home/user1/Templates/testing.o1011
Priority = 0
qtime = Tue Dec 6 08:51:00 2022
Rerunable = True
Resource_List.mem = 1gb
Resource_List.mpiprocs = 2
Resource_List.ncpus = 2
Resource_List.nodect = 1
Resource_List.nodes = mybay:ppn=2
Resource_List.place = scatter
Resource_List.select = 1:ncpus=2:host=mybay:mpiprocs=2
Resource_List.walltime = 00:10:00
schedselect = 1:ncpus=2:host=mybay:mpiprocs=2
substate = 10
Variable_List = PBS_O_HOME=/home/user1,PBS_O_LANG=en_US.UTF-8,
PBS_O_LOGNAME=user1,
PBS_O_PATH=/home/user1/.local/bin:/home/user1/bin:/usr/local/bin:/us
r/bin:/usr/local/sbin:/usr/sbin:/opt/pbs/bin,
PBS_O_MAIL=/var/spool/mail/user1,PBS_O_SHELL=/bin/bash,
PBS_O_WORKDIR=/home/user1/Templates,PBS_O_SYSTEM=Linux,
PBS_O_QUEUE=small,PBS_O_HOST=mybay
euser = user1
egroup = user1
queue_rank = 1670287860350734
queue_type = E
etime = Tue Dec 6 08:51:00 2022
eligible_time = 00:04:44
accrue_type = 2
Submit_arguments = firstJob.sh
project = _pbs_project_default
Submit_Host = mybay
In addition, qrun reports that it cannot contact the scheduler:
[root@mybay ~]# qrun 1011
qrun: Could not contact Scheduler 1011.mybay
This is the server configuration:
[root@mybay ~]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue workq
#
create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True
#
# Create and define queue testq1
#
create queue testq1
set queue testq1 queue_type = Execution
set queue testq1 Priority = 10
set queue testq1 max_running = 20
set queue testq1 resources_max.cput = 03:02:10
set queue testq1 resources_max.mem = 8gb
set queue testq1 enabled = True
set queue testq1 started = True
#
# Create and define queue small
#
create queue small
set queue small queue_type = Execution
set queue small Priority = 10
set queue small resources_max.mem = 10gb
set queue small resources_max.ncpus = 2
set queue small resources_max.nodect = 1
set queue small resources_max.walltime = 06:00:00
set queue small resources_default.mem = 1gb
set queue small resources_default.ncpus = 1
set queue small resources_default.walltime = 01:00:00
set queue small enabled = True
set queue small started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = workq
set server log_events = 511
set server mailer = /usr/sbin/sendmail
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.mem = 1gb
set server resources_default.ncpus = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server default_node = mybay02
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = True
set server job_history_enable = True
set server job_history_duration = 96:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
And this is the job script:
[user1@mybay Templates]$ cat firstJob.sh
#!/bin/sh
#PBS -l nodes=mybay:ppn=2
#PBS -l walltime=00:10:00
#PBS -q small
#PBS -j oe
#PBS -N testing
echo "hi i am in my first job"
echo date
Here is some additional information:
[root@mybay ~]# systemctl status pbs
● pbs.service - Portable Batch System
Loaded: loaded (/opt/pbs/libexec/pbs_init.d; enabled; vendor preset: disabled)
Active: active (running) since Tue 2022-12-06 08:50:33 CST; 50min ago
Docs: man:pbs(8)
Process: 18323 ExecStop=/opt/pbs/libexec/pbs_init.d stop (code=exited, status=0/SUCCESS)
Process: 59656 ExecStart=/opt/pbs/libexec/pbs_init.d start (code=exited, status=0/SUCCESS)
Tasks: 0
Memory: 12.0K
CGroup: /system.slice/pbs.service
├─19431 /usr/bin/postgres -D /var/spool/pbs/datastore -p 15007
├─19442 postgres: logger process
├─19444 postgres: checkpointer process
├─19445 postgres: writer process
├─19446 postgres: wal writer process
├─19447 postgres: autovacuum launcher process
├─19448 postgres: stats collector process
├─19449 postgres: bgworker: logical replication launcher
└─19493 postgres: postgres pbs_datastore 10.2.208.101(48896) idle
Dec 06 08:50:33 mybay systemd[1]: Starting Portable Batch System...
Dec 06 08:50:33 mybay pbs_init.d[59656]: Starting PBS
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS comm already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS mom already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS scheduler already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS Server already running.
Dec 06 08:50:33 mybay systemd[1]: Started Portable Batch System.
[root@mybay ~]# /etc/init.d/pbs status
pbs_server is pid 19494
pbs_mom is pid 19284
pbs_sched is pid 19296
pbs_comm is 19274
[root@mybay ~]# pbs_hostn -v mybay
primary name: mybay (from gethostbyname())
aliases: -none-
address length: 4 bytes
address: 10.2.208.101 (1708130826 dec) name: mybay
[root@mybay ~]# tail /var/spool/pbs/sched_logs/20221206
12/06/2022 09:43:37;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:39;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:41;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:43;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:45;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:47;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:49;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:51;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:53;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:55;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
By the way, the firewalls on all nodes have been disabled.
Your help is greatly appreciated! Thanks.