Hi friends,
I have installed OpenPBS v20. All 6 nodes are connected and jobs can be submitted, but the jobs stay stuck in the “Q” state forever and never start running.
pbsnodes -av
mybay02
     Mom = mybay02
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay02
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay02
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022
mybay03
     Mom = mybay03
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay03
     resources_available.mem = 527537272kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay03
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022
mybay04
     Mom = mybay04
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay04
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay04
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022
mybay05
     Mom = mybay05
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay05
     resources_available.mem = 527537272kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay05
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022
myview01
     Mom = myview01
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 96
     resources_available.arch = linux
     resources_available.host = myview01
     resources_available.mem = 1056013684kb
     resources_available.ncpus = 96
     resources_available.vnode = myview01
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022
mybay
     Mom = mybay
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:36 2022
[root@mybay ~]# qstat
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
1011.mybay      testing          user1                   0 Q small
[root@mybay ~]# tracejob 1011
Job: 1011.mybay
12/06/2022 08:51:00  S    enqueuing into small, state Q hop 1
12/06/2022 08:51:00  S    Job Queued at request of user1@mybay, owner = user1@mybay, job name = testing, queue = small
12/06/2022 08:51:00  A    user=user1 group=user1 project=_pbs_project_default jobname=testing queue=small ctime=1670287860 qtime=1670287860 etime=1670287860
                          Resource_List.mem=1gb Resource_List.mpiprocs=2 Resource_List.ncpus=2 Resource_List.nodect=1 Resource_List.nodes=mybay:ppn=2
                          Resource_List.place=scatter Resource_List.select=1:ncpus=2:host=mybay:mpiprocs=2 Resource_List.walltime=00:10:00
I also noticed that the queue rank (queue_rank = 1670287860350734) is extremely large and the priority is 0:
[root@mybay ~]# qstat -f 1011
Job Id: 1011.mybay
    Job_Name = testing
    Job_Owner = user1@mybay
    job_state = Q
    queue = small
    server = mybay
    Checkpoint = u
    ctime = Tue Dec  6 08:51:00 2022
    Error_Path = mybay:/home/user1/Templates/testing.e1011
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Tue Dec  6 08:51:00 2022
    Output_Path = mybay:/home/user1/Templates/testing.o1011
    Priority = 0
    qtime = Tue Dec  6 08:51:00 2022
    Rerunable = True
    Resource_List.mem = 1gb
    Resource_List.mpiprocs = 2
    Resource_List.ncpus = 2
    Resource_List.nodect = 1
    Resource_List.nodes = mybay:ppn=2
    Resource_List.place = scatter
    Resource_List.select = 1:ncpus=2:host=mybay:mpiprocs=2
    Resource_List.walltime = 00:10:00
    schedselect = 1:ncpus=2:host=mybay:mpiprocs=2
    substate = 10
    Variable_List = PBS_O_HOME=/home/user1,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=user1,
        PBS_O_PATH=/home/user1/.local/bin:/home/user1/bin:/usr/local/bin:/us
        r/bin:/usr/local/sbin:/usr/sbin:/opt/pbs/bin,
        PBS_O_MAIL=/var/spool/mail/user1,PBS_O_SHELL=/bin/bash,
        PBS_O_WORKDIR=/home/user1/Templates,PBS_O_SYSTEM=Linux,
        PBS_O_QUEUE=small,PBS_O_HOST=mybay
    euser = user1
    egroup = user1
    queue_rank = 1670287860350734
    queue_type = E
    etime = Tue Dec  6 08:51:00 2022
    eligible_time = 00:04:44
    accrue_type = 2
    Submit_arguments = firstJob.sh
    project = _pbs_project_default
    Submit_Host = mybay
In addition, qrun reports that it cannot contact the scheduler:
[root@mybay ~]# qrun 1011
qrun: Could not contact Scheduler 1011.mybay
This is the server configuration:
[root@mybay ~]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue workq
#
create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True
#
# Create and define queue testq1
#
create queue testq1
set queue testq1 queue_type = Execution
set queue testq1 Priority = 10
set queue testq1 max_running = 20
set queue testq1 resources_max.cput = 03:02:10
set queue testq1 resources_max.mem = 8gb
set queue testq1 enabled = True
set queue testq1 started = True
#
# Create and define queue small
#
create queue small
set queue small queue_type = Execution
set queue small Priority = 10
set queue small resources_max.mem = 10gb
set queue small resources_max.ncpus = 2
set queue small resources_max.nodect = 1
set queue small resources_max.walltime = 06:00:00
set queue small resources_default.mem = 1gb
set queue small resources_default.ncpus = 1
set queue small resources_default.walltime = 01:00:00
set queue small enabled = True
set queue small started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = workq
set server log_events = 511
set server mailer = /usr/sbin/sendmail
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.mem = 1gb
set server resources_default.ncpus = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server default_node = mybay02
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = True
set server job_history_enable = True
set server job_history_duration = 96:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
And this is the job script:
[user1@mybay Templates]$ cat firstJob.sh
#!/bin/sh
#PBS -l nodes=mybay:ppn=2
#PBS -l walltime=00:10:00
#PBS -q small
#PBS -j oe
#PBS -N testing
echo "hi i am in my first job"
echo date
Here is some additional information:
[root@mybay ~]# systemctl status pbs
● pbs.service - Portable Batch System
   Loaded: loaded (/opt/pbs/libexec/pbs_init.d; enabled; vendor preset: disabled)
   Active: active (running) since Tue 2022-12-06 08:50:33 CST; 50min ago
     Docs: man:pbs(8)
  Process: 18323 ExecStop=/opt/pbs/libexec/pbs_init.d stop (code=exited, status=0/SUCCESS)
  Process: 59656 ExecStart=/opt/pbs/libexec/pbs_init.d start (code=exited, status=0/SUCCESS)
    Tasks: 0
   Memory: 12.0K
   CGroup: /system.slice/pbs.service
           ├─19431 /usr/bin/postgres -D /var/spool/pbs/datastore -p 15007
           ├─19442 postgres: logger process
           ├─19444 postgres: checkpointer process
           ├─19445 postgres: writer process
           ├─19446 postgres: wal writer process
           ├─19447 postgres: autovacuum launcher process
           ├─19448 postgres: stats collector process
           ├─19449 postgres: bgworker: logical replication launcher
           └─19493 postgres: postgres pbs_datastore 10.2.208.101(48896) idle
Dec 06 08:50:33 mybay systemd[1]: Starting Portable Batch System...
Dec 06 08:50:33 mybay pbs_init.d[59656]: Starting PBS
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS comm already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS mom already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS scheduler already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS Server already running.
Dec 06 08:50:33 mybay systemd[1]: Started Portable Batch System.
[root@mybay ~]# /etc/init.d/pbs status
pbs_server is pid 19494
pbs_mom is pid 19284
pbs_sched is pid 19296
pbs_comm is 19274
[root@mybay ~]# pbs_hostn -v mybay
primary name: mybay (from gethostbyname())
aliases:            -none-
     address length:  4 bytes
     address:         10.2.208.101   (1708130826 dec)  name:  mybay
[root@mybay ~]# tail /var/spool/pbs/sched_logs/20221206
12/06/2022 09:43:37;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:39;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:41;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:43;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:45;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:47;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:49;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:51;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:53;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:55;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
BTW, the firewalls on all nodes have been shut down.
Your help is greatly appreciated! Thanks.