Qrun: Could not contact Scheduler

Hi friends,

I have installed OpenPBS v20. All 6 nodes are connected and jobs can be submitted, but the jobs stay stuck in the “Q” state forever and never start running.

pbsnodes -av
mybay02
     Mom = mybay02
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay02
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay02
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022

mybay03
     Mom = mybay03
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay03
     resources_available.mem = 527537272kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay03
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022

mybay04
     Mom = mybay04
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay04
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay04
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022

mybay05
     Mom = mybay05
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay05
     resources_available.mem = 527537272kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay05
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022

myview01
     Mom = myview01
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 96
     resources_available.arch = linux
     resources_available.host = myview01
     resources_available.mem = 1056013684kb
     resources_available.ncpus = 96
     resources_available.vnode = myview01
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:37 2022

mybay
     Mom = mybay
     Port = 15002
     pbs_version = 20.0.0
     ntype = PBS
     state = free
     pcpus = 64
     resources_available.arch = linux
     resources_available.host = mybay
     resources_available.mem = 527405960kb
     resources_available.ncpus = 64
     resources_available.vnode = mybay
     resources_assigned.accelerator_memory = 0kb
     resources_assigned.hbmem = 0kb
     resources_assigned.mem = 0kb
     resources_assigned.naccelerators = 0
     resources_assigned.ncpus = 0
     resources_assigned.vmem = 0kb
     resv_enable = True
     sharing = default_shared
     license = l
     last_state_change_time = Mon Dec  5 16:15:36 2022
[root@mybay ~]# qstat
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
1011.mybay      testing          user1                   0 Q small
[root@mybay ~]# tracejob 1011

Job: 1011.mybay

12/06/2022 08:51:00  S    enqueuing into small, state Q hop 1
12/06/2022 08:51:00  S    Job Queued at request of user1@mybay, owner = user1@mybay, job name = testing, queue = small
12/06/2022 08:51:00  A    user=user1 group=user1 project=_pbs_project_default jobname=testing queue=small ctime=1670287860 qtime=1670287860 etime=1670287860
                          Resource_List.mem=1gb Resource_List.mpiprocs=2 Resource_List.ncpus=2 Resource_List.nodect=1 Resource_List.nodes=mybay:ppn=2
                          Resource_List.place=scatter Resource_List.select=1:ncpus=2:host=mybay:mpiprocs=2 Resource_List.walltime=00:10:00

I also noticed that the queue rank (queue_rank = 1670287860350734) is extremely large and the priority is 0:

[root@mybay ~]# qstat -f 1011
Job Id: 1011.mybay
    Job_Name = testing
    Job_Owner = user1@mybay
    job_state = Q
    queue = small
    server = mybay
    Checkpoint = u
    ctime = Tue Dec  6 08:51:00 2022
    Error_Path = mybay:/home/user1/Templates/testing.e1011
    Hold_Types = n
    Join_Path = oe
    Keep_Files = n
    Mail_Points = a
    mtime = Tue Dec  6 08:51:00 2022
    Output_Path = mybay:/home/user1/Templates/testing.o1011
    Priority = 0
    qtime = Tue Dec  6 08:51:00 2022
    Rerunable = True
    Resource_List.mem = 1gb
    Resource_List.mpiprocs = 2
    Resource_List.ncpus = 2
    Resource_List.nodect = 1
    Resource_List.nodes = mybay:ppn=2
    Resource_List.place = scatter
    Resource_List.select = 1:ncpus=2:host=mybay:mpiprocs=2
    Resource_List.walltime = 00:10:00
    schedselect = 1:ncpus=2:host=mybay:mpiprocs=2
    substate = 10
    Variable_List = PBS_O_HOME=/home/user1,PBS_O_LANG=en_US.UTF-8,
        PBS_O_LOGNAME=user1,
        PBS_O_PATH=/home/user1/.local/bin:/home/user1/bin:/usr/local/bin:/us
        r/bin:/usr/local/sbin:/usr/sbin:/opt/pbs/bin,
        PBS_O_MAIL=/var/spool/mail/user1,PBS_O_SHELL=/bin/bash,
        PBS_O_WORKDIR=/home/user1/Templates,PBS_O_SYSTEM=Linux,
        PBS_O_QUEUE=small,PBS_O_HOST=mybay
    euser = user1
    egroup = user1
    queue_rank = 1670287860350734
    queue_type = E
    etime = Tue Dec  6 08:51:00 2022
    eligible_time = 00:04:44
    accrue_type = 2
    Submit_arguments = firstJob.sh
    project = _pbs_project_default
    Submit_Host = mybay

and qrun shows it cannot contact scheduler:

[root@mybay ~]# qrun 1011
qrun: Could not contact Scheduler 1011.mybay

And this is the server configuration:

[root@mybay ~]# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue workq
#
create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True
#
# Create and define queue testq1
#
create queue testq1
set queue testq1 queue_type = Execution
set queue testq1 Priority = 10
set queue testq1 max_running = 20
set queue testq1 resources_max.cput = 03:02:10
set queue testq1 resources_max.mem = 8gb
set queue testq1 enabled = True
set queue testq1 started = True
#
# Create and define queue small
#
create queue small
set queue small queue_type = Execution
set queue small Priority = 10
set queue small resources_max.mem = 10gb
set queue small resources_max.ncpus = 2
set queue small resources_max.nodect = 1
set queue small resources_max.walltime = 06:00:00
set queue small resources_default.mem = 1gb
set queue small resources_default.ncpus = 1
set queue small resources_default.walltime = 01:00:00
set queue small enabled = True
set queue small started = True
#
# Set server attributes.
#
set server scheduling = True
set server default_queue = workq
set server log_events = 511
set server mailer = /usr/sbin/sendmail
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.mem = 1gb
set server resources_default.ncpus = 1
set server resources_default.nodect = 1
set server resources_default.nodes = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server default_node = mybay02
set server flatuid = True
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = True
set server job_history_enable = True
set server job_history_duration = 96:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999

And this is the job script:

[user1@mybay Templates]$ cat firstJob.sh
#!/bin/sh
#PBS -l nodes=mybay:ppn=2
#PBS -l walltime=00:10:00
#PBS -q small
#PBS -j oe
#PBS -N testing

echo "hi i am in my first job"
echo date

Here is some additional information:

[root@mybay ~]# systemctl status pbs
● pbs.service - Portable Batch System
   Loaded: loaded (/opt/pbs/libexec/pbs_init.d; enabled; vendor preset: disabled)
   Active: active (running) since Tue 2022-12-06 08:50:33 CST; 50min ago
     Docs: man:pbs(8)
  Process: 18323 ExecStop=/opt/pbs/libexec/pbs_init.d stop (code=exited, status=0/SUCCESS)
  Process: 59656 ExecStart=/opt/pbs/libexec/pbs_init.d start (code=exited, status=0/SUCCESS)
    Tasks: 0
   Memory: 12.0K
   CGroup: /system.slice/pbs.service
           ├─19431 /usr/bin/postgres -D /var/spool/pbs/datastore -p 15007
           ├─19442 postgres: logger process
           ├─19444 postgres: checkpointer process
           ├─19445 postgres: writer process
           ├─19446 postgres: wal writer process
           ├─19447 postgres: autovacuum launcher process
           ├─19448 postgres: stats collector process
           ├─19449 postgres: bgworker: logical replication launcher
           └─19493 postgres: postgres pbs_datastore 10.2.208.101(48896) idle

Dec 06 08:50:33 mybay systemd[1]: Starting Portable Batch System...
Dec 06 08:50:33 mybay pbs_init.d[59656]: Starting PBS
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS comm already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS mom already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS scheduler already running.
Dec 06 08:50:33 mybay pbs_init.d[59656]: PBS Server already running.
Dec 06 08:50:33 mybay systemd[1]: Started Portable Batch System.
[root@mybay ~]# /etc/init.d/pbs status
pbs_server is pid 19494
pbs_mom is pid 19284
pbs_sched is pid 19296
pbs_comm is 19274
[root@mybay ~]# pbs_hostn -v mybay
primary name: mybay (from gethostbyname())
aliases:            -none-
     address length:  4 bytes
     address:         10.2.208.101   (1708130826 dec)  name:  mybay
[root@mybay ~]# tail /var/spool/pbs/sched_logs/20221206
12/06/2022 09:43:37;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:39;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:41;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:43;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:45;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:47;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:49;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:51;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:53;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 09:43:55;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server

BTW, the firewalls on all nodes have been shut down.

Your help is greatly appreciated! Thanks.

Please check this discussion: Jobs stuck in Q stat and the scheduler seems not working right

1 Like

Dear adarsh,

Thank you for the reply. I had read it before I posted, but couldn’t find any information that helps fix my issue.

Interestingly, when I change the server/scheduler node to a second machine, the whole thing works.

Does that mean the software on the previous scheduler node was not properly installed?

Thanks.

I have just restarted my machine and found that the problem persists. However, the scheduler logs contain some information that may help with the diagnosis. Would you mind helping me check what might be wrong with this node (I can confirm the problem is unique to this node)?

[root@mybay ~]# tail /var/spool/pbs/sched_logs/20221206
12/06/2022 18:07:41;0002;pbs_sched;Svr;pbs_sched;ipv6 interface lo: localhost6.localdomain6
12/06/2022 18:07:41;0002;pbs_sched;Svr;pbs_sched;ipv6 interface eno12399: mybay
12/06/2022 18:07:41;0002;pbs_sched;n/a;setup_env;read environment from /var/spool/pbs/pbs_environment
12/06/2022 18:07:41;0006;pbs_sched;Fil;pbs_sched;Version 20.0.0, started, initialization type = 0
12/06/2022 18:07:41;0002;pbs_sched;Svr;sched_main;/opt/pbs/sbin/pbs_sched startup pid 5551
12/06/2022 18:07:41;0040;pbs_sched;Fil;fairshare usage;Creating usage database for fairshare
12/06/2022 18:07:41;0080;pbs_sched;Req;;Launching 32 worker threads
12/06/2022 18:07:45;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 18:07:47;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server
12/06/2022 18:07:49;0001;pbs_sched;Svr;pbs_sched;Access from host not allowed, or unknown host (15008) in open_server_conns, Couldn't register the scheduler default with connected server

Please check the PBS_SERVER value in /etc/pbs.conf and whether the hostname assigned to it resolves to the same IP address on that host.

Or you can try starting the scheduler with an explicit clients file (see man pbs_sched):
$PBS_EXEC/sbin/pbs_sched -c /path/to/clientsfile # check whether it makes a difference

   -c <clientsfile>
                Add clients to this scheduler's list of known clients.  The clientsfile contains single-line entries of the form
                    $clienthost <hostname>

                Each  hostname is added to the list of hosts allowed to connect to this scheduler.  If clientsfile cannot be opened, this scheduler aborts.  Path can be absolute or
                relative.  If relative, it is relative to PBS_HOME/sched_priv.

I have reinstalled the OS and it works now. Thanks.

1 Like