Unable to submit a GPU job

Hi,

I have 4 CPU-only nodes and 1 GPU node.

I am trying to submit a job to a GPU node and it is throwing the below error.
“qsub: Access from host not allowed, or unknown host”

I am able to submit cpu jobs on CPU-only nodes.

The PBS-related configuration is as below:
###############
cat cuda.pbs
###############
#!/bin/bash

#PBS -N gpu_job
#PBS -l ncpus=1
#PBS -l ngpus=1
#PBS -q gpuq
#PBS -l walltime=00:00:20
#PBS -o output_6.log
#PBS -e error_6.log

# Navigate to the directory where your code is located

cd /hpc/home/hpcuser01/cuda-job/

# Run the executable

./matrix_multiplication_6
###########################

############################
[root@gpu01 ~]# cat /etc/pbs.conf
############################
PBS_EXEC=/opt/pbs
PBS_SERVER=admin1
PBS_START_SERVER=0
PBS_START_SCHED=0
PBS_START_COMM=0
PBS_START_MOM=1
PBS_HOME=/var/spool/pbs
PBS_CORE_LIMIT=unlimited
PBS_SCP=/bin/scp

#############################################
pbsnodes -a
############################################

n1
Mom = n1
Port = 15002
pbs_version = 2022.1.1.20220926110806
ntype = PBS
state = free
pcpus = 128
resources_available.arch = linux
resources_available.enablegpu = False
resources_available.host = n1
resources_available.mem = 263419196kb
resources_available.ncpus = 128
resources_available.vnode = n1
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Sat May 20 21:27:18 2023
last_used_time = Tue May 23 15:52:38 2023
server_instance_id = admin1.mguif.local:15001

n2
Mom = n2
Port = 15002
pbs_version = 2022.1.1.20220926110806
ntype = PBS
state = free
pcpus = 128
resources_available.arch = linux
resources_available.enablegpu = False
resources_available.host = n2
resources_available.mem = 263419196kb
resources_available.ncpus = 128
resources_available.vnode = n2
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Sat May 20 21:27:18 2023
last_used_time = Mon May 22 11:22:11 2023
server_instance_id = admin1.mguif.local:15001

n3
Mom = n3
Port = 15002
pbs_version = 2022.1.1.20220926110806
ntype = PBS
state = free
pcpus = 128
resources_available.arch = linux
resources_available.enablegpu = False
resources_available.host = n3
resources_available.mem = 263419196kb
resources_available.ncpus = 128
resources_available.vnode = n3
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Sat May 20 21:27:18 2023
last_used_time = Mon May 22 11:22:11 2023
server_instance_id = admin1.mguif.local:15001

n4
Mom = n4
Port = 15002
pbs_version = 2022.1.1.20220926110806
ntype = PBS
state = free
pcpus = 128
resources_available.arch = linux
resources_available.enablegpu = False
resources_available.host = n4
resources_available.mem = 263419196kb
resources_available.ncpus = 128
resources_available.vnode = n4
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Sat May 20 21:27:18 2023
last_used_time = Mon May 22 11:22:11 2023
server_instance_id = admin1.mguif.local:15001

gpu01
Mom = gpu01
Port = 15002
pbs_version = 2022.1.1.20220926110806
ntype = PBS
state = free
pcpus = 64
resources_available.arch = linux
resources_available.enablegpu = True
resources_available.host = gpu01
resources_available.mem = 791976532kb
resources_available.ncpus = 64
resources_available.ngpus = 4
resources_available.vnode = gpu01
resources_assigned.accelerator_memory = 0kb
resources_assigned.hbmem = 0kb
resources_assigned.mem = 0kb
resources_assigned.naccelerators = 0
resources_assigned.ncpus = 0
resources_assigned.ngpus = 0
resources_assigned.vmem = 0kb
resv_enable = True
sharing = default_shared
license = l
last_state_change_time = Tue May 23 13:49:31 2023
last_used_time = Mon May 22 16:07:47 2023
server_instance_id = admin1.mguif.local:15001

################################################################
Qmgr: print server

Create resources and set their properties.

Create and define resource enablegpu

create resource enablegpu
set resource enablegpu type = boolean
set resource enablegpu flag = h

Create and define resource ngpus

create resource ngpus
set resource ngpus type = long
set resource ngpus flag = hn

Create queues and set their attributes.

Create and define queue workq

create queue workq
set queue workq queue_type = Execution
set queue workq enabled = True
set queue workq started = True

Create and define queue gpu

create queue gpu
set queue gpu queue_type = Execution
set queue gpu enabled = True
set queue gpu started = True

Create and define queue gpuq

create queue gpuq
set queue gpuq queue_type = Execution
set queue gpuq acl_host_enable = True
set queue gpuq acl_hosts = gpu01
set queue gpuq enabled = True
set queue gpuq started = True

Set server attributes.

set server scheduling = True
set server default_queue = workq
set server log_events = 511
set server mailer = /usr/sbin/sendmail
set server mail_from = adm
set server query_other_jobs = True
set server resources_default.ncpus = 1
set server default_chunk.ncpus = 1
set server scheduler_iteration = 600
set server resv_enable = True
set server node_fail_requeue = 310
set server max_array_size = 10000
set server pbs_license_info = 6200@admin1
set server pbs_license_min = 0
set server pbs_license_max = 2147483647
set server pbs_license_linger_time = 31536000
set server eligible_time_enable = False
set server job_history_enable = True
set server job_history_duration = 720:00:00
set server max_concurrent_provision = 5
set server max_job_sequence_id = 9999999
Qmgr:

####################################################

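I think the acl_host_enable / acl_hosts setting on gpuq is what is causing that message. In the PBS documentation, acl_hosts lists the hosts from which jobs may be submitted to the queue, not the execution nodes/resources the jobs should run on.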

There were a couple of issues. I have corrected them as described below.

Troubleshooting done today to fix GPU work submission issue on gpu node

[hpcuser01@admin1 ~]$ qsub -I -l select=1:ncpus=1:mpiprocs=1:ngpus=1 -q gpuq
qsub: Access from host not allowed, or unknown host

Refer to the PBS Admin Guide (PBSAdminGuide2021.1.2), section 4.9.2.1, "Procedure to Associate Vnodes with Queues".
First created a resource Qlist
qmgr -c 'create resource Qlist type=string_array, flag=h'

Then assigned gpuq to use the resource Qlist with a name given as gpu
set queue gpuq default_chunk.Qlist = gpu

similarly assigned workq to use the resource Qlist with a name given as cpu
set queue workq default_chunk.Qlist = cpu

Next we set the resource gpu on GPU01 node and resource cpu on n1-4 nodes
set node gpu01 resources_available.Qlist = gpu
set node n1 resources_available.Qlist = cpu
set node n2 resources_available.Qlist = cpu
set node n3 resources_available.Qlist = cpu
set node n4 resources_available.Qlist = cpu

Removed the acl_host_enable and acl_hosts entries for gpuq as shown below (these were set earlier):

Qmgr: print queue gpuq

Create queues and set their attributes.

Create and define queue gpuq

create queue gpuq
set queue gpuq queue_type = Execution
set queue gpuq acl_host_enable = True
set queue gpuq acl_hosts = gpu01
set queue gpuq default_chunk.Qlist = gpu
set queue gpuq enabled = True
set queue gpuq started = True
Qmgr: unset queue gpuq acl_host_enable
Qmgr: unset queue gpuq acl_hosts
Qmgr: print queue gpuq

Create queues and set their attributes.

Create and define queue gpuq

create queue gpuq
set queue gpuq queue_type = Execution
set queue gpuq default_chunk.Qlist = gpu
set queue gpuq enabled = True
set queue gpuq started = True
Qmgr: quit

This resolved job submission using the job script,

i.e. qsub cuda.pbs
1160.admin1
[hpcuser01@admin1 ~]$ qstat -an

admin1:
Req’d Req’d Elap
Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time


1160.admin1 hpcuser* gpuq gpu_job 24084 1 1 – 00:00 R 00:00
gpu01/0

However, interactive submission still had an issue:
[hpcuser01@admin1 ~]$ qsub -I -l select=1:ncpus=1:mpiprocs=1:ngpus=1 -q gpuq
qsub: Access from host not allowed, or unknown host

The interactive submission issue was also resolved. It was found to be caused by multiple host entries for the same server. We commented out the unwanted entries to fix the issue. You can use the commands below to check how the hostnames resolve, both from the admin node and from any compute/GPU node:

[root@admin1 ~]# pbs_hostn -v admin1
[root@gpu01 ~]# pbs_hostn -v admin1