Hi @subhasisb ,
Thanks for the reply.
I used gdb as you suggested, and I found an error in the way hostnames are compared in the scheduler's are_we_primary() function:
Summary of the gdb session:
[root@primary scheduler]# gdb pbs_sched
GNU gdb (GDB) Red Hat Enterprise Linux 8.2-15.el8
Copyright (C) 2018 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from pbs_sched...done.
(gdb) break are_we_primary
Breakpoint 1 at 0x46ba19: file pbs_sched_utils.cpp, line 1112.
(gdb) n
The program is not being run.
(gdb) run
Starting program: openpbs/src/scheduler/pbs_sched
Missing separate debuginfos, use: yum debuginfo-install glibc-2.28-151.el8.x86_64
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Breakpoint 1, are_we_primary () at pbs_sched_utils.cpp:1112
1112 if ((c = are_we_primary()) == 1) {
Missing separate debuginfos, use: yum debuginfo-install libblkid-2.32.1-27.el8.x86_64 libgcc-8.4.1-1.el8.x86_64 libical-3.0.3-3.el8.x86_64 libicu-60.3-2.el8_1.x86_64 libmount-2.32.1-27.el8.x86_64 libstdc++-8.4.1-1.el8.x86_64 libuuid-2.32.1-27.el8.x86_64 libxcrypt-4.1.1-4.el8.x86_64 openssl-libs-1.1.1g-15.el8_3.x86_64 python3-libs-3.6.8-37.el8.rocky.x86_64 sssd-client-2.4.0-9.el8_4.1.x86_64 systemd-libs-239-45.el8_4.1.x86_64 zlib-1.2.11-17.el8.x86_64
(gdb) n
465 snprintf(server_host, sizeof(server_host), "%s", pbs_conf.pbs_leaf_name);
(gdb) n
466 endp = strchr(server_host, ','); /* find the first name */
(gdb) n
467 if (endp)
(gdb) n
469 endp = strchr(server_host, ':'); /* cut out the port */
(gdb) n
470 if (endp)
(gdb) n
479 if ((pbs_conf.pbs_secondary == NULL) && (pbs_conf.pbs_primary == NULL))
(gdb) n
481 if ((pbs_conf.pbs_secondary == NULL) || (pbs_conf.pbs_primary == NULL))
(gdb) p pbs_conf.pbs_secondary
$1 = 0x6e7b60 "sec"
(gdb) p pbs_conf.pbs_primary
$2 = 0x6e7b40 "pri"
(gdb) n
484 if (get_fullhostname(pbs_conf.pbs_primary, hn1, (sizeof(hn1) - 1)) == -1) {
(gdb) n
489 if (strcmp(hn1, server_host) == 0)
(gdb) p hn1
$3 = "primary.domain", '\000' <repeats 233 times>
(gdb) p server_host
$4 = "pri", '\000' <repeats 75 times>, "cput\000\000\000\000\200\206\272\366\377\177\000\000\300\340\377\377\377\177\000\000\003\000\000\000\000\000\000\000mem\000\377\177\000\000\000\232\060\001a˝p\340\340\377\377\377\177\000\000\b\000\000\000\000\000\000\000walltime\000\220\224\366\377\177\000\000\000\341\377\377\377\177\000\000\r\000\000\000\000\000\000\000\002", '\000' <repeats 78 times>
(gdb) s
sched_main (argc=1, argv=0x7fffffffe368, sched_ptr=<optimized out>) at pbs_sched_utils.cpp:1112
1112 if ((c = are_we_primary()) == 1) {
(gdb) n
1117 log_err(-1, "pbs_sched", "neither primary or secondary server");
(gdb) n
pbs_sched: pbs_sched, neither primary or secondary server
1118 exit(1);
(gdb) n
[Inferior 1 (process 2330) exited with code 01]
The function compares the short hostname taken from pbs_leaf_name (the server_host variable) with the FQDN (the hn1 variable) resolved from the PBS_PRIMARY and PBS_SECONDARY values, so the two never match.
When I change the code of the are_we_primary function slightly, the scheduler runs without error.
The file containing the are_we_primary function is: src/scheduler/pbs_sched_utils.cpp
Original function:
/*
 * Decide whether this scheduler host is the failover primary or secondary.
 *
 * Returns:
 *    1  neither pbs_conf.pbs_primary nor pbs_conf.pbs_secondary is set,
 *       or this host's name equals the resolved primary name
 *    0  this host's name equals the resolved secondary name
 *   -1  a host name lookup failed, only one of primary/secondary is set,
 *       or this host matches neither
 *
 * NOTE(review): when pbs_conf.pbs_leaf_name is set, server_host holds the
 * leaf name as configured (only the text after the first ',' and ':' is cut
 * off); it is NOT passed through get_fullhostname(). hn1 always is. A short
 * leaf name such as "pri" therefore never equals a resolved name such as
 * "primary.domain", and the function falls through to "return -1".
 */
are_we_primary()
{
char server_host[PBS_MAXHOSTNAME + 1];
char hn1[PBS_MAXHOSTNAME + 1];
/* Work out the name this host is known by. */
if (pbs_conf.pbs_leaf_name) {
char *endp;
/* Leaf name configured: copy it and keep only the first name, without port. */
snprintf(server_host, sizeof(server_host), "%s", pbs_conf.pbs_leaf_name);
endp = strchr(server_host, ','); /* find the first name */
if (endp)
*endp = '\0';
endp = strchr(server_host, ':'); /* cut out the port */
if (endp)
*endp = '\0';
} else if ((gethostname(server_host, (sizeof(server_host) - 1)) == -1) ||
(get_fullhostname(server_host, server_host, (sizeof(server_host) - 1)) == -1)) {
/* No leaf name: the system host name is used and expanded in place. */
log_err(-1, __func__, "Unable to get my host name");
return -1;
}
/* both secondary and primary should be set or neither set */
if ((pbs_conf.pbs_secondary == NULL) && (pbs_conf.pbs_primary == NULL))
return 1;
if ((pbs_conf.pbs_secondary == NULL) || (pbs_conf.pbs_primary == NULL))
return -1;
/* Expand the configured primary name into hn1 before comparing. */
if (get_fullhostname(pbs_conf.pbs_primary, hn1, (sizeof(hn1) - 1)) == -1) {
log_err(-1, __func__, "Unable to get full host name of primary");
return -1;
}
/* Exact, case-sensitive match against the (possibly unexpanded) local name. */
if (strcmp(hn1, server_host) == 0)
return 1; /* we are the listed primary */
/* hn1 is reused for the configured secondary name. */
if (get_fullhostname(pbs_conf.pbs_secondary, hn1, (sizeof(hn1) - 1)) == -1) {
log_err(-1, __func__, "Unable to get full host name of secondary");
return -1;
}
if (strcmp(hn1, server_host) == 0)
return 0; /* we are the secondary */
return -1; /* cannot be neither */
}
Proposed change:
/**
 * @brief Decide whether this scheduler host is the failover primary or
 *        secondary.
 *
 * The local name comes from pbs_conf.pbs_leaf_name (first entry only, port
 * removed) or, when that is unset, from gethostname() expanded in place by
 * get_fullhostname(). Before any comparison the local name is expanded into
 * srvh with get_fullhostname(), so that it is compared in the same form as
 * the expanded primary/secondary names held in hn1.
 *
 * @return 1  neither primary nor secondary is configured, or this host is
 *            the configured primary
 * @return 0  this host is the configured secondary
 * @return -1 a host name lookup failed, only one of primary/secondary is
 *            configured, or this host matches neither
 */
are_we_primary()
{
	/*
	 * Zero-filled because gethostname() below is given sizeof - 1 bytes and
	 * is not required to NUL-terminate a truncated name; the spare last byte
	 * then always terminates the string.
	 */
	char server_host[PBS_MAXHOSTNAME + 1] = {'\0'};
	char hn1[PBS_MAXHOSTNAME + 1];
	char srvh[PBS_MAXHOSTNAME + 1];

	if (pbs_conf.pbs_leaf_name) {
		char *endp;

		snprintf(server_host, sizeof(server_host), "%s", pbs_conf.pbs_leaf_name);
		endp = strchr(server_host, ','); /* find the first name */
		if (endp)
			*endp = '\0';
		endp = strchr(server_host, ':'); /* cut out the port */
		if (endp)
			*endp = '\0';
	} else if ((gethostname(server_host, (sizeof(server_host) - 1)) == -1) ||
		   (get_fullhostname(server_host, server_host, (sizeof(server_host) - 1)) == -1)) {
		log_err(-1, __func__, "Unable to get my host name");
		return -1;
	}

	/* both secondary and primary should be set or neither set */
	if ((pbs_conf.pbs_secondary == NULL) && (pbs_conf.pbs_primary == NULL))
		return 1;
	if ((pbs_conf.pbs_secondary == NULL) || (pbs_conf.pbs_primary == NULL))
		return -1;

	/*
	 * Expand the local name so it is compared in the same form as hn1.
	 * This is done only after the checks above, so a host without failover
	 * configured is not affected by a leaf name that cannot be expanded.
	 */
	if (get_fullhostname(server_host, srvh, (sizeof(srvh) - 1)) == -1) {
		/* Name the real source of the string that failed to expand. */
		if (pbs_conf.pbs_leaf_name)
			log_err(-1, __func__, "Unable to get full host name of pbs_leaf_name");
		else
			log_err(-1, __func__, "Unable to get my host name");
		return -1;
	}

	if (get_fullhostname(pbs_conf.pbs_primary, hn1, (sizeof(hn1) - 1)) == -1) {
		log_err(-1, __func__, "Unable to get full host name of primary");
		return -1;
	}
	if (strcmp(hn1, srvh) == 0)
		return 1; /* we are the listed primary */

	/* hn1 is reused for the secondary. */
	if (get_fullhostname(pbs_conf.pbs_secondary, hn1, (sizeof(hn1) - 1)) == -1) {
		log_err(-1, __func__, "Unable to get full host name of secondary");
		return -1;
	}
	if (strcmp(hn1, srvh) == 0)
		return 0; /* we are the secondary */

	return -1; /* cannot be neither */
}
I added the srvh variable:
char srvh[PBS_MAXHOSTNAME + 1];
which receives the FQDN of the PBS_LEAF_NAME value:
get_fullhostname(server_host, srvh, …
and now both variables being compared hold full hostnames:
…strcmp(hn1, srvh)…
I am not a programmer, so this may need to be solved in a different way, but for me this works well.
Regards!