Time left for a job

Hello,
Apologies if this has already been answered or the information is trivially available, but I could not find it anywhere. I have a C++ program that I am submitting to PBS. I want to check the time left via an API call from time to time so that I can checkpoint the job and not lose the work done. I cannot find whether this call exists and which it is. Thanks in advance for your help. Best regards,

Hi

You can write a Python program that interfaces directly with the PBS server, queries it, and returns the time left for a job. If you look in the tests directory of my PBSWeb site, you will find relatively simple examples.

Much easier, though, is to write a bash or Python script that parses the output of qstat. It would only take an hour or so to write, and the script could then run qalter to bump up the job's walltime or prod your C++ program to checkpoint itself.

Mike

Thanks,
cumbersome but effective. Thanks a lot again! Best,

Federico Carminati
av Ste Cécile 29
1217 Meyrin
Switzerland/Suisse
Tel: +41797732246

FWIW, I put together the following C code that you can call from within your program to get the time remaining. It does the equivalent of a qstat on the job and compares the walltime used with the time requested. The routine self-throttles to at most one server call every 30 seconds and estimates the remaining time between real calls.

One reason to compute the remaining time while the program is running is that someone might use qalter to change the job’s walltime and you would want to take that into account.

A disadvantage to this method (and any method that runs from inside your program) is that it needs to fork to run pbs_iff to validate each connection to the server. If your program has lots of mapped files, etc, this might be measurable overhead even with the throttling.

A way around this issue is to create a separate program/script that computes the time remaining every so often and writes the value to a designated file. Your main program could then read this file whenever it wanted to check the remaining time. This technique would have lower overhead for your program, but might cause more overhead overall.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <pbs_ifl.h>

char *strptime(const char *s, const char *format, struct tm *tm);

/**
 * @brief
 *	prepend a new entry to an attribute list
 *
 * (Modified from routine in qselect.c)
 *
 * Allocates one struct attropl, fills it in with private copies of
 * a_name and r_name, and links it at the head of *list.  On failure
 * nothing is leaked and *list is left unchanged.
 *
 * The entry's value member is pointed at a shared, statically allocated
 * empty string so that it is never left indeterminate.  It is NOT heap
 * memory: whoever tears the list down must free only name and resource
 * (and the entry itself), never value.
 * NOTE(review): the pbs_statjob man page asks for value to point to a
 * null string when the list is used as a query - confirm for the PBS
 * version in use.
 *
 * @param[in,out] list - head of the attribute list; updated on success
 * @param[in] a_name - attribute name (may be NULL)
 * @param[in] r_name - resource name (may be NULL)
 * @param[in] op - operation to perform (usually EQ)
 *
 * @return 0 on success, else -1
 *
 */
static int
set_attrop(struct attropl **list, char *a_name, char *r_name, enum batch_op op)
{
	/* Writable storage so the char * member never aliases a literal. */
	static char no_value[] = "";
	struct attropl *attr;

	attr = malloc(sizeof *attr);
	if (attr == NULL) {
		return -1;
	}

	/* Give every pointer member a defined value before anything can fail. */
	attr->name = NULL;
	attr->resource = NULL;
	attr->value = no_value;

	if (a_name != NULL) {
		attr->name = strdup(a_name);
		if (attr->name == NULL) {
			goto fail;
		}
	}

	if (r_name != NULL) {
		attr->resource = strdup(r_name);
		if (attr->resource == NULL) {
			goto fail;
		}
	}

	attr->op = op;
	attr->next = *list;
	*list = attr;
	return 0;

fail:
	/* free(NULL) is a no-op, so members not yet set need no guard. */
	free(attr->resource);
	free(attr->name);
	free(attr);
	return -1;
}


/**
 * @brief
 *	convert a PBS duration string to seconds
 *
 * Accepts "[[hours:]minutes:]seconds", optionally followed by a
 * fractional ".digits" part, which is ignored.  Unlike strptime's %H,
 * the leading field is not limited to 0-23, so walltimes of a day or
 * more are handled.
 *
 * @param[in] str - duration text (may be NULL)
 *
 * @return number of seconds; -1 if str is NULL, malformed, or too large
 *
 */
static long
walltime_to_seconds(const char *str)
{
	long total = 0;
	int fields = 0;

	if (str == NULL) {
		return -1;
	}

	for (;;) {
		char *end;
		long part;

		/* Insist on a digit so strtol cannot eat whitespace or a sign. */
		if (*str < '0' || *str > '9') {
			return -1;
		}
		errno = 0;
		part = strtol(str, &end, 10);
		if (errno == ERANGE) {
			return -1;
		}
		/* Refuse to overflow rather than invoke undefined behaviour. */
		if (total > (LONG_MAX - part) / 60) {
			return -1;
		}
		total = total * 60 + part;
		fields++;

		if (*end != ':') {
			str = end;
			break;
		}
		if (fields == 3) {
			return -1;	/* more than hh:mm:ss */
		}
		str = end + 1;
	}

	/* Skip an optional fractional-seconds suffix. */
	if (*str == '.') {
		str++;
		while (*str >= '0' && *str <= '9') {
			str++;
		}
	}

	return (*str == '\0') ? total : -1;
}

/**
 * @brief
 *	get remaining time for current PBS job
 *
 * Looks up the job named by the PBS_JOBID environment variable, asks
 * the server for its requested and used walltime, and returns the
 * difference.  The server is contacted at most once every INTERVAL
 * seconds; calls in between return the last answer reduced by the time
 * elapsed since it was obtained.  If the last server query failed, calls
 * inside the interval return -1 instead of extrapolating from nothing.
 *
 * The result is clamped at 0 so that a job at or past its limit is
 * never confused with the -1 error return.
 *
 * Uses static state; not safe to call from several threads at once.
 *
 * @return time, in seconds (>= 0); -1 on error
 *
 */
#define INTERVAL	30	/* How often to annoy the server for new data */
long
get_time_left(void)
{
	static time_t last_check_time = 0;
	static long last_value = -1;	/* -1: no valid answer cached */
	time_t now;
	struct attropl *query_list = NULL;
	char *job_id;
	int conn = -1;
	struct batch_status *bs = NULL;
	struct attrl *attribs;
	char *rsrc_name = "walltime";	/* resource of interest */
	long limit = -1;
	long used = -1;
	long result = -1;

	job_id = getenv("PBS_JOBID");
	if (job_id == NULL) {
		return -1;
	}

	/* To reduce load on the server and because the MoM updates the
	 * server infrequently, interpolate remaining time between updates.
	 * The now >= last_check_time test forces a fresh query if the
	 * clock was stepped backwards.
	 */
	now = time(NULL);
	if (now >= last_check_time && now - last_check_time < INTERVAL) {
		long remaining;

		if (last_value < 0) {
			return -1;	/* last query failed; stay throttled */
		}
		remaining = last_value - (long) (now - last_check_time);
		return remaining > 0 ? remaining : 0;
	}
	/* Record the attempt up front so failures are throttled too. */
	last_check_time = now;
	last_value = -1;

	/* Set up to ask for requested walltime and used walltime */
	if (set_attrop(&query_list, ATTR_l, rsrc_name, EQ) != 0 ||
	    set_attrop(&query_list, ATTR_used, rsrc_name, EQ) != 0) {
		goto cleanup;
	}

	conn = pbs_connect(NULL);
	if (conn < 0) {
		goto cleanup;
	}
	bs = pbs_statjob(conn, job_id, (struct attrl *) query_list, NULL);
	/* Because we asked about just one job, the first response will
	 * apply to that job. */
	if (bs == NULL) {
		goto cleanup;
	}

	for (attribs = bs->attribs; attribs; attribs = attribs->next) {
		int is_limit;
		long value;

		if (attribs->name == NULL || attribs->resource == NULL ||
		    attribs->value == NULL) {
			continue;
		}
		is_limit = (strcmp(attribs->name, ATTR_l) == 0);
		if (!is_limit && strcmp(attribs->name, ATTR_used) != 0) {
			continue;
		}
		if (strcmp(attribs->resource, rsrc_name) != 0) {
			continue;
		}
		value = walltime_to_seconds(attribs->value);
		if (value < 0) {
			goto cleanup;	/* unparseable reply: report error */
		}
		if (is_limit) {
			limit = value;
		} else {
			used = value;
		}
	}

	if (limit >= 0 && used >= 0) {
		result = (limit > used) ? limit - used : 0;
	}

cleanup:
	if (bs) {
		pbs_statfree(bs);
	}
	if (conn >= 0) {
		pbs_disconnect(conn);
	}
	/* Only name and resource are heap copies made by set_attrop;
	 * value is never allocated there, so it is not freed here. */
	while (query_list) {
		struct attropl *next = query_list->next;

		free(query_list->name);
		free(query_list->resource);
		free(query_list);
		query_list = next;
	}

	last_value = result;
	return result;
}

#ifdef UNIT_TEST
/* Compile with
 * gcc -g -O0 -Wall -DUNIT_TEST -I /opt/pbs/include -L /opt/pbs/lib -Wl,-rpath=/opt/pbs/lib time_left.c -lpbs
 * And run from inside a PBS job.
 */
int main(int argc, char **argv)
{
	/* Pauses, in seconds, between successive queries.  The first two
	 * fall inside one INTERVAL window of the initial query; the last
	 * one pushes the final call past that window. */
	static const unsigned int pauses[] = { 5, 5, INTERVAL };
	const size_t npauses = sizeof pauses / sizeof pauses[0];
	size_t i;

	printf("%ld\n", get_time_left());
	for (i = 0; i < npauses; i++) {
		sleep(pauses[i]);
		printf("%ld\n", get_time_left());
	}
	return 0;
}
#endif
1 Like