/* Schedwi
   Copyright (C) 2007-2015 Herve Quatremain

   This file is part of Schedwi.

   Schedwi is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Schedwi is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

/* job_launcher_main.c -- Launcher main function */

#include <schedwi.h>

#if STDC_HEADERS
#include <stdlib.h>
#include <string.h>
#endif

#if HAVE_STDIO_H
#include <stdio.h>
#endif

#if HAVE_TIME_H
#include <time.h>
#endif

#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#if HAVE_SIGNAL_H
#include <signal.h>
#endif

#if HAVE_SYS_WAIT_H
# include <sys/wait.h>
#endif
/*
 * Fallback definitions for systems whose headers do not provide the
 * wait-status macros.  They assume the layout implied by the WIFEXITED
 * fallback: the low byte is zero for a normal exit and the exit code is
 * stored in the next byte.  For a process ended by a signal, the signal
 * number is taken from the low 7 bits of the status.
 */
#ifndef WEXITSTATUS
# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
#endif
#ifndef WIFEXITED
# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
#endif
#ifndef WTERMSIG
# define WTERMSIG(stat_val) ((stat_val) & 0x7f)
#endif

#if HAVE_ERRNO_H
#include <errno.h>
#endif
#ifndef errno
extern int errno;
#endif


#include <conf.h>
#include <lwc_log.h>
#include <result_mgnt.h>
#include <signal_utils.h>
#include <lib_functions.h>
#include <utils.h>
#include <job_launcher_main.h>

/*
 * Big number for the alarm() system call used to compute the job duration.
 * On some systems (GNU/Linux on ppc for instance) this number must not be
 * too big due to limits.  The value here should be fine.
 *
 * launcher_main() arms alarm() with this value when it starts and, once
 * the child process has ended, subtracts the remaining time returned by
 * alarm(0) from it to get the elapsed number of seconds.
 */
#define VERY_BIG_NUMBER_OF_SEC 2000000

/*
 * PID of the child process (the job command).  It is set by
 * launcher_main() and read by the SIGTERM handler, which sends the
 * termination signals to that process.
 */
#if HAVE_PID_T
static pid_t child_pid;
#else
static int child_pid;
#endif


/*
 * Signal handler for SIGTERM.
 *
 * The termination request is forwarded to the child process recorded in
 * child_pid: SIGTERM is sent first and, SCHEDWI_SLEEP_BETWEEN_TERM_KILL
 * seconds later, SIGKILL.
 *
 * errno is saved on entry and restored on exit.  The waitpid() loop in
 * launcher_main() tests errno, so the handler must not leave behind a
 * value set by kill() or by the sleep function.
 *
 * sig is the number of the delivered signal (not used).
 */
static void
sigterm_action (int sig)
{
	int saved_errno = errno;
#if HAVE_NANOSLEEP
	struct timespec req;
#elif !HAVE_SLEEP && HAVE_USLEEP
	int half_sec;
#endif

	(void) sig;

	kill (child_pid, SIGTERM);

	/* Give the child some time to end by itself before killing it */
#if HAVE_NANOSLEEP
	req.tv_sec = SCHEDWI_SLEEP_BETWEEN_TERM_KILL;
	req.tv_nsec = 0;
	nanosleep (&req, NULL);
#elif HAVE_SLEEP
	sleep (SCHEDWI_SLEEP_BETWEEN_TERM_KILL);
#elif HAVE_USLEEP
	/*
	 * usleep() may reject an interval of one million microseconds or
	 * more, so the delay is split in half-second steps.
	 */
	for (	half_sec = 0;
		half_sec < 2 * (SCHEDWI_SLEEP_BETWEEN_TERM_KILL);
		half_sec++)
	{
		usleep (500000);
	}
#endif

	kill (child_pid, SIGKILL);

	errno = saved_errno;
}


/*
 * Launcher main function:
 *   - Wait for the child process (the actual job command) to end
 *   - Read its status (exit, kill)
 *   - Write it to a result file
 *   - Exit
 *
 * Parameters:
 *   child    -- PID of the child process to wait for
 *   job_id   -- Job identifier.  It is used as is in the result file name
 *               and is given to split_jobid() to get the workload number
 *               and the ID printed in the log messages
 *   job_name -- Job name printed in the log messages (NULL is printed as
 *               an empty string)
 *   open_log -- If not 0, lwc_newLog() is called with the LOG_FILE, SYSLOG
 *               and DATE_FORMAT configuration parameters
 *
 * The result file name is built from the RESULT_DIR, RESULT_PREFIX and
 * RESULT_SUFFIX configuration parameters:
 *     RESULT_DIR DIR_SEP RESULT_PREFIX job_id RESULT_SUFFIX
 *
 * Nothing is returned.  A failure to read a configuration parameter makes
 * the function return silently; the other errors are reported with
 * lwc_writeLog().
 */
void
#if HAVE_PID_T
launcher_main (	pid_t child, const char *job_id, const char *job_name,
		char open_log)
#else
launcher_main (	int child, const char *job_id, const char *job_name,
		char open_log)
#endif
{
	const char *dir_name, *prefix, *suffix;
	const char *log_file, *date_format;
	const char *name;
	long int facility;
	char *file_name = NULL, *id;
	int status, ret, workload, saved_errno;
	ssize_t l;
	unsigned int remaining, duration;
#if HAVE_PID_T
	pid_t c;
#else
	int c;
#endif
	struct sigaction sa;
	sigset_t mask;


	/*
	 * alarm() is used to compute the duration of the job (child
	 * process).  It is called the first time with a big number of
	 * seconds as a parameter (it will then never deliver the SIGALRM
	 * signal).  The second time, after the end of the child process, it
	 * is called again.  Its returned value is the remaining number of
	 * seconds from the first call.  This way we can compute the duration
	 * of the child process.
	 * We could use two calls to time(), and use difftime().  However
	 * this is not accurate as the system time may change between the
	 * two calls (daylight saving for example).
	 */
	alarm (VERY_BIG_NUMBER_OF_SEC);

	/* The SIGTERM handler needs to know which process to signal */
	child_pid = child;
	signal_mask_all ();

	if (	   conf_get_param_string ("RESULT_DIR", &dir_name) != 0
		|| conf_get_param_string ("RESULT_PREFIX", &prefix) != 0
		|| conf_get_param_string ("RESULT_SUFFIX", &suffix) != 0)
	{
		/* Internal error. Unknown parameter name or wrong type */
		alarm (0);
		goto cleanup;
	}

	if (open_log != 0) {
		if (	   conf_get_param_string ("LOG_FILE", &log_file) != 0
			|| conf_get_param_syslog_facility (	"SYSLOG",
								&facility) != 0
			|| conf_get_param_string (	"DATE_FORMAT",
							&date_format) != 0)
		{
			/*
			 * Internal error. Unknown parameter name or
			 * wrong type
			 */
			alarm (0);
			goto cleanup;
		}
		/* A facility of -1 means that syslog is not used */
		lwc_newLog (	PACKAGE_NAME,
				(facility != -1) ? 1 : 0,
				(facility != -1) ? (int)facility : 0,
				log_file,
				date_format);
	}

	/*
	 * Install the signal handler for SIGTERM and unblock this signal.
	 * NOTE(review): the handler is not removed by this function, so a
	 * SIGTERM received after the child has been reaped would still
	 * signal child_pid -- confirm whether this matters.
	 */
	schedwi_memset (&sa, 0, sizeof (struct sigaction));
	sa.sa_handler = sigterm_action;
	sigemptyset (&(sa.sa_mask));
	sa.sa_flags = 0;
	sigaction (SIGTERM, &sa, NULL);
	if (sigemptyset (&mask) == 0 && sigaddset (&mask, SIGTERM) == 0) {
		sigprocmask (SIG_UNBLOCK, &mask, NULL);
	}

	/* Wait for the child.  Start again if interrupted by a signal */
	do {
		c = waitpid (child, &status, 0);
	} while (c < 0 && errno == EINTR);

	/*
	 * Keep the waitpid() error code now: the calls below may change
	 * errno before the error message is built.
	 */
	saved_errno = errno;

	id = split_jobid (job_id, &workload);
	name = (job_name == NULL) ? "" : job_name;

	if (c < 0) {
		lwc_writeLog (	LOG_CRIT,
				_("Workload %d: %s (id %s): waitpid: %s"),
				workload,
				name,
				id,
				strerror (saved_errno));
		alarm (0);
		goto cleanup;
	}

	/* Stop the alarm and compute the duration of the child process */
	remaining = alarm (0);
	duration = VERY_BIG_NUMBER_OF_SEC - remaining;

	/* Build the name of the result file */
	l = 	  schedwi_strlen (dir_name)
		+ schedwi_strlen (DIR_SEP)
		+ schedwi_strlen (prefix)
		+ schedwi_strlen (job_id)
		+ schedwi_strlen (suffix) + 1;
	file_name = (char *) malloc (l);
	if (file_name == NULL) {
		lwc_writeLog (	LOG_CRIT,
			_("Workload %d: %s (id %s): Memory allocation error"),
				workload,
				name,
				id);
		goto cleanup;
	}

	ret = snprintf (file_name, l,
				"%s%s%s%s%s",
				dir_name, DIR_SEP, prefix, job_id, suffix);
	if (ret >= l || ret < 0) {
		lwc_writeLog (	LOG_CRIT,
		_("Workload %d: %s (id %s): Internal error: buffer too small"),
				workload,
				name,
				id);
		goto cleanup;
	}

	if (WIFEXITED (status) != 0) {
		/* The child process ended by itself */
		if (write_job_ended_to_file (
				file_name,
				job_id,
				WEXITSTATUS (status),
				duration) != 0)
		{
			saved_errno = errno;
			lwc_writeLog (	LOG_CRIT,
_("Workload %d: %s (id %s): Cannot write status (exit code: %d): %s: %s"),
					workload,
					name,
					id,
					WEXITSTATUS (status),
					file_name,
					strerror (saved_errno));
			goto cleanup;
		}
		lwc_writeLog (	LOG_INFO,
		_("Workload %d: %s (id %s): Finished with exit code %d"),
				workload,
				name,
				id,
				WEXITSTATUS (status));
	}
	else {
		/* The child process has been ended by a signal */
		if (write_job_killed_to_file (
				file_name,
				job_id,
				WTERMSIG (status),
				duration) != 0)
		{
			saved_errno = errno;
			lwc_writeLog (	LOG_CRIT,
_("Workload %d: %s (id %s): Cannot write status (killed by signal: %d): %s: %s"),
					workload,
					name,
					id,
					WTERMSIG (status),
					file_name,
					strerror (saved_errno));
			goto cleanup;
		}
		lwc_writeLog (	LOG_INFO,
			_("Workload %d: %s (id %s): Killed by signal %d"),
				workload,
				name,
				id,
				WTERMSIG (status));
	}

cleanup:
	/* file_name is still NULL if the function failed early */
	free (file_name);
	signal_unmask ();
	return;
}

/*-----------------============== End Of File ==============-----------------*/
