diff -ruN pbs-2.3.12-mpiexec/doc/man8/pbs_mom.8B pbs-2.3.12-mom-restart/doc/man8/pbs_mom.8B
--- pbs-2.3.12-mpiexec/doc/man8/pbs_mom.8B	Tue Aug  8 20:16:28 2000
+++ pbs-2.3.12-mom-restart/doc/man8/pbs_mom.8B	Thu Sep  4 11:10:03 2003
@@ -320,10 +320,12 @@
 results in a log file entry. The signal is used to limit the time taken by
 certain children processes, such as the prologue and epilogue.
 .IP "SIGINT and SIGTERM"
-Result in pbs_mom terminating all running children and exiting.  This is the
-action for the following signals as well: SIGXCPU, SIGXFSZ, SIGCPULIM,
-and SIGSHUTDN.
-.IP "SIGPIPE, SIGUSR1, SIGUSR2, SIGINFO"
+Result in pbs_mom exiting without terminating any running jobs.
+This is the action for the following signals as well: SIGXCPU, SIGXFSZ,
+SIGCPULIM, and SIGSHUTDN.
+.IP SIGUSR1
+Causes pbs_mom to kill all running jobs on the node, then exit.
+.IP "SIGPIPE, SIGUSR2, SIGINFO"
 are ignored.
 .LP
 All other signals have their default behavior installed.
diff -ruN pbs-2.3.12-mpiexec/src/include/mom_func.h pbs-2.3.12-mom-restart/src/include/mom_func.h
--- pbs-2.3.12-mpiexec/src/include/mom_func.h	Tue Aug  8 20:17:01 2000
+++ pbs-2.3.12-mom-restart/src/include/mom_func.h	Thu Sep  4 11:10:03 2003
@@ -134,6 +134,7 @@
 extern void  mom_freenodes A_((job *));
 extern void  scan_for_exiting();
 extern void  scan_for_terminated();
+extern void scan_non_child_tasks(void);
 extern int   set_job A_((job *, struct startjob_rtn *));
 extern void  set_globid A_((job *, struct startjob_rtn *));
 extern int   set_mach_vars A_((job *, struct var_table *));
diff -ruN pbs-2.3.12-mpiexec/src/resmom/catch_child.c pbs-2.3.12-mom-restart/src/resmom/catch_child.c
--- pbs-2.3.12-mpiexec/src/resmom/catch_child.c	Tue Aug  8 20:17:56 2000
+++ pbs-2.3.12-mom-restart/src/resmom/catch_child.c	Thu Sep  4 11:10:03 2003
@@ -699,6 +699,10 @@
 		if (pj == NULL)
 			continue;
 
+		/* set the globid so mom does not coredump in response
+		 * to tm_spawn */
+		set_globid(pj, 0);
+
 		append_link(&svr_alljobs, &pj->ji_alljobs, pj);
 		job_nodes(pj);
 		task_recov(pj);
diff -ruN pbs-2.3.12-mpiexec/src/resmom/linux/mom_mach.c pbs-2.3.12-mom-restart/src/resmom/linux/mom_mach.c
--- pbs-2.3.12-mpiexec/src/resmom/linux/mom_mach.c	Tue Sep  5 20:12:46 2000
+++ pbs-2.3.12-mom-restart/src/resmom/linux/mom_mach.c	Thu Sep  4 11:10:03 2003
@@ -111,6 +111,7 @@
 #include "job.h"
 #include "log.h"
 #include "mom_mach.h"
+#include "mom_func.h"
 #include "resmon.h"
 #include "../rm_dep.h"
 
@@ -1052,8 +1053,22 @@
 		}
 
 		if (sesid == ps->session) {
-			(void)kill(ps->pid, sig);
-			++ct;
+			if (ps->pid == 0) {
+			    sprintf(log_buffer,
+			      "%s: not killing pid 0 with sig %d",
+			      __func__, sig);
+			    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+			      ptask->ti_job->ji_qs.ji_jobid, log_buffer);
+			} else {
+			    sprintf(log_buffer,
+			      "%s: killing pid %d task %d with sig %d",
+			      __func__, ps->pid, ptask->ti_qs.ti_task, sig);
+			    log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+			      ptask->ti_job->ji_qs.ji_jobid, log_buffer);
+
+			    (void)kill(ps->pid, sig);
+			    ++ct;
+			}
 		}
 	}
 	return ct;
@@ -2187,3 +2202,60 @@
 
 	return ret_string;
 }
+
+/*
+ * Poll for exited tasks that mom cannot collect as its own children.
+ * For a recovering (-p) mom, look through existing tasks in existing
+ * jobs for things that have exited that are not owned by us through a
+ * parent-child relationship.  Otherwise we cannot report back to tm
+ * clients when tasks have exited.  A task still marked running whose
+ * session id matches no entry read from pdir is marked exited, saved,
+ * and exiting_tasks is set.  The real exit status is unknown; 0 is used.
+ */
+void
+scan_non_child_tasks(void)
+{
+    job *pjob;
+    extern list_head svr_alljobs;
+    extern int exiting_tasks;
+
+    for (pjob = GET_NEXT(svr_alljobs); pjob;
+      pjob = GET_NEXT(pjob->ji_alljobs)) {
+	task *ptask;
+	for (ptask = GET_NEXT(pjob->ji_tasks); ptask;
+	  ptask = GET_NEXT(ptask->ti_jobtask)) {
+	    struct dirent *dent;
+	    char buf[1024];
+	    int found = 0;
+
+	    /* only check on tasks that we think should still be around */
+	    if (ptask->ti_qs.ti_status != TI_STATE_RUNNING)
+		continue;
+
+	    /* look for processes with this session id */
+	    rewinddir(pdir);
+	    while (!found && (dent = readdir(pdir)) != NULL) {
+		proc_stat_t *ps;
+		/* cast: isdigit() is undefined for negative char values */
+		if (!isdigit((unsigned char)dent->d_name[0]))
+		    continue;
+		ps = get_proc_stat(atoi(dent->d_name));
+		if (ps && ps->session == ptask->ti_qs.ti_sid)
+		    found = 1;
+	    }
+	    if (found)
+		continue;
+
+	    snprintf(buf, sizeof(buf),
+	      "found exited session %d for task %d in job %s",
+	      ptask->ti_qs.ti_sid, ptask->ti_qs.ti_task, pjob->ji_qs.ji_jobid);
+	    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, buf);
+
+	    ptask->ti_qs.ti_exitstat = 0;  /* actually unknown */
+	    ptask->ti_qs.ti_status = TI_STATE_EXITED;
+	    task_save(ptask);
+	    exiting_tasks = 1;
+	}
+    }
+}
+
diff -ruN pbs-2.3.12-mpiexec/src/resmom/linux/mom_start.c pbs-2.3.12-mom-restart/src/resmom/linux/mom_start.c
--- pbs-2.3.12-mpiexec/src/resmom/linux/mom_start.c	Tue Aug  8 20:18:11 2000
+++ pbs-2.3.12-mom-restart/src/resmom/linux/mom_start.c	Thu Sep  4 11:10:03 2003
@@ -264,7 +264,8 @@
 		ptask->ti_qs.ti_exitstat = exiteval;
 		ptask->ti_qs.ti_status = TI_STATE_EXITED;
 		task_save(ptask);
-		sprintf(log_buffer, "task %d terminated", ptask->ti_qs.ti_task);
+		sprintf(log_buffer, "%s: task %d terminated, sid %d",
+		  __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid);
 		LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
 			pjob->ji_qs.ji_jobid, log_buffer);
 
diff -ruN pbs-2.3.12-mpiexec/src/resmom/mom_main.c pbs-2.3.12-mom-restart/src/resmom/mom_main.c
--- pbs-2.3.12-mpiexec/src/resmom/mom_main.c	Mon Jan 15 16:50:22 2001
+++ pbs-2.3.12-mom-restart/src/resmom/mom_main.c	Thu Sep  4 11:10:03 2003
@@ -181,10 +181,11 @@
 /* Local Data Items */
 
 static char	*log_file = (char *)0;
-static int	mom_run_state;
+static volatile enum { MOM_RUN_STATE_RUNNING, MOM_RUN_STATE_EXIT, MOM_RUN_STATE_KILLALL } mom_run_state; /* written by signal handler stop_me() */
 static int	call_hup = 0;
 static int	nconfig;
 static char	*path_log;
+static int recover = 0;
 
 struct	config_list {
 	struct	config		c;
@@ -232,7 +233,7 @@
 
 /* Local public functions */
 
-void stop_me A_((int));
+static void stop_me A_((int sig));
 
 /* Local private functions */
 
@@ -1542,9 +1543,7 @@
  *	Kill a job.
  *	Call with the job pointer and a signal number.
  */
-int kill_job(pjob, sig)
-    job		*pjob;
-    int		sig;
+int kill_job(job *pjob, int sig)
 {
 	task	*ptask;
 	int	ct = 0;
@@ -1555,10 +1554,15 @@
 	ptask = (task *)GET_NEXT(pjob->ji_tasks);
 	while (ptask) {
 		if (ptask->ti_qs.ti_status == TI_STATE_RUNNING) {
+			log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+			  pjob->ji_qs.ji_jobid,
+			  "kill_job found a task to kill");
 			ct += kill_task(ptask, sig);
 		}
 		ptask = (task *)GET_NEXT(ptask->ti_jobtask);
 	}
+	log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+	  pjob->ji_qs.ji_jobid, "kill_job done");
 	return ct;
 }
 
@@ -1577,6 +1581,9 @@
 
 	if (termin_child)
 		scan_for_terminated();
+	/* if -p, must poll tasks inside jobs to look for completion */
+	if (recover == 2)
+		scan_non_child_tasks();
 	if (exiting_tasks)
 		scan_for_exiting();	
 
@@ -1771,7 +1778,6 @@
 	char		*mom_home;
 	task		*ptask;
 	char		*ptr;
-	int	 	recover = 0;
 	int		tryport;
 	int		rppfd;			/* fd for rm and im comm */
 	int		privfd;			/* fd for sending job info */
@@ -2065,7 +2071,6 @@
 	*/
 	act.sa_handler = SIG_IGN;
 	sigaction( SIGPIPE, &act, NULL);
-	sigaction( SIGUSR1, &act, NULL);
 	sigaction( SIGUSR2, &act, NULL);
 #ifdef	SIGINFO
 	sigaction( SIGINFO, &act, NULL);
@@ -2101,6 +2106,7 @@
 	act.sa_handler = stop_me;	/* shutdown for these */
 	sigaction( SIGINT, &act, NULL);
 	sigaction( SIGTERM, &act, NULL);
+	sigaction( SIGUSR1, &act, NULL);
 #ifdef	SIGXCPU
 	sigaction(SIGXCPU, &act, NULL);
 #endif
@@ -2213,7 +2219,8 @@
 	 * section constitutes the "main" loop of MOM
 	 */
 
-	for (mom_run_state=1; mom_run_state; finish_loop(wait_time)) {
+	mom_run_state = MOM_RUN_STATE_RUNNING;
+	for (; mom_run_state == MOM_RUN_STATE_RUNNING; finish_loop(wait_time)) {
 
 		if (call_hup)
 			process_hup();
@@ -2374,25 +2381,27 @@
 		}
 	}
 
-	/* kill any running jobs */
-
-	pjob = (job *)GET_NEXT(svr_alljobs);
-	while (pjob) {
-		if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) {
-			(void)kill_job(pjob, SIGKILL);
-			pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
-			job_save(pjob, SAVEJOB_QUICK);
-		}
-		else
-			term_job(pjob);
-
-		pjob = (job *)GET_NEXT(pjob->ji_alljobs);
+	if (mom_run_state == MOM_RUN_STATE_KILLALL) {
+	    /* kill any running jobs */
+	    pjob = (job *)GET_NEXT(svr_alljobs);
+	    while (pjob) {
+		    if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) {
+			    (void)kill_job(pjob, SIGKILL);
+			    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
+			    job_save(pjob, SAVEJOB_QUICK);
+		    }
+		    else
+			    term_job(pjob);
+
+		    pjob = (job *)GET_NEXT(pjob->ji_alljobs);
+	    }
+
+	    if (termin_child)
+		    scan_for_terminated();
+	    if (exiting_tasks)
+		    scan_for_exiting();	
 	}
 
-	if (termin_child)
-		scan_for_terminated();
-	if (exiting_tasks)
-		scan_for_exiting();	
 	(void)mom_close_poll();
 
 	rpp_shutdown();
@@ -2427,11 +2436,22 @@
  * stop_me = signal handler for SIGTERM
  */
 
-void stop_me(sig)
-    int	sig;
+static void
+stop_me(int sig)
 {
-	sprintf(log_buffer, "caught signal %d", sig);
+	const char *dowhat;	/* action description appended to the log entry */
+
+	if (sig == SIGUSR1) {
+	    /* kill all jobs, then exit: the code after the main loop kills running jobs only in this state */
+	    mom_run_state = MOM_RUN_STATE_KILLALL;
+	    dowhat = "killing all jobs then exiting";
+	} else {
+	    /* just exit, leaving jobs running (every other signal routed to this handler) */
+	    mom_run_state = MOM_RUN_STATE_EXIT;
+	    dowhat = "leaving jobs running, just exiting";
+	}
+	/* either state differs from MOM_RUN_STATE_RUNNING, so the main loop condition fails */
+	sprintf(log_buffer, "caught signal %d: %s", sig, dowhat);
 	log_record(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER,
 		  msg_daemonname, log_buffer);
-	mom_run_state = 0;
 }
diff -ruN pbs-2.3.12-mpiexec/src/resmom/start_exec.c pbs-2.3.12-mom-restart/src/resmom/start_exec.c
--- pbs-2.3.12-mpiexec/src/resmom/start_exec.c	Thu Sep  4 11:10:44 2003
+++ pbs-2.3.12-mom-restart/src/resmom/start_exec.c	Thu Sep  4 11:10:03 2003
@@ -191,8 +191,7 @@
 }
 
 struct passwd *
-check_pwd(pjob)
-	job *pjob;
+check_pwd(job *pjob)
 {
 	struct passwd		*pwdp;
 	struct group		*grpp;
@@ -1490,6 +1489,17 @@
 	}
 
 	/*
+	 * A restarted mom will not have called this yet, but it is needed
+	 * to spawn tasks (ji_grpcache).
+	 */
+	if (!check_pwd(pjob)) {
+	    sprintf(log_buffer, "job %s task %d check_pwd failed",
+	      pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task);
+	    log_err(-1, id, log_buffer);
+	    return -1;
+	}
+
+	/*
 	** Begin a new process for the fledgling task.
 	*/
 	if ((pid = fork_me(-1)) == -1)
@@ -1538,7 +1548,9 @@
 			pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
 			job_save(pjob, SAVEJOB_QUICK);
 		}
-		(void)sprintf(log_buffer, "task started, %s", argv[0]);
+		(void)sprintf(log_buffer,
+		  "%s: task started, tid %d, sid %d, cmd %s",
+		  __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid, argv[0]);
 		log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
 			pjob->ji_qs.ji_jobid, log_buffer);
 		return 0;


syntax highlighted by Code2HTML, v. 0.9.1