/*
* event.c - handle events
*
* $Id: event.c 388 2006-11-27 17:09:48Z pw $
*
* Copyright (C) 2005-6 Pete Wyckoff <pw@osc.edu>
*
* Distributed under the GNU Public License Version 2 or later (See LICENSE)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* memcpy */
#include <signal.h>
#include <unistd.h>
#include <errno.h>
#include "mpiexec.h"
/*
 * List of event structures created by evt_add() and removed by evt_del().
 * The list head itself is file-local; the exported pointer "evts" is the
 * global handle through which anybody can walk the list.
 */
static LIST_HEAD(evts_list);
struct list_head *evts = &evts_list;
/*
* Using a full linked list for reasonable speed on insertion and deletion,
* at the cost of extra pointer operations and storage. Cannot use an auto-
* growing array as TM is given pointers into these structs so they must
* remain stable.
*/
/*
 * Allocate a new event record, fill it in from the arguments, and append
 * it to the tail of the global event list.
 *
 *   evt     event number used later as the key for evt_lookup()
 *   client  owning client; callers in this file pass -1 for our own events
 *   task    index of the task the event refers to
 *   type    kind of event (start, obit, kill)
 *
 * A fresh record always starts with dead == 0 and obit_evt == 0.  The
 * result of Malloc() is used unchecked; presumably that wrapper deals with
 * allocation failure itself (TODO confirm).
 */
void
evt_add(int evt, int client, int task, evt_type_t type)
{
    evts_t *node = Malloc(sizeof(*node));

    /* payload supplied by the caller */
    node->evt = evt;
    node->client = client;
    node->task = task;
    node->type = type;

    /* bookkeeping fields start out cleared */
    node->dead = 0;
    node->obit_evt = 0;

    /* link at the end so the list stays in insertion order */
    INIT_LIST_HEAD(&node->list);
    list_add_tail(&node->list, evts);
}
/*
 * Find the event record whose event number equals evt.
 *
 * Returns the first matching record on the global list, or NULL if there
 * is none.  The search is a linear walk of the list.
 */
evts_t *
evt_lookup(int evt)
{
    evts_t *cur;
    evts_t *match = NULL;

    list_for_each_entry(cur, evts, list) {
        if (cur->evt == evt) {
            match = cur;
            break;
        }
    }
    return match;
}
/*
 * Unlink an event record from the list it is on and release its storage.
 * The pointer must not be used by the caller afterwards.
 */
void
evt_del(evts_t *ep)
{
    struct list_head *link = &ep->list;

    list_del(link);
    free(ep);
}
/*
 * Map an event type to a short, constant, human-readable name for use in
 * log and debug output.  Unrecognized values yield a placeholder string.
 */
const char *
evt_type_string(evt_type_t type)
{
    const char *name = "(unknown event type)";

    switch (type) {
    case EVT_START:
        name = "start";
        break;
    case EVT_OBIT:
        name = "obit";
        break;
    case EVT_KILL:
        name = "kill";
        break;
    default:
        break;
    }
    return name;
}
void
evt_dump(void)
{
evts_t *ep;
printf("%s\n", __func__);
list_for_each_entry(ep, evts, list)
printf("evt %d client %d task %d type %s dead %d obit_evt %d\n",
ep->evt, ep->client, ep->task, evt_type_string(ep->type),
ep->dead, ep->obit_evt);
}
/*
 * SIGALRM handler.  It is armed by this process itself with a delay (via
 * alarm), or the signal may be generated by the stdio listener in response
 * to an MPI_Abort.
 *
 * Restores the default SIGALRM disposition, then sends SIGKILL to the
 * tasks through kill_tasks().  Nothing is reaped here; the normal event
 * loop still waits for the tasks to die.
 */
static void
kill_others_now(int sig ATTR_UNUSED)
{
    const int sigs[] = { SIGALRM };

    debug(1, "%s: alarm went off, killing all other tasks", __func__);

    /* put SIGALRM back to its default action */
    handle_signals(sigs, list_count(sigs), SIG_DFL);

    /* hard kill of the rest of the tasks */
    kill_tasks(SIGKILL);
}
/*
 * Handle the obituary (exit notification) of one of our own tasks.
 *
 * Records how the task finished, possibly schedules the killing of all
 * remaining tasks, and decrements the count of spawned tasks.
 */
static void
process_obit_event(evts_t *ep)
{
    debug(1, "%s: evt %d task %d on %s stat %d",
          __func__, ep->evt, ep->task, nodes[tasks[ep->task].node].name,
          *tasks[ep->task].status);

    /* an exit before startup finished is recorded differently */
    tasks[ep->task].done = startup_complete ? DONE_OK
                                            : DONE_STARTUP_INCOMPLETE;

    /*
     * Unless that has already been arranged, kill everybody else when
     * either the --kill command-line argument was given, or the exit
     * status indicates a non-normal exit (i.e. a segv takes everyone
     * down, but a plain exit(1) does not).
     */
    if (!have_killed
        && (*tasks[ep->task].status >= PBS_SIG_OFFSET
            || cl_args->kill_others)) {
        const int sigs[] = { SIGALRM };

        /* give the others a little while before the hard kill */
        handle_signals(sigs, list_count(sigs), kill_others_now);
        alarm(5);
        have_killed = 1;
    }

    --numspawned;
}
/*
 * Handle completion of a kill request for one of our own tasks.  Only a
 * debug message is produced; the task is not marked finished here because
 * its obit event is still expected.
 */
static void
process_kill_event(evts_t *ep)
{
    debug(1, "%s: evt %d task %d on %s",
          __func__, ep->evt, ep->task,
          nodes[tasks[ep->task].node].name);
    /* nothing else to do, the obit does the real work */
}
/*
 * Handle the start notification of one of our own tasks.
 *
 * Decrements the count of tasks still waiting to start, then arranges for
 * an obit event so the task's exit can be observed:
 *   - concurrent master: request one directly with tm_obit();
 *   - otherwise: use the obit event number already stored in ep->obit_evt,
 *     where -1 means the task had already died.
 *
 * If no obit can be obtained because the task is already gone, the task
 * is marked DONE_NO_EXIT_STATUS, its status is set to -1 (no valid exit
 * status is available), and the spawned-task count is decremented.
 */
static void
process_start_event(evts_t *ep)
{
    int died = 0;  /* set when the task exited before an obit could be set up */

    debug(1, "%s: evt %d task %d on %s", __func__, ep->evt, ep->task,
          nodes[tasks[ep->task].node].name);
    --numtasks_waiting_start;

    /* ask for an obit */
    if (concurrent_master) {
        /* (master does not enter his own tasks into the tids array) */
        tm_event_t evt;
        int err = tm_obit(tasks[ep->task].tid, tasks[ep->task].status, &evt);

        if (err == TM_SUCCESS)
            evt_add(evt, -1, ep->task, EVT_OBIT);
        else if (err == TM_ENOTFOUND)
            died = 1;
        else
            /* report the code tm_obit actually returned */
            error_tm(err, "%s: tm_obit master %d", __func__, ep->task);
    } else {
        if (ep->obit_evt == -1)
            died = 1;
        else
            evt_add(ep->obit_evt, -1, ep->task, EVT_OBIT);
    }

    if (died) {
        /* mark done if obit failed due to task not found, and status
         * field will be invalid */
        debug(1, "%s: task %d on %s too fast, no obit", __func__,
              ep->task, nodes[tasks[ep->task].node].name);
        tasks[ep->task].done = DONE_NO_EXIT_STATUS;
        *tasks[ep->task].status = -1;
        --numspawned;
    }
}
/*
 * Work the effects of this event into the data structures.
 *
 * When running as concurrent master, an event marked dead is merely
 * logged, and an event that belongs to a client (client >= 0) is handed
 * to cm_forward_event().  Everything else is handled locally according to
 * its type.
 *
 * Every path ends by passing ep to evt_del(), so the caller must not touch
 * ep after this function returns.
 */
void
dispatch_event(evts_t *ep)
{
    if (concurrent_master && ep->dead) {
        /* event for a deleted tid, nothing to do */
        debug(2, "%s: ignoring dead event %d client %d task %d type %s",
              __func__, ep->evt, ep->client, ep->task,
              evt_type_string(ep->type));
    } else if (concurrent_master && ep->client >= 0) {
        /* belongs to a client, pass it along */
        cm_forward_event(ep);
    } else {
        /* one of our own events */
        switch (ep->type) {
        case EVT_OBIT:
            process_obit_event(ep);
            break;
        case EVT_KILL:
            process_kill_event(ep);
            break;
        case EVT_START:
            process_start_event(ep);
            break;
        default:
            error("%s: unknown event type %d", __func__, ep->type);
        }
    }

    evt_del(ep);
}
/*
 * Grab the next tm event and return its event record, or NULL when there
 * is nothing to process.
 *
 * Non-master: the work is delegated to concurrent_poll(block).
 *
 * Concurrent master: "block" is not consulted.  tm_poll() is always called
 * non-blocking, because the caller must get control back to check other
 * things.  If no event is returned and a pipe to the stdio listener is
 * open, that pipe is checked with a zero-timeout select() and any pending
 * message is read.
 */
static evts_t *
poll_or_block_event(int block)
{
    evts_t *found = NULL;
    tm_event_t evt;
    int remote_tm_error;
    int err;

    if (!concurrent_master)
        return concurrent_poll(block);

    /* poll without blocking; if the connection dropped, reconnect and retry */
    while ((err = tm_poll(TM_NULL_EVENT, &evt, 0, &remote_tm_error))
           == TM_ENOTCONNECTED)
        reconnect_to_mom();

    if (err == TM_SUCCESS) {
        /* valid event, but perhaps did not finish correctly */
        if (evt != TM_NULL_EVENT && remote_tm_error != TM_SUCCESS) {
            if (remote_tm_error == TM_ESYSTEM)
                /* issue warning, but look at event anyway */
                warning("%s: evt %d remote system error", __func__, evt);
            else
                error_tm_or_pbs(remote_tm_error,
                                "%s: tm_poll remote %d", __func__,
                                remote_tm_error);
        }
    } else if (err == TM_ENOTFOUND) {
        /* happens for -server when no tasks */
        evt = TM_NULL_EVENT;
    } else {
        error_tm(err, "%s: tm_poll", __func__);
    }

    /* translate the event number into our own record */
    if (evt != TM_NULL_EVENT) {
        found = evt_lookup(evt);
        if (!found)
            error("%s: no event structure for %d", __func__, evt);
    }

    /*
     * Check stdio listener.  Non-master equivalent of this code is
     * pushed down inside select() in concurrent_poll.
     */
    if (found == NULL && pipe_with_stdio >= 0) {
        struct timeval no_wait = { 0, 0 };
        fd_set readable;
        int nready;

        FD_ZERO(&readable);
        FD_SET(pipe_with_stdio, &readable);
        nready = select(pipe_with_stdio + 1, &readable, 0, 0, &no_wait);
        if (nready > 0) {
            stdio_msg_parent_read();
        } else if (nready < 0 && errno != EINTR) {
            /* an interrupted select is harmless, anything else is not */
            error_errno("%s: select", __func__);
        }
    }

    return found;
}
/*
 * Fetch the next event without asking to block; the result may be NULL
 * when there is nothing to process.
 */
evts_t *
poll_event(void)
{
    return poll_or_block_event(0);
}
/*
 * Fetch the next event, asking the underlying poll to block (block == 1).
 */
evts_t *
block_event(void)
{
    return poll_or_block_event(1);
}
/*
 * Drain pending events without blocking, dispatching each one in turn.
 *
 * Stops and returns 1 as soon as an OBIT event of our own (client == -1)
 * has been dispatched.  Returns 0 when the supply of events runs out
 * without such an obit, including when there were no events at all.
 */
int
poll_events_until_obit(void)
{
    for (;;) {
        evts_t *ep = poll_event();
        int own_obit;

        if (!ep)
            return 0;

        /* sample before dispatch_event(), which frees the record */
        own_obit = (ep->client == -1 && ep->type == EVT_OBIT);

        dispatch_event(ep);
        if (own_obit)
            return 1;
    }
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */