/*
 * task.c - manage tasks for other clients, not myself
 *
 * $Id: task.c 388 2006-11-27 17:09:48Z pw $
 *
 * Copyright (C) 2005-6 Pete Wyckoff <pw@osc.edu>
 *
 * Distributed under the GNU Public License Version 2 or later (See LICENSE)
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <signal.h>
#include <unistd.h>
#include "mpiexec.h"

/* global variable so anybody can walk it */
static LIST_HEAD(tids_list);
struct list_head *tids = &tids_list;

tids_t *
tid_add(int tid, int client, int task)
{
    tids_t *tp;

    tp = Malloc(sizeof(*tp));
    tp->tid = tid;
    tp->client = client;
    tp->task = task;
    tp->status = 0;
    list_add_tail(&tp->list, tids);
    return tp;
}

/*
 * Lookup the tid entry for a given client's task number.
 */
extern tids_t *
tid_find(int client, int task)
{
    tids_t *tp;

    list_for_each_entry(tp, tids, list)
	if (tp->client == client && tp->task == task)
	    return tp;
    return NULL;
}

/*
 * Forget about a tid and delete all events associated with it.
 */
void
tid_del(tids_t *tp)
{
    evts_t *ep;

    list_for_each_entry(ep, evts, list)
	if (ep->client == tp->client && ep->task == tp->task)
	    ep->dead = 1;
    list_del(&tp->list);
    free(tp);
}

/*
 * Debugging
 */
void
tid_dump(void)
{
    tids_t *tp;

    printf("%s\n", __func__);
    list_for_each_entry(tp, tids, list)
	printf("tid %d client %d task %d status %d\n", tp->tid, tp->client,
	  tp->task, tp->status);
}

/*
 * For debugging printfs, find the node name for a given tm_node_id.
 */
const char *
node_name_from_nid(tm_node_id nid)
{
    int i, j;

    for (i=0; i<numnodes; i++)
	for (j=0; j<nodes[i].numcpu; j++)
	    if (nodes[i].ids[j] == nid)
		return nodes[i].name;
    error("%s: no such node id %d", __func__, nid);
}

/*
 * Get rid of this task, but keep around any events it might generate.
 * Return the new event number that will report when the kill is completed.
 */
tm_event_t
kill_tid(tids_t *tp)
{
    tm_event_t evt = -1;
    int ret;

    debug(2, "%s: kill client %d task %d", __func__, tp->client, tp->task);
    ret = tm_kill(tp->tid, SIGKILL, &evt);
    if (ret == TM_SUCCESS)
	evt_add(evt, tp->client, tp->task, EVT_KILL);
    else if (ret == TM_ENOTFOUND) {
	debug(2, "%s: delete already dead client %d task %d",
	  __func__, tp->client, tp->task);
	tid_del(tp);
    } else
	error_tm(ret, "%s: tm_kill client %d task %d", __func__,
	  tp->client, tp->task);
    return evt;
}

/* set once to true if somebody's early exit caused all the others to
 * be killed */
int have_killed = 0;

/*
 * Use tm to send a signal to all tasks.
 */
void
kill_tasks(int signum)
{
    int i;

    debug(1, "%s: killing all tasks", __func__);
    for (i=0; i<numtasks; i++) {
	/* only try to tasks that are running */
	if (tasks[i].done != DONE_NOT)
	    continue;
	if (concurrent_master) {
	    tm_event_t evt;
	    int ret;
	    debug(2, "%s: kill my task %d on %s", __func__,
	      i, nodes[tasks[i].node].name);
	    ret = tm_kill(tasks[i].tid, SIGKILL, &evt);
	    if (ret == TM_SUCCESS)
		evt_add(evt, -1, i, EVT_KILL);
	    else if (ret == TM_ENOTFOUND) {
		debug(2, "%s: tried to kill my already dead task %d",
		  __func__, i);
		/* but no tid to delete, and don't mark done until obit */
	    } else
		error_tm(ret, "%s: tm_kill my task %d", __func__, i);
	} else {
	    concurrent_request_kill(i, signum);
	}
    }
    have_killed = 1;
}

/*
 * Wait for tasks to finish, if any exit with non-zero status, perhaps
 * kill the rest.  Also, if concurrent_master, pay attention to other
 * mpiexec requests.
 */
void
wait_tasks(void)
{
    int last_numspawned = numspawned+1;
    int done;
    evts_t *ep;

    /*
     * Wait for all tasks to die, and all events to dry up.
     */
    for (;;) {
	done = 1;
	if (numspawned)
	    done = 0;
	else {
	    /* see if any events left, if so, try to drain them */
	    evts_t *ep;
	    list_for_each_entry(ep, evts, list) {
		if (ep->client == -1) {  /* self, any task */
		    done = 0;
		    break;
		}
	    }
	}
	if (done)
	    break;

	if (cl_args->verbose) {
	    if (numspawned > 0 && last_numspawned != numspawned) {
		int i, chars, more;
		const int width = 80;
		const int reserve = 18;
		last_numspawned = numspawned;
		fprintf(stderr, "%s: %s: waiting for", progname, __func__);
		chars = 32;
		more = 0;
		for (i=0; i<numtasks; i++)
		    if (tasks[i].done == DONE_NOT) {
			int len = strlen(nodes[tasks[i].node].name) + 1;
			if (more || chars + len > width - reserve) {
			    ++more;
			} else {
			    fprintf(stderr, " %s", nodes[tasks[i].node].name);
			    chars += len;
			}
		    }
		if (more)
		    fprintf(stderr, " and %d others", more);
		fprintf(stderr, ".\n");
	    }
	    if (numspawned == 0)
		debug(2, "%s: tasks done, but waiting on events", __func__);
	}

	/* look for and handle an event */
	if (concurrent_master) {
	    for (;;) {
		ep = poll_event();
		if (!ep)
		    break;
		dispatch_event(ep);
	    }
	    cm_check_clients();  /* includes timeout */
	} else {
	    ep = block_event();
	    if (ep)
		dispatch_event(ep);
	}
    }
}



syntax highlighted by Code2HTML, v. 0.9.1