/*
 * mpiexec.c - gather node settings from PBS, prepare for MPI runtime
 * environment and start tasks through the pbs task manager interface.
 * Attempts to duplicate mpirun  as much as possible, while getting
 * everything correct, and being faster than rsh.
 *
 * $Id: mpiexec.c 391 2006-11-27 22:17:25Z pw $
 *
 * Copyright (C) 2000-6 Pete Wyckoff <pw@osc.edu>
 *
 * Distributed under the GNU Public License Version 2 or later (See LICENSE)
 */
#define _GNU_SOURCE  /* hoping to get strsignal() from string.h */
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <pwd.h>
#include <setjmp.h>
#include <signal.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <netdb.h>       /* gethostbyname */
#include "mpiexec.h"

/*
 * Define some globals.
 */
nodes_t *nodes;
tasks_t *tasks;
spawns_t *spawns;
cl_args_t *cl_args;

const char *progname;
char *progname_dir;
char *jobid;
int numnodes;
int numtasks;
int numspawns;
struct passwd *pswd;
struct sockaddr_in myaddr;

/*
 * Ensure it's executable.  Return true if so.
 */
int
stat_exe(const char *exe, int complain)
{
    struct stat sb;
    int rc;

    debug(3, "%s: testing \"%s\"", __func__, exe);
    rc = stat(exe, &sb);
    if (rc < 0) {
	if (complain)
	    error_errno("%s: %s", __func__, exe);
	return 0;
    }
    if (!S_ISREG(sb.st_mode)) {
	if (complain)
	    error("%s: file %s is not a regular file", __func__, exe);
	return 0;
    }
    if (access(exe, X_OK) < 0) {
	if (complain)
	    error("%s: file %s is not executable", __func__, exe);
	return 0;
    }
    return 1;
}

/*
 * Ensure the executable is in the user's path, or if it's in the cwd,
 * add a "." assuming that's what they meant.  XXX: Security risk?
 * Always returns a new string.
 */
char *
resolve_exe(const char *exe, int argv0_dir)
{
    const char *cp, *cq;
    growstr_t *g;

    /* absolute or relative (non-pathed) location? */
    if (strchr(exe, '/')) {
	stat_exe(exe, 1);
	debug(1, "%s: using absolute path \"%s\"", __func__, exe);
	return strsave(exe);
    }

    g = growstr_init();

    /* if requested, and there was a path-based invocation, lookup in the
     * same directory as argv[0], first */
    if (argv0_dir && progname_dir) {
	growstr_zero(g);
	/* already includes trailing slash */
	growstr_printf(g, "%s%s", progname_dir, exe);
	if (stat_exe(g->s, 0)) {
	    char *ret;
	    debug(1, "%s: found \"%s\" in argv[0] dir", __func__, g->s);
	    ret = strsave(g->s);
	    growstr_free(g);
	    return ret;
	}
    }
    
    /* look in path */
    cp = getenv("PATH");
    if (cp) {
	while (*cp) {
	    cq = strchr(cp, ':');
	    if (!cq)
		cq = cp + strlen(cp);

	    if (cq != cp) {
		growstr_zero(g);
		growstr_append(g, cp);
		g->s[g->len = cq-cp] = 0;  /* just up to, not incl, ':' */
		growstr_printf(g, "/%s", exe);
		if (stat_exe(g->s, 0)) {
		    debug(1, "%s: found \"%s\" in path", __func__, g->s);
		    growstr_free(g);
		    return strsave(exe);
		}
	    }
	    cp = cq;
	    if (*cp)
		++cp;  /* skip : */
	}
    }

    /* look in . */
    if (stat_exe(exe, 0)) {
	char *ret;
	growstr_zero(g);
	growstr_printf(g, "./%s", exe);
	debug(1, "%s: found \"%s\" in current directory", __func__, g->s);
	ret = strsave(g->s);
	growstr_free(g);
	return ret;
    }

    error("%s: executable \"%s\" not found in path or current dir",
      __func__, exe);

    /*NOTREACHED*/
    return 0;
}

/*
 * Convert a unix signal number to a symbolic form.  If the handy function
 * does not exist, just look for a few of the more popular ones.
 */
const char *
parse_signal_number(int sig)
{
#ifdef HAVE_STRSIGNAL
    const char *s = strsignal(sig);
    if (!s)
	s = "unknown";
    return s;
#else
    /* just try to get some of the big ones */
#   if defined(SIGILL)
    if (sig == SIGILL) return "SIGILL";
#   endif
#   if defined(SIGBUS)
    if (sig == SIGBUS) return "SIGBUS";
#   endif
#   if defined(SIGKILL)
    if (sig == SIGKILL) return "SIGKILL";
#   endif
#   if defined(SIGSEGV)
    if (sig == SIGSEGV) return "SIGSEGV";
#   endif
#   if defined(SIGTERM)
    if (sig == SIGTERM) return "SIGTERM";
#   endif
    return "unknown";
#endif
}

static int killall_sig = 0;
static jmp_buf jmp_env;

/*
 * Signal handling.
 */
void
killall(int sig)
{
    static int killall_count = 0;

    debug(1, "%s: caught signal %d (%s)", __func__, sig,
      parse_signal_number(sig));
    ++killall_count;
    killall_sig = sig;
    longjmp(jmp_env, killall_count);
}

/*
 * Enable one signal handler for a list of signals.  Do not defer
 * signal reception while handling these, to let the impatient user
 * interrupt again to really exit.
 */
void
handle_signals(const int *list, int num, void (*handler)(int sig))
{
    const int default_siglist[] = { SIGHUP, SIGINT, SIGTERM };
    struct sigaction act;
    int i, ret;

    if (!list) {
	list = default_siglist;
	num = list_count(default_siglist);
    }
    sigemptyset(&act.sa_mask);
    act.sa_flags = SA_NODEFER;
    act.sa_handler = handler;
    for (i=0; i<num; i++) {
	ret = sigaction(list[i], &act, 0);
	if (ret < 0)
	    error_errno("%s: sigaction %d", __func__, list[i]);
    }
}

/*
 * Just print a little version string.
 */
static void
version(FILE *fp)
{
    if (!strcmp(CONFIGURE_OPTIONS, ""))
	fprintf(fp, "Version %s, no configure options\n", VERSION);
    else
	fprintf(fp, "Version %s, configure options: %s\n", VERSION,
	  CONFIGURE_OPTIONS);
}

/*
 * Environment variable which can specify -comm if no command-line argument
 * is given.
 */
static const char MPIEXEC_COMM_ENV[] = "MPIEXEC_COMM";

/*
 * String verison of communication library name.
 */
static const char *
comm_name(int which)
{
    static const struct {
	int num;
	const char *const name;
    } name[] = {
	{ COMM_MPICH_GM, "mpich-gm" },
	{ COMM_MPICH_P4, "mpich-p4" },
	{ COMM_MPICH_IB, "mpich-ib" },
	{ COMM_MPICH_RAI, "mpich-rai" },
	{ COMM_MPICH2_PMI, "mpich2-pmi" },
	{ COMM_LAM, "lam" },
	{ COMM_SHMEM, "shmem" },
	{ COMM_EMP, "emp" },
	{ COMM_NONE, "none" },
    };
    int i;

    for (i=0; i<list_count(name); i++)
	if (name[i].num == which)
	    return name[i].name;
    return "unknown-comm";
}

/*
 * Usage.
 */
static void
usage(void)
{
    fprintf(stderr, "Usage: %s [<args>] <executable> [<exe args>]...\n",
      progname);
    fprintf(stderr, "   or: %s [<args>] -config[=]<file>\n",
      progname);
    fprintf(stderr, "   or: %s [<args>] -server\n",
      progname);
    fprintf(stderr,
      "  -n <numproc> : use only some of the allocated processors\n");
    fprintf(stderr,
      "     Default behavior allocates one process per allocated processor.\n");
    fprintf(stderr,
      "  -verbose : be verbose about mpiexec operation\n");
    fprintf(stderr,
      "  -nostdin : do not listen to stdin, allowing process to go into background\n");
    fprintf(stderr,
      "  -allstdin : send stdin to all processes (default just proc #0)\n");
    fprintf(stderr,
      "  -nostdout : do not redirect stdout/stderr, but let pbs accumulate it\n");
    fprintf(stderr,
      "  -comm (gm|mx|p4|ib|rai|pmi|lam|shmem|emp|none) : choose MPI (default %s)\n",
      comm_name(COMM_DEFAULT));
    fprintf(stderr,
      "    -mpich-p4-[no-]shmem : for MPICH/P4, specify if the library was\n"
      "                           compiled with shared memory support (default %s)\n", HAVE_P4_SHMEM ? "yes" : "no");
    fprintf(stderr,
      "  -pernode : allocate only one process per compute node\n");
    fprintf(stderr,
      "  -npernode <nprocs> : allocate no more than <nprocs> processes per compute node\n");
    fprintf(stderr,
      "  -nolocal : do not run any MPI processes on the local node\n");
    if (HAVE_SED)
	fprintf(stderr,
	  "  -transform-hostname[=]<sed expression> : use alternate names for MPI\n");
    fprintf(stderr,
      "  -transform-hostname-program[=]<executable> : use this script or program\n"
      "                                               to generate alternate names\n");
    fprintf(stderr,
      "  -tv : debug using totalview (ensure it is in your path)\n");
    fprintf(stderr,
      "  -kill : kill other processes if any one process exits\n");
    fprintf(stderr,
      "  -config[=]<file> : use heterogenous node specification file (\"-\" for stdin)\n");
    fprintf(stderr,
      "  -server : do not run any tasks, just serve other concurrent mpiexec clients\n");
    fprintf(stderr,
      "  -version : show version information\n");
    version(stderr);
#if 0  /* No this doesn't work, but if I don't check it in I'll forget where I put it.  */
    fprintf(stderr,
      "  -output <prefix> : send process output to separate files\n");
#endif
    exit(1);
}

static comm_t
parse_comm(const char *const s, const char *const where)
{
    growstr_t *g;

    if (HAVE_COMM_MPICH_GM)
	if (!strcasecmp(s, "gm") || !strcasecmp(s, "mpich-gm")
	 || !strcasecmp(s, "mpich/gm")
	 || !strcasecmp(s, "mx") || !strcasecmp(s, "mpich-mx")
	 || !strcasecmp(s, "mpich/mx"))
	    return COMM_MPICH_GM;
    if (HAVE_COMM_MPICH_P4)
	if (!strcasecmp(s, "p4") || !strcasecmp(s, "mpich-p4")
	 || !strcasecmp(s, "mpich/p4"))
	    return COMM_MPICH_P4;
    if (HAVE_COMM_MPICH_IB)
	if (!strcasecmp(s, "ib") || !strcasecmp(s, "mpich-ib")
	 || !strcasecmp(s, "mpich/ib") || !strcasecmp(s, "mvapich"))
	    return COMM_MPICH_IB;
    if (HAVE_COMM_MPICH_RAI)
	if (!strcasecmp(s, "rai") || !strcasecmp(s, "mpich-rai")
	 || !strcasecmp(s, "mpich/rai"))
	    return COMM_MPICH_RAI;
    if (HAVE_COMM_MPICH2_PMI)
	if (!strcasecmp(s, "mpich2") || !strcasecmp(s, "mpich2-pmi")
	 || !strcasecmp(s, "mpich2/pmi") || !strcasecmp(s, "pmi"))
	    return COMM_MPICH2_PMI;
    if (HAVE_COMM_LAM)
	if (!strcasecmp(s, "lam"))
	    return COMM_LAM;
    if (HAVE_COMM_SHMEM)
	if (!strcasecmp(s, "shmem"))
	    return COMM_SHMEM;
    if (HAVE_COMM_EMP)
	if (!strcasecmp(s, "emp"))
	    return COMM_EMP;
    if (HAVE_COMM_NONE)
	if (!strcasecmp(s, "none") || !strcasecmp(s, "no"))
	    return COMM_NONE;

    /* complain */
    g = growstr_init();
    growstr_append(g, "%s: unknown MPI library type \"%s\"");
    if (where)
	growstr_append(g, where);
    growstr_append(g, ".\n");
    growstr_append(g, "Available ones:");
    if (HAVE_COMM_MPICH_GM) growstr_printf(g, " %s", comm_name(COMM_MPICH_GM));
    if (HAVE_COMM_MPICH_P4) growstr_printf(g, " %s", comm_name(COMM_MPICH_P4));
    if (HAVE_COMM_MPICH_IB) growstr_printf(g, " %s", comm_name(COMM_MPICH_IB));
    if (HAVE_COMM_MPICH_RAI)
        growstr_printf(g, " %s", comm_name(COMM_MPICH_RAI));
    if (HAVE_COMM_MPICH2_PMI)
        growstr_printf(g, " %s", comm_name(COMM_MPICH2_PMI));
    if (HAVE_COMM_LAM)      growstr_printf(g, " %s", comm_name(COMM_LAM));
    if (HAVE_COMM_SHMEM)    growstr_printf(g, " %s", comm_name(COMM_SHMEM));
    if (HAVE_COMM_EMP)      growstr_printf(g, " %s", comm_name(COMM_EMP));
    if (HAVE_COMM_NONE)     growstr_printf(g, " %s", comm_name(COMM_NONE));
    growstr_printf(g, " (default %s)", comm_name(COMM_DEFAULT));
    error(g->s, __func__, s);

    /*NOTREACHED*/
    return COMM_UNSET;
}

/*
 * For highly flexible argument parsing, allow an option argument
 * to appear in many places.  The following are all equivalent:
 *   --np=3
 *   --np 3
 *   --np3
 * 
 * Note that the argument talked about here is not optional, it is a
 * required argument to an optional command-line option.
 */
static const char *
find_optarg(const char *cp, int *argcp, const char ***const argvp,
  const char *const which)
{
    /* argument could be in this one, or, if not, in the next arg */
    if (*cp) {
	/* optional = */
	if (*cp == '=')
	    ++cp;
    } else {
	if (++*argvp, --*argcp <= 0)
	    error("%s: option -%s requires an argument", __func__, which);
	cp = **argvp;
    }
    return cp;
}

#define MAX(a,b) ((a) > (b) ? (a) : (b))

/*
 * Chop up the mpiexec args, and return argc/argv which is the
 * parallel code to run, with its args only.
 */
static void
parse_args(int *argcp, const char ***argvp)
{
    int argc = *argcp;
    const char **argv = *argvp;
    int len;
    const char *cp, *cq;
    char *cr;

    /*
     * Look for arguments, which must come before exec and args.
     */
    cl_args = Malloc(sizeof(*cl_args));
    memset(cl_args, 0, sizeof(*cl_args));
    cl_args->which_stdin = STDIN_UNSET;
    cl_args->comm = COMM_UNSET;
    cl_args->mpich_p4_shmem = -1;

    while (++argv, --argc > 0) {
	cp = *argv;
	if (*cp++ != '-') break;
	if (*cp == '-')  ++cp;  /* optional second "-" */
	if ((cq = strchr(cp, '=')))  /* maybe optional = */
	    len = cq - cp;
	else
	    len = strlen(cp);
	if (len < 1)
	    usage();
	if (!strncmp(cp, "nostdout", MAX(6,len)))
	    cl_args->nostdout = 1;
	else if (!strncmp(cp, "nostdin", MAX(6,len))) {
	    if (cl_args->which_stdin == STDIN_ALL)
		error("arguments -nostdin and -allstdin conflict");
	    cl_args->which_stdin = STDIN_NONE;
	} else if (!strncmp(cp, "allstdin", len)) {
	    if (cl_args->which_stdin == STDIN_NONE)
		error("arguments -nostdin and -allstdin conflict");
	    cl_args->which_stdin = STDIN_ALL;
	} else if (!strncmp(cp, "pernode", MAX(1,len)))
	    cl_args->pernode = 1;
	else if (!strncmp(cp, "npernode", MAX(3,len))) {
	    long l;
	    cp += len;
	    cp = find_optarg(cp, &argc, &argv, "npernode");
	    l = strtol(cp, &cr, 10);
	    if (*cr || l <= 0)
		error("argument -npernode requires positive number of processes");
	    cl_args->pernode = l;
	} else if (!strncmp(cp, "nolocal", MAX(3,len)))
	    cl_args->nolocal = 1;
	else if (HAVE_COMM_MPICH_GM && !strncmp(cp, "no-shmem", MAX(2,len)))
	    warning("-no-shmem ignored, use GMPI_SHMEM=0 or MX_DISABLE_SHMEM=1"
	            " environment variable");
	/* keep this after other "n..." items */
	else if (!strncmp(cp, "np", len)) {
	    long l;
	    cp += len;
	    cp = find_optarg(cp, &argc, &argv, "n");
	    l = strtol(cp, &cr, 10);  /* negative value to strtoul is legal! */
	    if (*cr || l <= 0)
		error("argument -n requires positive integral number of nodes");
	    cl_args->numproc = l;
	} else if (!strcmp(cp, "tv") || !strncmp(cp, "totalview", MAX(2,len)))
	    cl_args->tview = 1;
	else if (!strncmp(cp, "config", MAX(3,len))) {
	    cp += MAX(3,len);
	    cl_args->config_file = find_optarg(cp, &argc, &argv, "config");
	} else if (!strncmp(cp, "kill", len))
	    cl_args->kill_others = 1;
	else if (!strncmp(cp, "version", MAX(4,len))) {
	    version(stdout);
	    exit(0);
	/* keep this after other "v..." items; allows old style -v */
	} else if (!strncmp(cp, "verbose", len))
	    ++cl_args->verbose;
	else if (HAVE_SED && !strncmp(cp, "transform-hostname", MAX(2,len))) {
	    cp += MAX(2,len);
	    cl_args->transform_hostname = find_optarg(cp, &argc, &argv,
	      "transform-hostname");
	} else if (HAVE_SED && !strncmp(cp, "gige", len))
	    cl_args->transform_hostname = "s/node/gige/";
	else if (!strncmp(cp, "transform-hostname-program", MAX(20,len))) {
	    cp += MAX(20,len);
	    cl_args->transform_hostname_program = find_optarg(cp, &argc, &argv,
	      "transform-hostname-program");
	} else if (!strncmp(cp, "comm", MAX(3,len))) {
	    if (cl_args->comm != COMM_UNSET)
		error("only choose one communication library");
	    cp += MAX(3,len);
	    cl_args->comm = parse_comm(
	      find_optarg(cp, &argc, &argv, "comm"), 0);
#if 0
	} else if (!strncmp(cp, "output", MAX(1,len))) {
	    cp += MAX(1,len);
	    cl_args->process_output = find_optarg(cp, &argc, &argv, "output");
#endif
	} else if (!strncmp(cp, "mpich-p4-shmem", len) && len == 14)
	    cl_args->mpich_p4_shmem = 1;
	else if (!strncmp(cp, "mpich-p4-no-shmem", len) && len == 17)
	    cl_args->mpich_p4_shmem = 0;
	else if (!strncmp(cp, "server", len))
	    cl_args->server_only = 1;
	else usage();
    }

    /*
     * A bunch of sanity checks.  Not all options are compatible with
     * each other, or the compile-time configuration.
     */
    if (cl_args->server_only) {
	/* many arguments do not make sense here, try to catch lots */
	if (cl_args->which_stdin != STDIN_UNSET
	 || cl_args->nostdout != 0)
	    error("%s: cannot use stdin/stdout arguments with -server",
	      __func__);
	if (cl_args->comm != COMM_UNSET)
	    error("%s: cannot use -comm argument with -server", __func__);
	if (cl_args->pernode)
	    error("%s: cannot use -pernode argument with -server", __func__);
	if (cl_args->nolocal)
	    error("%s: cannot use -nolocal argument with -server", __func__);
	if (cl_args->transform_hostname)
	    error("%s: cannot use -transform_hostname argument with -server",
	      __func__);
	if (cl_args->tview)
	    error("%s: cannot use -totalview argument with -server", __func__);
	if (cl_args->kill_others)
	    error("%s: cannot use -kill argument with -server", __func__);
	if (cl_args->config_file)
	    error("%s: cannot use -config argument with -server", __func__);
    }

    if (cl_args->which_stdin == STDIN_UNSET)
	cl_args->which_stdin = STDIN_ONE;  /* the default, just proc #0 */

    if (cl_args->comm == COMM_UNSET) {
	/*
	 * Accept setting from environment if none given on command line,
	 * else fall to compiled-in default.
	 */
	const char *comm_env = getenv(MPIEXEC_COMM_ENV);
	if (comm_env) {
	    growstr_t *g = growstr_init();
	    growstr_printf(g, "\n  in environment variable \"%s\"",
	      MPIEXEC_COMM_ENV);
	    cl_args->comm = parse_comm(comm_env, g->s);
	    growstr_free(g);
	} else
	    cl_args->comm = COMM_DEFAULT;
    }

    if (cl_args->mpich_p4_shmem == -1) {
	if (cl_args->comm == COMM_MPICH_P4)
	    cl_args->mpich_p4_shmem = HAVE_P4_SHMEM;  /* configure default */
    } else {
	if (cl_args->comm != COMM_MPICH_P4)
	    warning("%s: argument \"-mpich-p4-[no-]shmem\" ignored since\n"
	      "  communication library not MPICH/P4", __func__);
    }

    if (cl_args->config_file && !strcmp(cl_args->config_file, "-"))
	if (cl_args->which_stdin != STDIN_NONE) {
	    warning("reading the config file from stdin forces -nostdin");
	    cl_args->which_stdin = STDIN_NONE;
	}

    if (cl_args->transform_hostname && cl_args->transform_hostname_program)
	error("-transform-hostname and -transform-hostname-program conflict");

#if 0
    if (cl_args->process_output && cl_args->nostdout)
        warning("-output ignored since -nostdout specified");
#endif

    /*
     * Get full path to executable given on command line, resolved using
     * current PATH setting
     */
    if (cl_args->config_file) {
	if (argc != 0)
	    error("%s: extra command-line arguments with -config flag",
	      __func__);
    } else if (cl_args->server_only) {
	if (argc != 0)
	    error("%s: extra command-line arguments with -server flag",
	      __func__);
    } else {
	if (argc < 1)
	    usage();
	argv[0] = resolve_exe(argv[0], 0);
    }
    *argcp = argc;
    *argvp = argv;
}

static void show_exit_statuses(void);

int
main(int argc, const char *argv[])
{
    int i, j, jmp_return, ret;
    struct passwd *pswd_tmp;

    set_progname(argc, argv);
    parse_args(&argc, &argv);
    stdio_notice_streams();

    jobid = getenv("PBS_JOBID");
    if (!jobid)
	error("PBS_JOBID not set in environment.  Code must be run from a\n"
	      "  PBS script, perhaps interactively using \"qsub -I\"");

    /* copy the static passwd struct, since tm_ calls will overwrite it */
    pswd_tmp = getpwuid(getuid());
    if (!pswd_tmp)
	error("%s: no passwd entry for uid %d", __func__, (int) getuid());
    pswd = Malloc(sizeof(*pswd));
    memcpy(pswd, pswd_tmp, sizeof(*pswd));

    /* see if there is a master socket in the case of concurrent mpiexec */
    concurrent_init();

    if (cl_args->server_only && !concurrent_master)
	error("%s: not concurrent master yet -server flag specified", __func__);

    /*
     * Reset signals to a sane state, in case we were spawned weirdly.  Saw
     * a wrapper script that sets SIGCHLD to SIG_IGN, but this won't fly
     * when the popen() in PBSD_authenticate forks and waits.
     */
    {
	const int siglist[] = { SIGHUP, SIGTERM, SIGQUIT, SIGCHLD };
	handle_signals(siglist, list_count(siglist), SIG_DFL);
    }

    /* get taskids from tm, then hostnames from pbs */
    if (concurrent_master)
	get_hosts();
    else
	concurrent_get_nodes();

    if (cl_args->verbose)
	for (i=0; i<numtasks; i++)
	    printf("node %2d: name %s, cpu avail %u\n", i,
	      nodes[i].name, nodes[i].availcpu);

#if 0
    {
	/* debug, cannot strace until after setuid pbs stuff done
	 * in get_hosts(). */
	char s[256];
	sprintf(s, "strace -tt -T -vFf -s 2000 -o o%d -p %d > /dev/null 2>&1 &",
	  getpid(), getpid());
	system(s);
	sleep(1);  /* wait for attach */
    }
#endif

    if (cl_args->server_only) {
	cm_permit_new_clients(1);
	handle_signals(0, 0, killall);
	numtasks = 0;
	numspawns = 0;
	numspawned = 0;
	pipe_with_stdio = -1;
	goto server_only;
    }

    /*
     * Now look at the command-line constraints.
     */
    constrain_nodes();

    /*
     * Finally map the config file requirements onto the available tasks,
     * or for command-line, let them all do the same thing.
     */
    if (cl_args->config_file)
	parse_config();
    else
	argcv_config(argc, argv);

    /* 
     * Identify nodes with multiple identical jobs and squeeze them
     * down since mpich/p4 (and shmem) will use shmem on the same node.
     * This reduces numtasks since we have fewer tasks to spawn.
     */
    tasks_shmem_reduce();

    /*
     * Build the initial spawn group.
     */
    numspawns = 1;
    spawns = Malloc(numspawns * sizeof(*spawns));
    spawns[0].task_start = 0;
    spawns[0].task_end = numtasks;
    /* this indirection is to keep the obit pointers constant while
     * we move around the tasks array */
    spawns[0].obits = Malloc(numtasks * sizeof(*spawns[0].obits));
    spawns[0].ranks2hosts_response = NULL;
    for (i=0; i<numtasks; i++)
	tasks[i].status = &spawns[0].obits[i];

    /*
     * Count the nodes as allocated
     */
    if (concurrent_master) {
	for (i=0; i<numtasks; i++) {
	    nodes[tasks[i].node].cm_availcpu -= tasks[i].num_copies;
	    for (j=0; j<tasks[i].num_copies; j++)
		nodes[tasks[i].node].cm_cpu_free[tasks[i].cpu_index[j]] = 1;
	}
	cm_permit_new_clients(1);
    } else
	concurrent_node_alloc();

    /*
     * Figure out who I am for MPI startup.
     * 
     * Many MPI libs use some out-of-band startup protocol that runs over
     * the same communication fabric that PBS uses, some sort of standard
     * ethernet usually.  Mpiexec serves as the focal point of that protocol
     * and thus will create a listening socket bound to the IP of the
     * hostname used by the other tasks as they try to connect to their
     * startup host.
     *
     * We don't allow for mpiexec to be run from any node other than the
     * mother superior, so this is always nodes[0] that fills this role.
     *
     * Hope that the DNS lookup for our hostname returns only one IP, since
     * we only use the first in the list.
     */
    if (cl_args->comm == COMM_MPICH_GM || cl_args->comm == COMM_MPICH_IB
      || cl_args->comm == COMM_MPICH_P4 || cl_args->comm == COMM_MPICH2_PMI
      || cl_args->comm == COMM_MPICH_RAI) {
	struct hostent *he;

	he = gethostbyname(nodes[0].name);
	if (!he)
	    error("%s: gethostbyname cannot find my name %s", __func__,
	      nodes[0].name);
	if (he->h_length != sizeof(myaddr.sin_addr))
	    error("%s: gethostbyname returns %d-byte addresses, expecting %d",
	      __func__, he->h_length, (int) sizeof(myaddr.sin_addr));
	myaddr.sin_family = (unsigned short) he->h_addrtype;
	memcpy(&myaddr.sin_addr, he->h_addr_list[0], sizeof(myaddr.sin_addr));
    }

    /*
     * Run the tasks and wait for them to finish.  Use of setjmp is
     * to avoid complex shutdown activity in the signal handler.
     */
  server_only:
    jmp_return = setjmp(jmp_env);
    switch (jmp_return) {
        case 0:
	    if (cl_args->server_only) {
		cm_serve_clients();
		concurrent_exit();   /* not reached, wait for ctrl-c path */
	    } else {
		/* return value ignored; it changes cs->exe if okay */
		distribute_executable();
		ret = start_tasks(0);
		if (ret)
		    kill_tasks(SIGTERM);
		wait_tasks();
	    }
	    break;

	/* reached by setjmp return likely */
	case 1:
	    kill_tasks(killall_sig);
	    if (concurrent_master) {
		int num_clients_killed;

		cm_permit_new_clients(0);  /* no new connections */
		num_clients_killed = cm_kill_clients();
		/*
		 * If no clients were connected, hand out exit-status 0 as
		 * the only way a server-only master can exit is with a signal
		 * and it may have been intended.
		 */
		if (num_clients_killed == 0 && cl_args->server_only)
		    jmp_return = 0;
	    }
	    wait_tasks();
	    /* if master, we will end up waiting for client events/tasks */
	    break;

	/* second ctrl-c, don't try to communicate with anything, just die */
	default:
	    handle_signals(0, 0, SIG_DFL);
	    break;
    }

    /* deallocate our tasks */
    if (concurrent_master)
	for (i=0; i<numtasks; i++) {
	    nodes[tasks[i].node].cm_availcpu += tasks[i].num_copies;
	    for (j=0; j<tasks[i].num_copies; j++)
		nodes[tasks[i].node].cm_cpu_free[tasks[i].cpu_index[j]] = 0;
	}

    /*
     * Tell the stdio thread to exit, show status of finished tasks, disconnect
     * from other mpiexecs.  Nothing here waits.
     */
    kill_stdio();
    show_exit_statuses();
    if (concurrent_master)
	cm_serve_clients();
    handle_signals(0, 0, SIG_DFL);
    concurrent_exit();

    free(cl_args);

    if (numtasks)
	return *tasks[0].status;
    else
	return jmp_return;
}

/*
 * Attempt to shrink output if lots of tasks died for the same
 * reason.  Most of the code below just implements some auto-
 * growing arrays.  Could extentd growstr to be generic, but
 * by then might as well switch to C++ and use some templated
 * class library.
 */
static void
show_exit_statuses(void)
{
    struct {
	int status;
	done_how_t done;
	int *tasks;
	int numtasks;
	int maxtasks;
    } *slist = 0;
    int numslist = 0;
    int maxslist = 0;
    int i, j;
    growstr_t *g = growstr_init();

    /* assume okay if we didn't actually get an exit status; and null out
     * the statuses that were never filled */
    for (i=0; i<numtasks; i++) {
	/* if we died, abandoning tasks, perhaps from lots of ctrl-c, do not
	 * report any exit status */
	/* do not report anything if we never got an exit status */
	if (tasks[i].done == DONE_NOT) {
	    debug(2, "%s: task %d was not done", __func__, i);
	    tasks[i].done = DONE_OK;
	    *tasks[i].status = 0;
	}
	if (tasks[i].done == DONE_NO_EXIT_STATUS) {
	    debug(2, "%s: task %d had no exit status", __func__, i);
	    tasks[i].done = DONE_OK;
	    *tasks[i].status = 0;
	}
	/* null out the statuses that were never filled */
	if (tasks[i].done == DONE_NOT_STARTED) {
	    *tasks[i].status = 0;
	}
    }

    for (i=0; i<numtasks; i++) {
	if (*tasks[i].status == 0 && tasks[i].done == DONE_OK)
	    continue;
	for (j=0; j<numslist; j++)
	    if (slist[j].status == *tasks[i].status
	     && slist[j].done == tasks[i].done)
		break;
	if (j == numslist) {
	    if (numslist == maxslist) {
		void *x = slist;
		maxslist += 10;
		slist = Malloc(maxslist * sizeof(*slist));
		if (x) {
		    memcpy(slist, x, numslist * sizeof(*slist));
		    free(x);
		}
	    }
	    slist[j].status = *tasks[i].status;
	    slist[j].done = tasks[i].done;
	    slist[j].tasks = 0;
	    slist[j].numtasks = 0;
	    slist[j].maxtasks = 0;
	    ++numslist;
	}
	if (slist[j].numtasks == slist[j].maxtasks) {
	    void *x = slist[j].tasks;
	    slist[j].maxtasks += 10;
	    slist[j].tasks = Malloc(slist[j].maxtasks
	      * sizeof(*slist[j].tasks));
	    if (x) {
		memcpy(slist[j].tasks, x, slist[j].numtasks
		  * sizeof(*slist[j].tasks));
		free(x);
	    }
	}
	slist[j].tasks[slist[j].numtasks] = i;
	++slist[j].numtasks;
    }

    for (j=0; j<numslist; j++) {
	int inrange, needcomma, needdash;

	growstr_zero(g);
	growstr_printf(g, "task%s ", slist[j].numtasks > 1 ? "s" : "");

	inrange = -1;
	needcomma = 0;
	needdash = 0;
	for (i=0; i<slist[j].numtasks; i++) {
	    if (inrange >= 0) {
		int rangeok = 0;
		if (slist[j].tasks[i] == inrange + 1) {
		    ++inrange;
		    rangeok = 1;
		    needdash = 1;
		    if (i < slist[j].numtasks - 1)
			continue;  /* else fall through and terminate */
		}
		if (needdash) {
		    growstr_printf(g, "-%d", inrange);
		    inrange = -1;
		    needdash = 0;
		}
		if (rangeok == 1)
		    continue;  /* else fall to do the next range */
	    }
	    if (needcomma)
		growstr_append(g, ",");
	    growstr_printf(g, "%d", slist[j].tasks[i]);
	    inrange = slist[j].tasks[i];
	    needcomma = 1;
	}

	/* DONE_NOT cannot happen, hopefully */
	if (slist[j].done == DONE_STARTUP_INCOMPLETE)
	    growstr_append(g, " exited before completing MPI startup");
	else if (slist[j].done == DONE_NOT_STARTED)
	    growstr_printf(g, " %s never spawned due to earlier errors",
		           slist[j].numtasks > 1 ? "were" : "was");
	else if (slist[j].status >= PBS_SIG_OFFSET) {
	    int sig = slist[j].status - PBS_SIG_OFFSET;
	    growstr_printf(g, " died with signal %d (%s)",
	      sig, parse_signal_number(sig));
	} else if (slist[j].status != 0) {
	    growstr_printf(g, " exited with status %d", slist[j].status);
	} else {
	    growstr_printf(g, " exited oddly---report bug: status %d done %d",
	                   slist[j].status, slist[j].done);
	}
	warning(g->s);
    }
    growstr_free(g);
}



syntax highlighted by Code2HTML, v. 0.9.1