diff -ruN --exclude='.nfs*' torque-1.1.0p0/README.mpiexec torque-1.1.0p0-bp/README.mpiexec --- torque-1.1.0p0/README.mpiexec 1970-01-01 10:00:00.000000000 +1000 +++ torque-1.1.0p0-bp/README.mpiexec 2004-08-19 16:41:56.000000000 +1000 @@ -0,0 +1,24 @@ + +Documentation of changes applied as part of the mpiexec patch. +http://www.osc.edu/~pw/mpiexec/ +Last changed 04 Dec 2001. + + +General bug fix: + + src/lib/Libifl/tm.c + + Do not call DIS_tcp_setup(-1) if connection to the local mom failed + after 5 tries. + +Functionality additions: + + src/resmom/start_exec.c + + Added extension to get stdin/out/err files from the environment rather + than always using /dev/null during tm_spawn. + + src/cmds/pbs_demux.c + + Never expect a PBS_JOBCOOKIE to be delivered from a process. + diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/cmds/pbs_demux.c torque-1.1.0p0-bp/src/cmds/pbs_demux.c --- torque-1.1.0p0/src/cmds/pbs_demux.c 2004-06-03 04:54:43.000000000 +1000 +++ torque-1.1.0p0-bp/src/cmds/pbs_demux.c 2004-08-19 16:41:56.000000000 +1000 @@ -106,10 +106,8 @@ struct routem { enum rwhere r_where; short r_nl; - short r_first; }; fd_set readset; -char *cookie = 0; void readit(int sock, struct routem *prm) @@ -127,18 +125,6 @@ i = 0; if ((amt = read(sock, buf, 256)) > 0) { - if (prm->r_first == 1) { - - /* first data on connection must be the cookie to validate it */ - - i = strlen(cookie); - if (strncmp(buf, cookie, i) != 0) { - (void)close(sock); - prm->r_where = invalid; - FD_CLR(sock, &readset); - } - prm->r_first = 0; - } for (pc = buf+i; pc < buf+amt; ++pc) { #ifdef DEBUG if (prm->r_nl) { @@ -176,22 +162,12 @@ parent = getppid(); - cookie = getenv("PBS_JOBCOOKIE"); - if (cookie == 0) { - fprintf(stderr, "%s: no PBS_JOBCOOKIE found in the env\n", - argv[0]); - exit(3); - } -#ifdef DEBUG - printf("Cookie found in environment: %s\n", cookie); -#endif maxfd = sysconf(_SC_OPEN_MAX); routem = (struct routem *)malloc(maxfd*sizeof(struct routem)); for (i=0; ir_where = invalid; (routem+i)->r_nl = 1; - (routem+i)->r_first = 0; } (routem+main_sock_out)->r_where = new_out; (routem+main_sock_err)->r_where = new_err; @@ -244,7 +220,6 @@ newsock = accept(i, 0, 0); (routem+newsock)->r_where = (routem+i)->r_where== new_out ? old_out : old_err; FD_SET(newsock, &readset); - (routem+newsock)->r_first = 1; break; case old_out: case old_err: diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/lib/Libifl/tm.c torque-1.1.0p0-bp/src/lib/Libifl/tm.c --- torque-1.1.0p0/src/lib/Libifl/tm.c 2004-06-03 04:54:44.000000000 +1000 +++ torque-1.1.0p0-bp/src/lib/Libifl/tm.c 2004-08-19 16:41:56.000000000 +1000 @@ -458,7 +458,8 @@ } } - DIS_tcp_setup(local_conn); + if (local_conn >= 0) + DIS_tcp_setup(local_conn); return (local_conn); } diff -ruN --exclude='.nfs*' torque-1.1.0p0/src/resmom/start_exec.c torque-1.1.0p0-bp/src/resmom/start_exec.c --- torque-1.1.0p0/src/resmom/start_exec.c 2004-06-03 04:54:44.000000000 +1000 +++ torque-1.1.0p0-bp/src/resmom/start_exec.c 2004-08-19 17:12:48.000000000 +1000 @@ -1858,6 +1858,42 @@ /* + * Look for a certain environment variable which has a port# which should + * be opened on the MS to establish communication for one of the 3 stdio + * streams. >=0 return is that valid fd, -1 means no env var found, + * -2 means malformed env value or failure to connect. + */ +static int +search_env_and_open(const char *envname, u_long ipaddr) +{ + static char *id = "search_env_and_open"; + int i, len = strlen(envname); + + for (i=0; i 0) - close(fd); + (void)dup2(fd0, 0); + if (fd0 > 0) + (void)close(fd0); } + /* look through env for a port# on MS we should use for stdout/err */ + if ((fd1 = search_env_and_open("MPIEXEC_STDOUT_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if ((fd2 = search_env_and_open("MPIEXEC_STDERR_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if (pjob->ji_numnodes > 1) { /* ** Open sockets to demux proc for stdout and stderr. */ - if ((fd = open_demux(ipaddr,pjob->ji_stdout)) == -1) + if (fd1 < 0 && (fd1 = open_demux(ipaddr,pjob->ji_stdout)) == -1) { starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr); @@ -2292,12 +2337,12 @@ exit(1); } - dup2(fd,1); + dup2(fd1,1); - if (fd > 1) - close(fd); + if (fd1 > 1) + close(fd1); - if ((fd = open_demux(ipaddr,pjob->ji_stderr)) == -1) + if (fd2 < 0 && (fd2 = open_demux(ipaddr,pjob->ji_stderr)) == -1) { starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr); @@ -2306,44 +2351,70 @@ exit(1); } - dup2(fd,2); + dup2(fd2,2); - if (fd > 2) - close(fd); + if (fd2 > 2) + close(fd2); - write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str, - strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str)); - - write(2,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str, - strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str)); } else if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) && (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) { /* interactive job, single node, write to pty */ - if ((pts = open_pty(pjob)) < 0) - { - log_err(errno,id,"cannot open slave"); + pts = -1; + if (fd1 < 0 || fd2 < 0) { + if ((pts = open_pty(pjob)) < 0) + { + log_err(errno,id,"cannot open slave"); - starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr); - } + starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr); + } + + if (fd1 < 0) + fd1 = pts; + if (fd2 < 0) + fd2 = pts; - dup2(pts,1); - dup2(pts,2); + } + + (void)dup2(fd1, 1); + (void)dup2(fd2, 2); + if (fd1 != pts) + (void) close(fd1); + if (fd2 != pts) + (void) close(fd2); } else { /* normal batch job, single node, write straight to files */ - if (open_std_out_err(pjob) == -1) - { - log_err(errno,id,"cannot open stderr/stdout"); + pts = -1; + if (fd1 < 0 || fd2 < 0) { - starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); - } + if (open_std_out_err(pjob) == -1) + { + log_err(errno,id,"cannot open stderr/stdout"); + + starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); + } } + if (fd1 >= 0) { + (void) close(1); + (void) dup2(fd1, 1); + if (fd1 > 1) + (void) close(fd1); + } + if (fd2 >= 0) { + (void) close(2); + (void) dup2(fd2, 2); + if (fd2 > 2) + (void) close(fd2); + } + + } + log_close(0); starter_return(