diff -ruN pbs-2.2.11-stock/README.pw pbs-2.2.11/README.pw --- pbs-2.2.11-stock/README.pw Wed Dec 31 19:00:00 1969 +++ pbs-2.2.11/README.pw Fri Dec 22 12:13:08 2000 @@ -0,0 +1,35 @@ +#!/bin/bash +# +# Call configure with the right stuff used locally. +# +./configure \ + --prefix=/usr/local/pbs-2.2.11 \ + --set-server-home=/var/spool/pbs +# now do: make -j3 MAKE="make -j3" + +# +# Documentation of changes applied as part of the mpiexec patch. +# +## +# src/include/tm.h +# Fixed misplaced "typedef" in struct tm_whattodo.h:89. +# +# src/lib/Libifl/rpp.c +# Avoid EINTR in recvfrom by looping it, else bogus error result. +# Incrcease rpp retry count. +# +# src/lib/Libifl/tm.c +# Do not call DIS_tcp_setup(-1) if connection to the local mom failed +# after 5 tries. +# +# src/resmom/mom_comm.c +# Fix off-by-one error in saving of envp during TM_SPAWN processing. Avoid +# some coredumps due to corruption elsewhere. +# +# src/resmom/start_exec.c +# Added extension to get stdin/out/err files from the environment rather than +# always using /dev/null during tm_spawn. +# +# src/include/net_connect.h +# Increase max connection idle time. +# diff -ruN pbs-2.2.11-stock/src/include/net_connect.h pbs-2.2.11/src/include/net_connect.h --- pbs-2.2.11-stock/src/include/net_connect.h Mon Oct 18 17:22:01 1999 +++ pbs-2.2.11/src/include/net_connect.h Mon Dec 18 15:44:12 2000 @@ -62,7 +62,8 @@ #define PBS_NET_TYPE #endif -#define PBS_NET_MAXCONNECTIDLE 900 +/* for use with mpiexec, connection may last quite a bit longer than 15 min */ +#define PBS_NET_MAXCONNECTIDLE 2592000 #define PBS_NET_CONN_AUTHENTICATED 1 #define PBS_NET_CONN_FROM_PRIVIL 2 diff -ruN pbs-2.2.11-stock/src/include/tm.h pbs-2.2.11/src/include/tm.h --- pbs-2.2.11-stock/src/include/tm.h Wed Jul 21 18:56:43 1999 +++ pbs-2.2.11/src/include/tm.h Mon Dec 18 15:44:12 2000 @@ -82,11 +82,11 @@ ** The tm_whattodo structure contains data for the last ** tm_register event polled. This is not implemented yet. */ -struct tm_whattodo { +typedef struct tm_whattodo { int tm_todo; tm_task_id tm_what; tm_node_id tm_where; -} typedef tm_whattodo_t; +} tm_whattodo_t; /* ** Prototypes for all the TM API calls. diff -ruN pbs-2.2.11-stock/src/lib/Libifl/rpp.c pbs-2.2.11/src/lib/Libifl/rpp.c --- pbs-2.2.11-stock/src/lib/Libifl/rpp.c Fri Jul 23 16:58:57 1999 +++ pbs-2.2.11/src/lib/Libifl/rpp.c Mon Dec 18 15:44:12 2000 @@ -148,7 +148,7 @@ /*every RPP_TIMEOUT seconds */ -#define RPP_RETRY 20 /*after this many sendto attempts on a */ +#define RPP_RETRY 200 /*after this many sendto attempts on a */ /*packet, give up and declare the stream */ /*to be in the "stale" state */ @@ -1154,8 +1154,13 @@ assert(data != NULL); flen = sizeof(struct sockaddr_in); - len = recvfrom(fd, data, RPP_PKT_SIZE, 0, - (struct sockaddr *)&addr, &flen); + /* + * loop added to avoid hanging up due to receipt of a signal --pw + */ + do { + len = recvfrom(fd, data, RPP_PKT_SIZE, 0, + (struct sockaddr *)&addr, &flen); + } while (len == -1 && errno == EINTR); if (len == -1) { free(data); if (errno == EWOULDBLOCK || diff -ruN pbs-2.2.11-stock/src/lib/Libifl/tm.c pbs-2.2.11/src/lib/Libifl/tm.c --- pbs-2.2.11-stock/src/lib/Libifl/tm.c Fri Apr 16 15:08:19 1999 +++ pbs-2.2.11/src/lib/Libifl/tm.c Mon Dec 18 15:44:12 2000 @@ -411,7 +411,9 @@ } } - DIS_tcp_setup(local_conn); + if (local_conn >= 0) + DIS_tcp_setup(local_conn); + return (local_conn); } diff -ruN pbs-2.2.11-stock/src/resmom/mom_comm.c pbs-2.2.11/src/resmom/mom_comm.c --- pbs-2.2.11-stock/src/resmom/mom_comm.c Thu Feb 17 17:56:51 2000 +++ pbs-2.2.11/src/resmom/mom_comm.c Mon Dec 18 15:44:12 2000 @@ -537,6 +537,19 @@ stream_addr = rpp_getaddr(stream); node_addr = rpp_getaddr(hp->hn_stream); + /* + * these might become corrupt, avoid coredump --pw + */ + if (!stream_addr) { + sprintf(log_buffer, "find_node: stream id %d does not exist", + stream); + log_err(-1, id, log_buffer); + } + if (!node_addr) { + sprintf(log_buffer, "find_node: node id %d does not exist", + hp->hn_stream); + log_err(-1, id, log_buffer); + } if (memcmp(&stream_addr->sin_addr, &node_addr->sin_addr, sizeof(node_addr->sin_addr)) != 0) { sprintf(log_buffer, @@ -2581,7 +2594,10 @@ free(env); break; } - if (i == numele) { + /* need to remember extra slot for NULL at + * the end --pw + */ + if (i == numele-1) { numele *= 2; envp = (char **)realloc(envp, numele*sizeof(char **)); diff -ruN pbs-2.2.11-stock/src/resmom/start_exec.c pbs-2.2.11/src/resmom/start_exec.c --- pbs-2.2.11-stock/src/resmom/start_exec.c Thu Feb 17 17:56:52 2000 +++ pbs-2.2.11/src/resmom/start_exec.c Fri Dec 22 12:12:05 2000 @@ -1360,6 +1360,42 @@ } /* + * Look for a certain environment variable which has a port# which should + * be opened on the MS to establish communication for one of the 3 stdio + * streams. >=0 return is that valid fd, -1 means no env var found, + * -2 means malformed env value or failure to connect. + */ +static int +search_env_and_open(const char *envname, u_long ipaddr) +{ + static char *id = "search_env_and_open"; + int i, len = strlen(envname); + + for (i=0; i 0) - (void)close(fd); + (void)dup2(fd0, 0); + if (fd0 > 0) + (void)close(fd0); } + /* look through env for a port# on MS we should use for stdout/err */ + if ((fd1 = search_env_and_open("MPIEXEC_STDOUT_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if ((fd2 = search_env_and_open("MPIEXEC_STDERR_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if (pjob->ji_numnodes > 1) { /* ** Open sockets to demux proc for stdout and stderr. */ - if ((fd = open_demux(ipaddr, pjob->ji_stdout)) == -1) + if (fd1 < 0 && (fd1 = open_demux(ipaddr,pjob->ji_stdout)) == -1) starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); - (void)dup2(fd, 1); - if (fd > 1) - (void)close(fd); - if ((fd = open_demux(ipaddr, pjob->ji_stderr)) == -1) + (void)dup2(fd1, 1); + if (fd1 > 1) + (void)close(fd1); + if (fd2 < 0 && (fd2 = open_demux(ipaddr,pjob->ji_stderr)) == -1) starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); - (void)dup2(fd, 2); - if (fd > 2) - (void)close(fd); + (void)dup2(fd2, 2); + if (fd2 > 2) + (void)close(fd2); (void)write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str, strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str)); @@ -1631,17 +1677,41 @@ (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) { /* interactive job, single node, write to pty */ - if ((pts = open_pty(pjob)) < 0) { + pts = -1; + if (fd1 < 0 || fd2 < 0) { + if ((pts = open_pty(pjob)) < 0) { log_err(errno, id,"cannot open slave"); starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); + } + if (fd1 < 0) + fd1 = pts; + if (fd2 < 0) + fd2 = pts; } - (void)dup2(pts, 1); - (void)dup2(pts, 2); - + (void)dup2(fd1, 1); + (void)dup2(fd2, 2); + if (fd1 != pts) + (void) close(fd1); + if (fd2 != pts) + (void) close(fd2); } else { /* normal batch job, single node, write straight to files */ - if (open_std_out_err(pjob) == -1) { + pts = -1; + if (fd1 < 0 || fd2 < 0) { + if (open_std_out_err(pjob) == -1) starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); + } + if (fd1 >= 0) { + (void) close(1); + (void) dup2(fd1, 1); + if (fd1 > 1) + (void) close(fd1); + } + if (fd2 >= 0) { + (void) close(2); + (void) dup2(fd2, 2); + if (fd2 > 2) + (void) close(fd2); } }