#!/usr/bin/perl -w # # Run some tests. # # Copyright (C) 2000-6 Pete Wyckoff # # $Id: runtests.pl 388 2006-11-27 17:09:48Z pw $ # use strict qw(subs refs); use English; # # If "qsub" is not your qsub program, change $qsub to include the explicit # path. You can also add other qsub options here too. # $qsub = "qsub"; # # If the backend nodes don't mount this build directory, or they mount # it under a different name, you'll have to change $dir to suit them. # If the directory is not available, copy the mpiexec and hello codes # somewhere the compute nodes can get to them, and point $dir there. # $dir = `pwd`; chomp($dir); # # This script assumes that it will be able to get $available_nodes nodes and # $smpsize processors per node from pbs during the testing phase. Change # these numbers if this won't work for your site. # $available_nodes = 4; $smpsize = 2; # # If your PBS configuration has time-shared nodes (generally only for largish # SMPs), you should set this to 1 which tells the script to request, e.g. # "ncpus=2" instead of "nodes=4:ppn=2". In this case, $available_nodes above # is ignored, only 1 node is used. Node properties (below) are also ignored # when this is on. # #$use_ncpus = 1; # # Any special node properties? This string will get appended to the # nodes line, separated by a colon, e.g.: -l nodes=2:ppn=2:amd # If your PBS scheduler is clever and tries to run single-node jobs # on hardware that does not have a fancy network card, the job may fail # when it tries to open the network card; add the right property here. # #$nodeprops = "amd"; #$nodeprops = "myrinet"; #$nodeprops = "ib"; #$nodeprops = "gige"; # # If, for testing purposes, you want to pass some arguments to mpiexec # every time, put them here. Perhaps specify a communication library # that is different from the compiled-in default. # #$mpcommargs = "--comm=pmi"; # # Similarly, if you need to set up your environment or load modules or # whatever, do it in this line that will be run immediately before the # mpiexec test line, once in every batch script. # #$mpenvsetup = "mod load mpich-p4"; # # If you know that it is not possible in your configuration to # use your default shell, so you want a permanent "#PBS -S /bin/sh" # line in every script, put it here. This also disables the # various shell tests. Set it to "" to use the user's default shell # and still disable testings all the various shells. # #$fixed_shell = "/bin/sh"; $fixed_shell = ""; # # No need to change below here, hopefully. # $testlines = 5; # number of lines to feed to hello across stdin $sleep_interval = 1.0; # seconds between dots during wait # initialize possibly empty variables $nodeprops = "" unless defined($nodeprops); $mpcommargs = "" unless defined($mpcommargs); $use_ncpus = 0 unless defined($use_ncpus); # # Grep through an output file. Set this flag before the run if you # only want to count the lines. Problems with pbs_demux mixing up the # output prevents testing the actual contents of each line reliably in # the case of -nostdout. # $lookhello_count_only = 0; sub lookhello($$$) { my ($file, $lines_expected, $file_description) = @_; my $lines = 0; if (!open(OUT, $file)) { print "Missing $file_description output file: $file.\n"; return; } while () { ++$lines; if ($lookhello_count_only) { next; } if (!/^hello /) { if (defined($ignore_regex) && /$ignore_regex/) { --$lines; # explicitly requested this line be ignored } else { print "File $file: unexpected line: $_"; } } } close(OUT); if ($lines != $lines_expected) { print "File $file: got $lines lines, expected $lines_expected.\n" } } # # Run one test. Verify results. # args: # args to mpexec # args to hello # lines in mpiexec joined stdout/err # lines in pbs joined stdout/err # $iter = 1; sub run($$$$) { my ($mpargs, $helloargs, $expectmp, $expectpbs) = @_; ## debugging #$mpargs .= " -v"; my $suffix = sprintf("%d.%02s", $$, $iter); my $scrfile = "testqs.$suffix"; my $outfile = "testqo.$suffix"; my $helloout = "testho.$suffix"; open(TMP, ">$scrfile"); print TMP "#!/bin/sh\n"; my $str; if ($use_ncpus > 0) { $str = "#PBS -l ncpus=$smpsize"; # nodes is always 1 } else { $str = "#PBS -l nodes=$nodes:ppn=$smpsize"; if (length($nodeprops)) { $str .= ":$nodeprops"; } } print TMP "$str\n"; print TMP "#PBS -l walltime=5:00\n"; print TMP "#PBS -l cput=", 5 * $nodes * $smpsize, ":00\n"; print TMP "#PBS -j oe\n"; print TMP "#PBS -o $outfile\n"; if (length($shell)) { print TMP "#PBS -S $shell\n"; } print TMP "cd $dir\n"; if (defined($mpenvsetup) && $mpenvsetup =~ /./) { print TMP "$mpenvsetup\n"; } if (defined($stdin_no_pipe)) { print TMP $stdin_no_pipe; } elsif (defined($stdin_gets)) { print TMP "echo $stdin_gets | "; } else { # What is a good way to do this? brooks@aero.org pointed out a # nice way to use perl, but it may not be on the compute nodes. # echo -e is not on bsd. seq, jot are non-standard. print TMP "echo '[1+pd$testlines>x]sx0lxxq' | dc | "; } print TMP "./mpiexec $mpcommargs $mpargs"; if ($mpargs !~ /-config/) { print TMP " hello $helloargs"; } print TMP " > $helloout 2>&1\n"; close(TMP); my $qsub_res; my $qsub_errfile = "/tmp/mpiexec-runtests-qsub.err"; for (;;) { unlink($qsub_errfile); $qsub_res = `sh -c '$qsub $scrfile 2>$qsub_errfile'`; if (-s $qsub_errfile) { print "Runtests.pl: qsub failed: "; system("cat $qsub_errfile"); unlink($qsub_errfile); print "Runtests.pl: trying again in 1 second.\n"; sleep(1); next; } unlink($qsub_errfile); last; } # wait for the results $qsub_res =~ s/\..*\n//; print "$qsub_res to $outfile mpiexec"; if (length($mpcommargs)) { print " $mpcommargs"; } if (length($mpargs)) { print " $mpargs"; } print " hello"; if (length($helloargs)) { print " $helloargs"; } if (length($shell)) { print " shell=\"$shell\""; } print " "; for (; ! -f $outfile;) { print "."; select(undef, undef, undef, $sleep_interval); } print "\n"; # wait a bit for the file contents to completely appear, NFS or copy select(undef, undef, undef, 2 * $sleep_interval); lookhello($helloout, $expectmp, "mpiexec stdout"); lookhello($outfile, $expectpbs, "PBS -o output"); # inc iter for next ++$iter; # let gm and pbs catch and clean up sleep(1); } sub numproc_tests(@) { my @testlist = @_; # -n for my $i (@testlist) { run("-n $i", "", $i, 0); } } sub pernode_nolocal_tests() { if ($smpsize > 1) { run("-pernode", "", $nodes, 0); } if ($nodes > 1) { run("-nolocal", "", ($nodes - 1) * $smpsize, 0); run("-nolocal -pernode", "", $nodes - 1, 0); } } # # stdio tests # sub stdio_tests($) { my ($count) = @_; # 1..6 hello should ignore the stdin run("", "", $count, 0); run("-nostdin", "", $count, 0); run("-allstdin", "", $count, 0); $lookhello_count_only = 1; run("-nostdout", "", 0, $count); run("-nostdin -nostdout", "", 0, $count); run("-allstdin -nostdout", "", 0, $count); $lookhello_count_only = 0; # 7..12 hello will try to cat the stdin run("", "-l", $count + $testlines, 0); run("-nostdin", "-l", $count, 0); run("-allstdin", "-l", $count + $count * $testlines, 0); $lookhello_count_only = 1; run("-nostdout", "-l", 0, $count + $testlines); run("-nostdout -nostdin", "-l", 0, $count); run("-nostdout -allstdin", "-l", 0, $count + $count * $testlines); $lookhello_count_only = 0; # 13..18 things should work on stderr too run("", "-stderr", $count, 0); run("-nostdin", "-stderr", $count, 0); run("-allstdin", "-stderr", $count, 0); $lookhello_count_only = 1; run("-nostdout", "-stderr", 0, $count); run("-nostdin -nostdout", "-stderr", 0, $count); run("-allstdin -nostdout", "-stderr", 0, $count); $lookhello_count_only = 0; # 19..24 sleepy hello is slow to exit after it closes all stdio run("", "-sleep", $count, 0); run("-nostdin", "-sleep", $count, 0); run("-allstdin", "-sleep", $count, 0); $lookhello_count_only = 1; run("-nostdout", "-sleep", 0, $count); run("-nostdout -nostdin", "-sleep", 0, $count); run("-nostdout -allstdin", "-sleep", 0, $count); $lookhello_count_only = 0; # closing stdin should work, even if no -nostdin (though silly) # Note if you configured your torque with --enable-shell-pipe # (the default), bash will exit when this closes its command # stream. If there's no .ho file, that's why. Use --enable-shell-argv # instead. $stdin_no_pipe = "exec 0<&-\n"; run("", "-l", $count, 0); undef($stdin_no_pipe); } # # Test what happens when a process dies with segv. # sub segv_tests($@) { my $count = shift @_; my @testlist = @_; # The various MPIs say different things as processes die oddly that we try # to catch and ignore here. Mpich/p4 catches the segv and does exit(1) # instead, meaning the others may or may not exit properly. Dumb. Worse # yet, each task tries to call up all the others and sleeps 2 sec between # failed attempts. That really scales, yeah. Could add -kill here to avoid # that issue, although it covers up the mpich problem. print "Note: mpich1/p4 catches SIGSEGV then tries to contact all other\n"; print "processes, with a 2 second delay after each. If you are using\n"; print "that MPI implementation, these tests could hang until the job\n"; print "walltime (5 min) is reached.\n"; # the selected segvs after 2 sec, rest sleep 6 sec $ignore_regex = "(died with signal|exited with status|p4_error|net_send)"; for my $i (@testlist) { run("", "-sleep -segv " . ($i-1), $count, 0); } # all do segv before MPI_Init $ignore_regex = "(died with signal|exited before completing MPI startup|were never spawned)"; run("", "-segvearly", 0, 0); undef($ignore_regex); } # # Test when a process calls MPI_Abort(). Note that mpich2 (using PMI) does # not support abort. The other tasks may hang in MPI_Finalize and these # tests will exit only after the batch system time limit. # sub abort_tests($@) { my $count = shift @_; my @testlist = @_; print "Note: MPI_Abort is not implemented properly in mpich2. If you\n"; print "are using that MPI implementation, these tests will hang until\n"; print "the job walltime (5 min) is reached.\n"; # the given task calls MPI_Abort after 2 sec, while the rest sleep 6 sec $ignore_regex = "(died with signal|exited with status|p4_error|net_send" . "|MPI[_ ]Abort|Aborting program|aborting job)"; for my $i (@testlist) { run("", "-sleep -abort " . ($i-1), $count, 0); } } # # Check exit value returned to environment. # sub exitval_tests($) { my $count = shift @_; for (my $i=0; $i<=2; $i++) { if ($i > 0) { $ignore_regex = "exited with status $i\\\."; } run("", "-exitval $i", $count, 0); undef($ignore_regex); } } # # Hostname transformation. # sub transform_hostname_tests($) { my $count = shift @_; run("--transform-hostname=s/hiya/hiya/", "", $count, 0); run("--transform-hostname-program=cat", "", $count, 0); } # # Test use of config file, once from a file and once from stdin. # Builds the config file or stdin string, then calls run() to do # the dirty work and verify the results. # sub config_tests(@) { my @testlist = @_; for my $i (@testlist) { my $s = sprintf("testc.%d.%02s", $$, $iter); open(TMP, ">$s"); print TMP "# mpiexec config\n"; print TMP "-n $i : hello\n"; close(TMP); run("-config $s", "", $i, 0); } for my $i (@testlist) { $stdin_gets = "\"-n $i : hello\""; run("-config=- -nostdin", "", $i, 0); undef($stdin_gets); } # Test constraint handling too: more or fewer lines in config # file than are available, with or without -np on command line. # Just test max size in list. my $i = $testlist[$#testlist]; my $j; # config file specifies more than available, with and without -np $j = $i + 1; $stdin_gets = "\"-n $j : hello\""; run("-config=- -nostdin", "", $i, 0); run("-config=- -nostdin -np $i", "", $i, 0); # config file specifies fewer than available, with and without -np if ($i > 1) { $j = $i - 1; $stdin_gets = "\"-n $j : hello\""; run("-config=- -nostdin", "", $j, 0); # should error, asked for $i but config file only gave $i-1 $ignore_regex = "(specifies $i processors|only matched $j\\\.)"; run("-config=- -nostdin -np $i", "", 0, 0); undef($ignore_regex); } # done undef($stdin_gets); } # # Test each shell, make sure the "cd" in the mpiexec startup line works, # and test passing a single quote. # # Some shells are just unsuitable for use non-interactively. # In particular, csh and tcsh here complain about "no access # to tty (Bad file descriptor)", but work anyway. # sub shell_test() { $nodes = 1; my $count = $nodes * $smpsize; my @shells; print "Testing the various shells on your system.\n"; if (open(SHELLS, "/etc/shells")) { while () { chomp; if (-x $_) { push @shells, $_; } } close(SHELLS); } else { # reasonable defaults? for ( "/bin/sh", "/bin/csh" ) { if (-x $_) { push @shells, $_; } } } # modify global $shell for each test for $shell (@shells) { print "shell is $shell\n"; # these escapes in perl yield "don\'doit" in file run("", "don\\\'tdoit", $count, 0); } } # # main() # $| = 1; # see output immediately if (not -x "hello") { print "No executable \"hello\". You might \"make hello\" first.\n"; exit(1); } # Common user error: PBS is not happy running jobs for root. if ($EUID == 0) { print "PBS will not run jobs for root.", " Run this script as a normal user.\n"; exit(1); } # what shell if (defined($fixed_shell) && $fixed_shell =~ /./) { $shell = $fixed_shell; } else { $shell = ""; # use user default } # test max nodes and just one node if ($use_ncpus > 0) { $available_nodes = 1; # no multi-node for time-shared } @nodeset = ($available_nodes); if ($available_nodes > 1) { # test nodes=1 case too push @nodeset, 1; } foreach $nodes (@nodeset) { print "Testing $nodes node", ($nodes > 1 ? "s" : ""), " with SMP size $smpsize.\n"; # total number of processes my $count = $nodes * $smpsize; # # Only test some subset of all possible combinations. Just beyond # the SMP size, then the full size. # my @testlist; my $max = $smpsize + 1; if ($max > $count) { @testlist = (1..$count); } else { @testlist = (1..$max,$count); } numproc_tests(@testlist); pernode_nolocal_tests(); stdio_tests($count); segv_tests($count, @testlist); abort_tests($count, @testlist); exitval_tests($count); transform_hostname_tests($count); config_tests(@testlist); } if (!defined($fixed_shell)) { shell_test(); } exit 0;