#!/usr/public/bin/perl
# $Id: momspider,v 1.8 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
#
# MOMspider -- A World-Wide Web spider for multi-owner maintenance of
#              distributed hypertext infostructures.
#
# This software has been developed by Roy Fielding <fielding@ics.uci.edu> as
# part of the Arcadia project at the University of California, Irvine.
# See the file README.html for distribution info and pointers to documentation.
# See the file docs/INSTALL.txt for installation instructions.
# See the file MOM_Changes.pl for known problems and version information.
# See below for usage information.
#
# The latest version of MOMspider can always be obtained from
#     <http://www.ics.uci.edu/WebSoft/MOMspider/>
#  or <ftp://liege.ics.uci.edu/pub/arcadia/MOMspider/>
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author Roy Fielding at <fielding@ics.uci.edu>.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine.  The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission.  THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#   
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#   
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ---------------------------------------------------------------------------

# Clear the group/other write bits on every file this process creates, so
# that generated files stay world-readable.
umask(022);

# The library directories are taken from the environment; an unset or
# empty variable means "use the current directory".
$WWWlib = $ENV{'LIBWWW_PERL'};
$WWWlib = '.' unless $WWWlib;
$MOMlib = $ENV{'MOMSPIDER_HOME'};
$MOMlib = '.' unless $MOMlib;

# Only non-default directories are added to the search path.  $MOMlib is
# unshifted last, so it ends up ahead of $WWWlib in @INC.
unshift(@INC, $WWWlib) unless $WWWlib eq '.';
unshift(@INC, $MOMlib) unless $MOMlib eq '.';

require 'MOM_Changes.pl';
require 'getopts.pl';
require 'www.pl';
require 'momconfig.pl';
require 'momhistory.pl';
require 'momevent.pl';
require 'momavoid.pl';
require 'momvisit.pl';

# ==========================================================================
# Default configuration: each setting comes from the momconfig package when
# it holds a true value there, otherwise from the fallback given here.

$InstructFile = $momconfig'InstructFile || '.momspider-instruct';
$SystemAvoid  = $momconfig'SystemAvoid  || 'system-avoid';
$SystemSites  = $momconfig'SystemSites  || 'system-sites';
$AvoidFile    = $momconfig'AvoidFile    || '.momspider-avoid';
$SitesFile    = $momconfig'SitesFile    || '.momspider-sites';
$MaxDepth     = $momconfig'MaxDepth     || 20;

# ==========================================================================
# ==========================================================================
# usage(): Print the command-line synopsis and terminate the program.
#
# Called when help is requested (-h), when a bad option is given, or when
# the instruction file cannot be opened.  Takes no arguments and never
# returns normally: the text is raised with die, so it goes to STDERR
# unless a caller traps it with eval.
#
# The bracketed defaults are interpolated from the current values of
# $InstructFile, $MaxDepth, $AvoidFile, $SitesFile, $SystemAvoid and
# $SystemSites; $Version is printed as the banner line.
#
# The -o description states what the option handling really does: an
# existing output file is renamed to <name>.bak and a fresh file is
# written, it is not appended to (unlike the -e error file).
#
sub usage
{
    die <<"EndUsage";
usage: momspider [-h] [-e errorfile] [-o outfile] [-i instructfile]
                      [-d maxdepth] [-a avoidfile] [-s sitesfile]
                      [-A system_avoidfile] [-S system_sitesfile]
$Version
WWW Spider for multi-owner maintenance of distributed hypertext infostructures.
Options:                                                        [DEFAULT]
     -h  Help    -- just display this message and quit.
     -e  Append error  history to the following file.           [STDERR]
     -o  Write output history to the following file; an existing
         file of that name is first renamed to <name>.bak.      [STDOUT]
     -i  Get your instructions from the following file. 
         [$InstructFile]
     -d  Maximum traversal depth.                               [$MaxDepth]
     -a  Read/write the user's URLs to avoid into the following file. 
         [$AvoidFile]
     -s  Read/write the user's sites visited into the following file. 
         [$SitesFile]
     -A  Read the systemwide URLs to avoid from the following file. 
         [$SystemAvoid]
     -S  Read the systemwide sites visited from the following file. 
         [$SystemSites]
EndUsage
}


# ==========================================================================
# Get the command-line options
#
# Getopts (from getopts.pl) parses @ARGV and leaves each switch it finds in
# the matching $opt_<letter> variable.  A parse failure or -h prints the
# usage text and ends the program.

if (!(&Getopts('hi:o:e:d:a:s:A:S:')) || $opt_h) { &usage; }

if ($opt_e) {
    # Open the error log on a scratch handle BEFORE giving up the current
    # STDERR.  Closing STDERR first would mean that a failed open dies with
    # its message written to a closed handle, i.e. with no diagnostic at all.
    open (ERRLOG,">> $opt_e") || die "Error opening err file: $!, stopped";
    close(STDERR);
    open (STDERR,">&ERRLOG")  || die "Error redirecting STDERR: $!, stopped";
    close(ERRLOG);            # STDERR now holds its own duplicate of the log
}
select((select(STDERR), $| = 1)[0]);      # Make STDERR unbuffered

if ($opt_o) {
    # Keep the previous output file as <name>.bak.  A failed rename is
    # reported but not fatal; without the report the old log would be
    # truncated below with no hint that the backup was never made.
    if (-e $opt_o) {
        rename($opt_o, "$opt_o.bak")
            || print STDERR "Cannot rename $opt_o to $opt_o.bak: $!\n";
    }
    close(STDOUT);
    open (STDOUT,"> $opt_o")  || die "Error opening log file: $!, stopped";
}
$| = 1;                                   # Make STDOUT unbuffered

# Command-line values override the configured defaults.  Only true values
# count, so an option given as 0 or the empty string leaves the default.
if ($opt_d) { $MaxDepth     = $opt_d; }
if ($opt_i) { $InstructFile = $opt_i; }
if ($opt_a) { $AvoidFile    = $opt_a; }
if ($opt_s) { $SitesFile    = $opt_s; }
if ($opt_A) { $SystemAvoid  = $opt_A; }
if ($opt_S) { $SystemSites  = $opt_S; }

# ==========================================================================
# Main program: read the instructions, load the avoid/sites tables, run
# every task that survived validation, then save the user's tables.

&momevent'begin_program($opt_e);

# Fills the @Task* arrays; may also change $MaxDepth and the avoid/sites
# file names, which is why it runs before they are used below.
&read_instruct;

&www'set_def_header('http', 'User-Agent', $Version);
&momvisit'setMaxDepth($MaxDepth);

# Systemwide tables are loaded with mode 'R', the user's with mode 'W'.
&momavoid'load($SystemAvoid, $SystemSites, 'R');
&momavoid'load($AvoidFile,   $SitesFile,   'W');

&momhistory'remember_tops(*TaskTopURL, *TaskIndexURL);

# Task numbers start at 1.  A slot without a type belongs to a task that
# read_instruct rejected, so it is passed over.
for ($task = 1; $task <= $#TaskType; $task++)
{
    if ($TaskType[$task])
    {
        # The task's exclusions are stored as one '#'-separated string.
        foreach $exurl (split(/#/, $TaskExclude[$task]))
        {
            &momavoid'exclude($exurl);
        }

        &momvisit'infostruct($task, $TaskType[$task], $TaskName[$task],
                             $TaskTopURL[$task]);

        # Exclusions apply to one task only.
        &momavoid'clear_excludes;
    }
}

&momavoid'save($AvoidFile, $SitesFile);  # Write user's avoid and sites files

&momevent'end_program($opt_e);

exit(0);


# ==========================================================================
# ==========================================================================
# read_instruct(): Read the task instructions from the InstructFile
#
# Takes no arguments and has no meaningful return value; all results are
# left in package globals.
#
#   Reads:  $InstructFile, %momconfig'Allowed and the command-line switches
#           $opt_A, $opt_S, $opt_a, $opt_s and $opt_d.
#   Sets:   the parallel @Task* arrays (indexed by task number, first task
#           is 1) and -- unless the matching command-line switch was given --
#           $SystemAvoid, $SystemSites, $AvoidFile, $SitesFile, $MaxDepth.
#   Calls:  &momavoid'setCheckInterval for a SitesCheck line and
#           &www'set_def_header for a ReplyTo line.
#
# Input format, as recognized by the parser below:
#   - Blank lines and lines whose first non-blank character is '#' are
#     ignored.
#   - Outside a task, a line is either a global directive (SystemAvoid,
#     SystemSites, AvoidFile, SitesFile, SitesCheck, ReplyTo, MaxDepth) or
#     "<Type", which opens a task of that type.
#   - Inside a task, each line is one option (Name, TopURL, IndexURL, ...)
#     and a line holding only ">" closes the task.
#
# A task whose type is not enabled in %momconfig'Allowed is skipped up to
# its closing ">".  A task missing a required option is reported on STDERR
# and discarded.  Unrecognized lines are reported on STDERR and ignored.
#
# Dies if a "<" line appears inside a task or if the file ends inside one.
# If the file cannot be opened, a message is printed and &usage is called,
# which dies.
#
sub read_instruct
{
    local($task, $intask, $innum, $type, $reason);

    print "Reading instructions from $InstructFile\n";

    if (!open(INSTRUCT, $InstructFile))
    {
        print STDERR "Cannot open the instruction file: $!\n";
        &usage;
    }

    @TaskType         = ();  # Required task type = ('Owner','Tree','Site').
    @TaskName         = ();  # Required task name for descriptive use.
    @TaskTopURL       = ();  # Required WWW URL for the starting top document.
    @TaskIndexURL     = ();  # Required WWW URL for the index document.
    @TaskIndexFile    = ();  # Required full Unix pathname to index file.
    @TaskIndexTitle   = ();  # Optional title string for created index file.
    @TaskEmailAddress = ();  # Optional email address to send alerts.
    @TaskEmailBroken  = ();  # Optional -- send alert if a link is broken?
    @TaskEmailChanged = ();  # Optional -- send alert if modified within X days
    @TaskEmailExpired = ();  # Optional -- send alert if expires  within X days
    @TaskEmailRedirected = (); # Optional -- send alert if a link is redirected
    @TaskChangeWindow = ();  # Optional days a change is still interesting
    @TaskExpireWindow = ();  # Optional days before expiring is interesting
    @TaskExclude      = ();  # Optional URLs to exclude (leaf) from this task

    $task   = 0;             # Number of the task being filled in (0 = none)
    $intask = 0;             # True while between a "<Type" line and its ">"
    $innum  = 0;             # Counts directives and task openers, for messages

    while (<INSTRUCT>)
    {
        # Ignore blank and comment lines.  Leading white space is allowed
        # because task options may be indented, so an indented comment or a
        # line of stray blanks must not be reported as unrecognized.
        next if ( /^\s*$/ || /^\s*\#/ );

        if (!$intask)                 # We are not within a task,
        {
            $innum++;
            if ( /^<(\w+)\b/ )             # Start of next task?
            {
                $type = $1;
                if ($momconfig'Allowed{$type})
                {
                    $task++;
                    $TaskType[$task]    = $type;
                    $TaskExclude[$task] = '';
                    $intask = 1;
                }
                else
                {
                    print STDERR "Instruction type $type is not allowed";
                    print STDERR ", skipping instruction $innum\n";
                    # Discard the body of the disallowed task.  The end
                    # marker is matched exactly as for an accepted task
                    # (trailing white space allowed); otherwise a "> " line
                    # would be missed and the following instructions would
                    # be swallowed as well.
                    while (<INSTRUCT>)
                    {
                        last if /^>\s*$/;
                    }
                }
            }
            elsif ( /^SystemAvoid\s+(\S+)\s/ )
            {
                if (!$opt_A) { $SystemAvoid = $1; }
            }
            elsif ( /^SystemSites\s+(\S+)\s/ )
            {
                if (!$opt_S) { $SystemSites = $1; }
            }
            elsif ( /^AvoidFile\s+(\S+)\s/ )
            {
                if (!$opt_a) { $AvoidFile = $1; }
            }
            elsif ( /^SitesFile\s+(\S+)\s/ )
            {
                if (!$opt_s) { $SitesFile = $1; }
            }
            elsif ( /^SitesCheck\s+(\d+)\s/ )
            {
                &momavoid'setCheckInterval($1);
            }
            elsif ( /^ReplyTo\s+(\S.*)/ )
            {
                &www'set_def_header('http','From',$1);
            }
            elsif ( /^MaxDepth\s+(\d+)\s/ )
            {
                if (!$opt_d) { $MaxDepth = $1; }
            }
            else
            {
                print STDERR "Unrecognized instruction $innum at line $.\n";
                print STDERR "  of $InstructFile\n";
            }
        }
        else                          # We are currently within a task
        {
            if ( /^>\s*$/ )             # Line indicates End of Task?
            {
                $reason = '';
                if (! $TaskType[$task])   # Check for missing requirements
                {
                    $reason = "has no task Type";
                }
                elsif (! $TaskName[$task])
                {
                    $reason = "has no Name";
                }
                elsif (! $TaskTopURL[$task])
                {
                    $reason = "has no TopURL";
                }
                elsif (! $TaskIndexURL[$task] )
                {
                    $reason = "has no IndexURL";
                }
                elsif (! $TaskIndexFile[$task])
                {
                    $reason = "has no IndexFile";
                }
                elsif ((! $TaskEmailAddress[$task]) &&
                         ($TaskEmailBroken[$task]  ||
                          $TaskEmailChanged[$task] ||
                          $TaskEmailExpired[$task] ||
                          $TaskEmailRedirected[$task]  ))
                {
                    # An alert was requested but there is nobody to send to.
                    $reason = "has no EmailAddress";
                }

                if ($reason)               # If a task requirement was not met,
                {                          #        then undo its task options.
                    print(STDERR "Instruction $innum ", $reason,
                                 ", skipping it\n");
                    undef $TaskType[$task];
                    undef $TaskName[$task];
                    undef $TaskTopURL[$task];
                    undef $TaskIndexURL[$task];
                    undef $TaskIndexFile[$task];
                    undef $TaskIndexTitle[$task];
                    undef $TaskEmailAddress[$task];
                    undef $TaskEmailBroken[$task];
                    undef $TaskEmailChanged[$task];
                    undef $TaskEmailExpired[$task];
                    undef $TaskEmailRedirected[$task];
                    undef $TaskChangeWindow[$task];
                    undef $TaskExpireWindow[$task];
                    undef $TaskExclude[$task];
                    $task--;               # The next task reuses this slot
                }
                else                        # Fill in the defaults if needed
                {
                    if (! $TaskIndexTitle[$task])
                    {
                        $TaskIndexTitle[$task] = "MOMspider Index for " .
                                                 $TaskName[$task];
                    }
                    if (!defined($TaskChangeWindow[$task]))
                    {
                        $TaskChangeWindow[$task] = 7;
                    }
                    if (!defined($TaskExpireWindow[$task]))
                    {
                        $TaskExpireWindow[$task] = 0;
                    }
                    # The exclude list is one string of '#'-terminated URLs;
                    # the task's own index document goes last.
                    $TaskExclude[$task] .= $TaskIndexURL[$task];
                }
                $intask = 0;
            }
            elsif ( /^</ )                 # Line indicates Beginning of Task?
            {
                die "Instruction $innum is not properly terminated, stopped";
            }
            elsif ( /^\s*Name\s+(\S+)/ )
            {
                $TaskName[$task] = $1;
            }
            elsif ( /^\s*TopURL\s+(\S+)/ )
            {
                $TaskTopURL[$task] = $1;
            }
            elsif ( /^\s*IndexURL\s+(\S+)/ )
            {
                $TaskIndexURL[$task] = $1;
            }
            elsif ( /^\s*IndexFile\s+(\S+)/ )
            {
                $TaskIndexFile[$task] = $1;
            }
            elsif ( /^\s*IndexTitle\s+(\S.*)/ )
            {
                $TaskIndexTitle[$task] = $1;
            }
            elsif ( /^\s*EmailAddress\s+(\S.*)/ )
            {
                $TaskEmailAddress[$task] = $1;
            }
            elsif ( /^\s*EmailBroken\s/ )
            {
                $TaskEmailBroken[$task] = 1;
            }
            elsif ( /^\s*EmailChanged\s+(\d+)/ )
            {
                $TaskEmailChanged[$task] = $1;
            }
            elsif ( /^\s*EmailExpired\s+(\d+)/ )
            {
                $TaskEmailExpired[$task] = $1;
            }
            elsif ( /^\s*EmailRedirected\s/ )
            {
                $TaskEmailRedirected[$task] = 1;
            }
            elsif ( /^\s*ChangeWindow\s+(\d+)/ )
            {
                $TaskChangeWindow[$task] = $1;
            }
            elsif ( /^\s*ExpireWindow\s+(\d+)/ )
            {
                $TaskExpireWindow[$task] = $1;
            }
            elsif ( /^\s*Exclude\s+([^#\s]+)/ )
            {
                # The capture stops at '#', so the stored URL can never
                # contain the character used as the list separator.
                $TaskExclude[$task] .= $1 . '#';
            }
            else
            {
                print STDERR "Unrecognized task option in instruction $innum\n";
                print STDERR "  at line $. of $InstructFile\n";
            }
        }
    }
    if ($intask)
    {
        die "Last instruction is not properly terminated, stopped";
    }
    close INSTRUCT;
}

# ==========================================================================
# Conventional true value at the end of a Perl file.  The main program above
# calls exit(0) before control can reach this statement.

1;