#!/usr/bin/env perl
#
# script4rss  v. 0.4b by Pieter Edelman
# This is a perl script which attempts to generate out a perl script (no this 
# isn't a typo) which converts a (single) HTML page to an RSS feed.
# WARNING: This script is usable but not ready for prime time.
#
# Copyright (c) 2004 Pieter Edelman
# Released under the terms of the GNU General Public License (GPL) Version 2.
# See http://www.gnu.org/ for details.

use Getopt::Std;

# The number of catagories which needs to be extracted (normally this will be 1)
$num_match_catagories = 0;

# The required and optional variables in bitmask
# 1 is always on, 0/2 means is scalar/array, 0/4 means optional/required,
# 8 means belonging to a match
# 16 means a string, 32 means a regex, 64 means a boolean, 128 means a time string
$variables{"script_name"}         = 0b00010101;
$variables{"script_author"}       = 0b00010101;
$variables{"license"}             = 0b00010111;
$variables{"comment"}             = 0b00010011;
$variables{"feed_title"}          = 0b00010101;
$variables{"feed_uri"}            = 0b00010101;
$variables{"feed_description"}    = 0b00010011;
$variables{"feed_image_uri"}      = 0b00010001;
$variables{"feed_interval"}       = 0b10000001;
$variables{"fix_html"}            = 0b01000001;
$variables{"read_line_multiple"}  = 0b01000001;
$variables{"search"}              = 0b00101111;
$variables{"match"}               = 0b00101111;
$variables{"match_start"}         = 0b00101011;
$variables{"match_end"}           = 0b00101011;
$variables{"title"}               = 0b00011111;
$variables{"title_prefix"}        = 0b00011011;
$variables{"title_postfix"}       = 0b00011011;
$variables{"link"}                = 0b00011111;
$variables{"link_prefix"}         = 0b00011011;
$variables{"link_postfix"}        = 0b00011011;
$variables{"description"}         = 0b00011011;
$variables{"description_prefix"}  = 0b00011011;
$variables{"description_postfix"} = 0b00011011;

$defaults{"fix_html"} = 1;

my %extra_matches;

# Call the main parts
&printBlurb;
&getInputFile;
&defineStrings;
&processDescriptionFile;
&printScript;

sub printBlurb {
# Prints the welcome message to the user
  print "\n** This is script4rss, version 0.4b **\n";
  print "** http://script4rss.sf.net/ **\n";
  print "If you create a useful script, please consider donating it at\n";
  print "http://home.kcore.de/~kiza/software/snownews/snowscripts/\n\n";
}


sub printUsage {
# Prints out how the program should be used
  print "Usage: script4rss.pl [-o outfile] description_file[.s4r]\n";
  print "See doc/doc.html for more information\n";
  exit;
}

sub getInputFile {
  # Parses the command line for an input file name and optionally the output file name
  # The file name is given on the command line
  if (!getopts('o:h')) {
    &printUsage();
  } else {
    if (defined($opt_h)) {
      # If -h switch was given
      &printUsage();
    }
    if (defined($opt_o)) {
      # If an output file was specified
      $out_file_name = $opt_o;
    }
    if (!$ARGV[0]) {
      # If no file name was given
      &printUsage();
    } else {
      $in_file_name = $ARGV[0];
      # Open the file, try the extension .s4r if it can't be found
      if (!(-e $in_file_name)) {
        if (-e $in_file_name.'.s4r') {
          $in_file_name .= '.s4r';
        } else {
          &raise("\"$in_file_name\" or \"$in_file_name.s4r\" can't be opened...");
        }
      } 
      print "Using \"$in_file_name\" as input file\n";
      open(in_file, $in_file_name);
    }
  }
}

sub createComment {
  # Creates a properly formatted and indented comment
  # Arguments: (comment,  indentation level, lines to skip)
  local $comment = "";
  
  $comment .= &indent($_[1]);
  $comment .= '# '.$_[0]."\n";
  print out_file $comment;
  skipLines($_[2]);
}

sub createCommand {
  # Creates a properly formatted and indented command
  # Arguments (command,  indentation level, lines to skip)
  local $command = "";
  
  $command .= &indent($_[1]);
  $command .= $_[0]."\n";
  print out_file $command;
  skipLines($_[2]);
}

sub createPrintCommand {
  # Creates a properly formatted and indented command
  # Arguments: (command,encoding ('|"), indentation level, output indentation level, lines to skip)
  local $command = "";
  
  $command .= &indent($_[2])."print ";
  if ($_[1] =~ /\'/) {
    $command .= "'".&indent($_[3]).$_[0].'\'."\n";';
  } else {
    $command .= '"'.&indent($_[3]).$_[0].'\n";';
  }
  createCommand($command, $_[4]);
}

sub indent {
  # Adds spaces for indentation according to the level as argument
  local ($spaces, $i) = ("", 0, 0);
  
  for ($i = 0; $i < ($_[0] * 2); $i++) {$spaces .= " "}
  $spaces;
}

sub skipLines {
  # Skips a number of lines
  if ($_[0] > 0) {
    for ($i = 0; $i < $_[0]; $i++) {
      print out_file "\n";
    }
  }
}

sub constructMatchedPattern {
  # Construct a string from a var name, and its pre-and postfix
  # Arguments are the var name as a string and the index if applicable
  local ($type, $type_prefix, $type_postfix, $content);
  $type_prefix  = "";
  $type_postfix = "";

  if (defined($_[1])) {
    # The match is part of a non-user-defined match
    eval "\$type = \$$_[0]\[$_[1]];";
    eval "if (defined(\$$_[0]_prefix[$_[1]])) {\$type_prefix = \$$_[0]_prefix[$_[1]];}";
    eval "if (defined(\$$_[0]_postfix[$_[1]])) {\$type_postfix = \$$_[0]_postfix[$_[1]];}";
  } else {
    eval "\$type = \$$_[0];";
    eval "if (defined(\$$_[0]_prefix)) {\$type_prefix = \$$_[0]_prefix;}";
    eval "if (defined(\$$_[0]_postfix)) {\$type_postfix = \$$_[0]_postfix;}";
  }
  
  if ($type) {
    $content = $type_prefix;
    if ($type =~ /^\d+$/) {
      # If there's only a number
      $content .= "'.";
      ($content .= $type) =~ s/(\d+)/&cleanup(\$match\[\1\])/g;
      $content .= ".'";
    } else {
      # A string should be processed and backreferences converted to matches
      $orig = $type;
      @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig);
      if (scalar(@parts) > 1) {
        $stripped = "";
        while (length($parts[3]) != 0) {
          if ((length($parts[2])/2) != int(length($parts[2])/2)) {
            $stripped .= $parts[1];
            for ($i = 1; $i < (length($parts[2])); $i++) {
              $stripped .= '\\';
            }
            $stripped .= '\'.&cleanup($match['.$parts[3].']).\'';
            $orig = $parts[4];
          } else {
            $stripped .= $parts[1].$parts[2].$parts[3];
            $orig = $parts[4];
          }
          @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig); 
        } 
      } else {
        $stripped = $orig;
      }

      $orig = $stripped;
      @parts = (split /(.*?)(\\+)(\w+)(.*)/, $orig);
      if (scalar(@parts) > 1) {
        $stripped = "";
        while (length($parts[3]) != 0) {
          print "@parts\n";
          if ((length($parts[2])/2) != int(length($parts[2])/2)) {
            $stripped .= $parts[1];
            for ($i = 1; $i < (length($parts[2])); $i++) {
              $stripped .= '\\';
            }
            $stripped .= '\'.$'.$parts[3].'_name.\'';
            $orig = $parts[4];
          } else {
            $stripped .= $parts[1].$parts[2].$parts[3];
            $orig = $parts[4];
          }
          @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig); 
        }
      } else {
        $stripped = $orig;
      }
      $content = $stripped;
    }
    $content .= $type_postfix;
  }
  
  # Filter out empty strings
  $content =~ s/''\.//g;
  $content =~ s/\.''//g;
  $content;
}

sub createFeedEntry {
  # Create a properly formatted feed entry, the name of which is gaven as an argument
  local ($type);
  
  eval "\$type = \$$_[0]\[$html_cat_num];";
  if ($type) {
    $command = "print '".&indent(3)."<$_[0]>".&constructMatchedPattern($_[0], $html_cat_num)."<\/$_[0]>'.\"\\n\";";
    &createCommand($command, 3);
  }
}

sub constructExtraMatchBlock {
  # Construct an extra, user defined match
  local ($match_name, $test);
  $match_name = $_[0];
  
  &createComment("Find $match_name", 1);
  eval '$test = $'.$match_name.'_search';
  &createCommand('if ($line =~ m'.$test.') {', 1);

  eval "\$test = \$$match_name_start;";
  if (defined($test)) {
    &createComment("Skip to the first occurence of $test", 2);
    &createCommand("while (\$line !~ m$test) {", 2);
    if ($fix_html) {$command = '$line = &fixHTML(shift(@lines));'} else {$command = '$line = shift(@lines);'}
    &createCommand($command, 3);
    &createCommand("}", 2);
  }
  
  # Create the part which concats the next line untile the closing tag, if needed
  eval '$test = $'.$match_name.'_end;';
  if (defined($test)) {
    &createComment("Create a line up until $test", 2);
    &createCommand("while (\$line !~ m$test) {", 2);
    &createCommand('chomp($line);', 3);
    if ($fix_html) {$command = '$line .= &fixHTML(shift(@lines));'} else {$command = '$line .= shift(@lines);'}
    &createCommand($command, 3);
    &createCommand('}', 2);
  }
  
  eval '$test = $'.$match_name.'_match;';
  &createComment('Find the $match_name name', 2);
  &createCommand('@match = (split'.$test.', $line);', 2);
  &createCommand('$'.$match_name."_name = '".&constructMatchedPattern($match_name).'\';', 2);
  
  &createCommand('}', 1, 1);
}

sub addExtraMatch {
  local ($var_name, $var_value, $var_identifier);
  ($var_name, $var_value) = @_;
  
  ($var_identifier = $var_name) =~ s/(.*)(_match|_search|_start|_end|_prefix|_postfix)/\1/;
  $extra_matches{$var_identifier} = 1;
  $variables{$var_identifier}            = 0b00010101;
  $variables{$var_identifier.'_search'}  = 0b00100101;
  $variables{$var_identifier.'_match'}   = 0b00100101;
  $variables{$var_identifier.'_start'}   = 0b00100001;
  $variables{$var_identifier.'_end'}     = 0b00100001;
  $variables{$var_identifier.'_prefix'}  = 0b00010001;
  $variables{$var_identifier.'_postfix'} = 0b00010001;
  &setScalar($var_name, $var_value);
}

sub raise {
  # Prints an error to the console and aborts
  print STDERR "$_[0]\nAborted!\n";
  exit;
}

sub checkVariable {
  # Check if the variable is not defined twice and formatted properly
  local ($var_name, $var_value, $already_defined);
  ($var_name, $var_value) = ($_[0], $_[1]);
  
  if ($variables{$var_name}) {
    # Check for multiple definitions in scalars
    if (not($variables{$var_name} & 2)) {
      eval('$already_defined = defined($'.$var_name.');');
      if ($already_defined) {&raise("\"$var_name\" is defined twice...")};
    }

    if ($variables{$var_name} & 16) {
      # Convert double backslashes in strings to single ones
      $var_value =~ s/[\\][\\]/\\/g;
    } elsif ($variables{$var_name} & 64) {
      # Should be a boolean
      if ($var_value =~ /^(false|no|0)$/i) {
        $var_value = 0;
      }
      elsif ($var_value =~ /^(true|yes|[1-9]*)$/i) {
        $var_value = 1;
      }
      else {
        &raise("\"$var_name\" should be a boolean...");
      }
    } elsif ($variables{$var_name} & 32) {
      # Should be a regex
      if ($var_value !~ /\/.*\/[ig]*/) {
        raise "$var_name should be a regular expression";
      }
    } elsif ($variables{$var_name} & 128) {
      # Should be a time string
      if ($var_value !~ /(\d+w)?(\d+d)?(\d+h)?(\d+m)?/) {
        raise "$var_name should be a time formatted as XXwXXdXXhXXm";
      } else {
        local ($interval_str);
        $interval_str = $var_value;
        $interval_str =~ s/(\d*\D)(.)/\1+\2/g;
        $interval_str =~ s/w/*10080/;
        $interval_str =~ s/d/*1440/;
        $interval_str =~ s/h/*60/;
        $interval_str =~ s/m//;
        eval '$var_value = '.$interval_str.';';
      }
    }
  }
  
  $var_value;
}

sub setScalar {
  # Sets a scalar from the input file to a given value, after some checks
  local ($var_value);
  $var_value = &checkVariable($_[0], $_[1]);

  if (($variables{$var_name} & 16) || ($variables{$var_name} & 32)) {
    # If the scalar is an array or a regex
    eval "\$$_[0] = \$var_value";
  } else {
    eval "\$$_[0] = $var_value";
  }
}

sub setArray {
  # Adds or sets a value to an array, after some proper checking
  local ($var_name, $var_value) = @_;
  $var_value = &checkVariable($var_name, $var_value);

  if ($variables{$var_name} & 8) {
    # Belongs to a match
    eval "\$$var_name\[".($num_match_catagories-1)."\] = \$var_value";
  } else {
    eval "push(\@$var_name, '$var_value')";
  }
}

sub processDescriptionFile {
  local ($var_name, $var_content);

  # Process the contents of the output file
  foreach (<in_file>) {
    # Strip out comments, leave escaped hashes alone
    $orig = $_;
    @parts = (split /(.*?)(\\*)(#+)(.*)/, $orig); 
    $stripped = "";
    # While a hash mark is found
    while (length($parts[3]) == 1) {
      if ((length($parts[2])/2) != int(length($parts[2])/2)) {
        # If its prepended by an odd number of backslashes, add it and try to find hashes in de next part
        $stripped .= $parts[1];
        for ($i = 1; $i < (length($parts[2]) - 1); $i++) {
          $stripped .= '\\';
        }
        $stripped .= $parts[3];
        $orig = $parts[4];
        @parts = (split /(.*?)(\\*)(#+)(.*)/, $orig); 
      } else {
        # If it's not escaped, add the first part and throw away the rest
        $stripped .= $parts[1].$parts[2];
        $orig = "";
        last;
      }
    }
    $stripped .= $orig;
    $_ = $stripped;
    
    # Find catagories
    if ($_ =~ /^\s*\[.*\]\s*$/) {
      $num_match_catagories++;
    }
    # Process lines that are in the form name : value
    elsif ($_ =~ /^.*[^\s]+.*:.*[^\s]+.*$/) {
      # Extract the variable name and content
      split(/^\s*(.*?)\s*:\s*(.*)\s*$/, $_);
      $var_name    = $_[1];
      $var_content = $_[2];
      
      if ($variables{$var_name}) {
        if ($variables{$var_name} & 2) {
        # Array
          setArray($var_name, $var_content);
        } else {
          setScalar($var_name, $var_content);
        }
      } else {
        # The user apparently created an additional match pattern for his own purpose
        addExtraMatch($var_name, $var_content);
      }
    }
  }
  # Close the file
  close(in_file);
  
  # Check if all the required parameters are there
  foreach $var_name (keys(%variables)) {
    if ($variables{$var_name} & 4) { #Required
      if ($variables{$var_name} & 2) { #Array
        if ($variables{$var_name} & 8) { #Belongs to a match
          eval "\@test = \@$var_name;";
          for ($i = 0; $i < $num_match_catagories; $i++) {
            if (not defined($test[$i])) {
              #~ print "@test"."\n";
              &raise("\"$var_name\" missing in match catagory ".($i + 1)."...");
            }
          }
        } else { # Is a multiline variable
          eval "\@test = \@$var_name;";
          if (not defined(@test)) {
            &raise("\"$var_name\" is not defined...");
          }
        }
      } else { # Is a scalar
        eval "\$test = \$$var_name;";
        if (not defined($test)) {
          &raise("\"$var_name\" is not defined...");
        }
      }
    }
  }
  # Set unset vars to their default value
  foreach $var_name (keys(%defaults)) {
    eval "\$test = \$$var_name;";
    if (not defined($test)) {
      eval "\$$var_name = \$defaults{$var_name};";
    }
  }
  # TODO: check for defined _prefixes and _postfixes without defined matches and stuff
}

sub printScript {
  # Open the script to write
  if (!defined($out_file_name)) {
    $out_file_name = $script_name;
  }
  if ($out_file_name eq $in_file_name) {
    &raise("Input and output names are the same. That doesn't look really smart to me...");
  }
  open(out_file, ">$out_file_name");
  print "Writing to \"$out_file_name\"\n";
  
  # Print out the credits and comments
  print out_file "#!/usr/bin/env perl\n";
  &createComment();
  &createComment("$script_name by $script_author");
  &createComment("Generated by script4rss (http://script4rss.sf.net)");
  &createComment();
  &createComment("This script converts $feed_uri to an RSS feed");
  if (@comment) {
    &createComment();
    &createComment("==============================================================================");
    foreach (@comment) {
      &createComment($_);
    }
    &createComment("==============================================================================");
  }
  &createComment();
  @date = localtime();
  &createComment("Copyright(c) ".(@date[5] + 1900)." @script_author");
  foreach (@license) {
    &createComment($_);
  }
  skipLines(2);

  # If HTML should be fixed before it is matched, create the sub to do so
  if ($fix_html) {
    print out_file $sub_fixHTML;
  }

  # Print out the header defined in a string
  print out_file $header;
  
  # Print out the various parts of the feed based on the information available
  if (defined($feed_title)) {&createPrintCommand('<title>'.$feed_title.'</title>', "", 0, 2)}
  if (defined($feed_uri)) {&createPrintCommand('<link>'.$feed_uri.'</link>', "", 0, 2)}
  if (defined($feed_description)) {&createPrintCommand('<description>'.$feed_description.'</description>', "", 0, 2)}
  if (defined($feed_interval)) {&createPrintCommand('<ttl>'.$feed_interval.'</ttl>', "", 0, 2)
  }
  if (defined($feed_image_uri))   {
    &createPrintCommand('<image>', "", 0, 2);
    &createPrintCommand('<url>'.$feed_image_uri.'</url>', "", 0, 3);
    &createPrintCommand('</image>', "", 0, 2)
  }
  &skipLines(1);  
  
  # Print out the main loop, which shifts lines and searches for matches in them
  &createComment('Keep reading lines until they are all used');
  &createCommand('while (scalar(@lines) > 0) {');
  $fix_html ? &createCommand('$line = &fixHTML(shift(@lines));', 1, 1) : &createCommand('$line = shift(@lines);', 1, 1);
  
  foreach $match_name (keys(%extra_matches)) {
    &constructExtraMatchBlock($match_name);
  }
  
  # Create a part for each catagory
  for ($html_cat_num = 0; $html_cat_num < $num_match_catagories; $html_cat_num++) {
    $command = (($html_cat_num > 0 ) ? 'els': '').'if ($line =~ m'.$search[$html_cat_num].') {';
    &createComment('Find the stories', 1);
    &createCommand($command, 1);
    
    # Create the part which searches for the next line, if needed
    # TODO: include a fixed number skipped lines
    if (defined($match_start[$html_cat_num])) {
      &createComment("Skip to the first occurence of $match_start[$html_cat_num]", 2);
      &createCommand("while (\$line !~ m$match_start[$catagorie_num]) {", 2);
      if ($fix_html) {$command = '$line = &fixHTML(shift(@lines));'} else {$command = '$line = shift(@lines);'}
      &createCommand($command, 3);
      &createCommand("}", 2);
    }
    
    # Create the part which concats the next line untile the closing tag, if needed
    if (defined($match_end[$html_cat_num])) {
      &createComment("Create a line up until $match_end[$html_cat_num] is found", 2);
      &createCommand("while (\$line !~ m$match_end[$html_cat_num]) {", 2);
      &createCommand('chomp($line);', 3);
      if ($fix_html) {$command = '$line .= &fixHTML(shift(@lines));'} else {$command = '$line .= shift(@lines);'}
      &createCommand($command, 3);
      &createCommand('}', 2);
    }
    
    # Create the main munching code...oh boy oh boy...
    &createComment('Find the different parts', 2);
    &createCommand('@match = (split'.$match[$html_cat_num].', $line);', 2);
    &createComment('Print out the item', 2);
    &createCommand('if (scalar(@match) > 1) {', 2)
    &createPrintCommand('<item>', "", 3, 2);
    &createFeedEntry('title');
    &createFeedEntry('link');
    &createFeedEntry('description');
    &createPrintCommand('<\/item>', "", 3, 2);
    &createCommand('}', 2);
   
    &createCommand('}', 1);
  }
  
  &createCommand('}', 0, 1);
  print out_file $footer;
  
  close(out_file);
  print "Done\n";
}

sub defineStrings {
  $header = 'sub cleanup {
  # Convert HTML entries into valid HTML
  #$_[0] =~ s/&(?!\S*;)/&amp;/g;
  $_[0] =~ s/&/&amp;/g;
  $_[0] =~ s/</&lt;/g;
  $_[0] =~ s/>/&gt;/g;
  $_[0];
}

# Read lines from stdin
@lines = <>;

# Print the header
print "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
print "<!DOCTYPE rss PUBLIC \"-//Netscape Communications//DTD RSS 0.91//EN\"\n";
print "\"http://my.netscape.com/publish/formats/rss-0.91.dtd\">\n";
print "<rss version=\"0.91\">\n";
print "  <channel>\n";
';
  $footer = '# Print the footer
print "  <\/channel>\n";
print "<\/rss>\n";
';

  $sub_fixHTML = 'sub fixHTML {
  # Close open tags before new ones are opened
  # Rewrite everything within < and > with a fixed number of dashes
  local($clean_string, $line, $in_tag);
  
  $in_tag = 0;
  
  $clean_string = "";
  split/([<>])/, $_[0];
  while (scalar(@_)) {
    $line = shift(@_);
    if ($line =~ /^<$/) {
      if ($in_tag) {
        $clean_string .= ">";
      }
      $in_tag = 1;
      $clean_string = $clean_string.$line;
      $line = shift(@_);
      $line =~ s/\s*=\s*/=/g;
      $line =~ s/\s+/ /g;
    } elsif ($line =~ /^>$/) {
      $in_tag = 0;
    }
    $clean_string = $clean_string.$line;
  };
  $clean_string;
}

';
}