#!/usr/bin/env perl # # script4rss v. 0.4b by Pieter Edelman # This is a perl script which attempts to generate out a perl script (no this # isn't a typo) which converts a (single) HTML page to an RSS feed. # WARNING: This script is usable but not ready for prime time. # # Copyright (c) 2004 Pieter Edelman # Released under the terms of the GNU General Public License (GPL) Version 2. # See http://www.gnu.org/ for details. use Getopt::Std; # The number of catagories which needs to be extracted (normally this will be 1) $num_match_catagories = 0; # The required and optional variables in bitmask # 1 is always on, 0/2 means is scalar/array, 0/4 means optional/required, # 8 means belonging to a match # 16 means a string, 32 means a regex, 64 means a boolean, 128 means a time string $variables{"script_name"} = 0b00010101; $variables{"script_author"} = 0b00010101; $variables{"license"} = 0b00010111; $variables{"comment"} = 0b00010011; $variables{"feed_title"} = 0b00010101; $variables{"feed_uri"} = 0b00010101; $variables{"feed_description"} = 0b00010011; $variables{"feed_image_uri"} = 0b00010001; $variables{"feed_interval"} = 0b10000001; $variables{"fix_html"} = 0b01000001; $variables{"read_line_multiple"} = 0b01000001; $variables{"search"} = 0b00101111; $variables{"match"} = 0b00101111; $variables{"match_start"} = 0b00101011; $variables{"match_end"} = 0b00101011; $variables{"title"} = 0b00011111; $variables{"title_prefix"} = 0b00011011; $variables{"title_postfix"} = 0b00011011; $variables{"link"} = 0b00011111; $variables{"link_prefix"} = 0b00011011; $variables{"link_postfix"} = 0b00011011; $variables{"description"} = 0b00011011; $variables{"description_prefix"} = 0b00011011; $variables{"description_postfix"} = 0b00011011; $defaults{"fix_html"} = 1; my %extra_matches; # Call the main parts &printBlurb; &getInputFile; &defineStrings; &processDescriptionFile; &printScript; sub printBlurb { # Prints the welcome message to the user print "\n** This is script4rss, version 0.4b **\n"; print "** http://script4rss.sf.net/ **\n"; print "If you create a useful script, please consider donating it at\n"; print "http://home.kcore.de/~kiza/software/snownews/snowscripts/\n\n"; } sub printUsage { # Prints out how the program should be used print "Usage: script4rss.pl [-o outfile] description_file[.s4r]\n"; print "See doc/doc.html for more information\n"; exit; } sub getInputFile { # Parses the command line for an input file name and optionally the output file name # The file name is given on the command line if (!getopts('o:h')) { &printUsage(); } else { if (defined($opt_h)) { # If -h switch was given &printUsage(); } if (defined($opt_o)) { # If an output file was specified $out_file_name = $opt_o; } if (!$ARGV[0]) { # If no file name was given &printUsage(); } else { $in_file_name = $ARGV[0]; # Open the file, try the extension .s4r if it can't be found if (!(-e $in_file_name)) { if (-e $in_file_name.'.s4r') { $in_file_name .= '.s4r'; } else { &raise("\"$in_file_name\" or \"$in_file_name.s4r\" can't be opened..."); } } print "Using \"$in_file_name\" as input file\n"; open(in_file, $in_file_name); } } } sub createComment { # Creates a properly formatted and indented comment # Arguments: (comment, indentation level, lines to skip) local $comment = ""; $comment .= &indent($_[1]); $comment .= '# '.$_[0]."\n"; print out_file $comment; skipLines($_[2]); } sub createCommand { # Creates a properly formatted and indented command # Arguments (command, indentation level, lines to skip) local $command = ""; $command .= &indent($_[1]); $command .= $_[0]."\n"; print out_file $command; skipLines($_[2]); } sub createPrintCommand { # Creates a properly formatted and indented command # Arguments: (command,encoding ('|"), indentation level, output indentation level, lines to skip) local $command = ""; $command .= &indent($_[2])."print "; if ($_[1] =~ /\'/) { $command .= "'".&indent($_[3]).$_[0].'\'."\n";'; } else { $command .= '"'.&indent($_[3]).$_[0].'\n";'; } createCommand($command, $_[4]); } sub indent { # Adds spaces for indentation according to the level as argument local ($spaces, $i) = ("", 0, 0); for ($i = 0; $i < ($_[0] * 2); $i++) {$spaces .= " "} $spaces; } sub skipLines { # Skips a number of lines if ($_[0] > 0) { for ($i = 0; $i < $_[0]; $i++) { print out_file "\n"; } } } sub constructMatchedPattern { # Construct a string from a var name, and its pre-and postfix # Arguments are the var name as a string and the index if applicable local ($type, $type_prefix, $type_postfix, $content); $type_prefix = ""; $type_postfix = ""; if (defined($_[1])) { # The match is part of a non-user-defined match eval "\$type = \$$_[0]\[$_[1]];"; eval "if (defined(\$$_[0]_prefix[$_[1]])) {\$type_prefix = \$$_[0]_prefix[$_[1]];}"; eval "if (defined(\$$_[0]_postfix[$_[1]])) {\$type_postfix = \$$_[0]_postfix[$_[1]];}"; } else { eval "\$type = \$$_[0];"; eval "if (defined(\$$_[0]_prefix)) {\$type_prefix = \$$_[0]_prefix;}"; eval "if (defined(\$$_[0]_postfix)) {\$type_postfix = \$$_[0]_postfix;}"; } if ($type) { $content = $type_prefix; if ($type =~ /^\d+$/) { # If there's only a number $content .= "'."; ($content .= $type) =~ s/(\d+)/&cleanup(\$match\[\1\])/g; $content .= ".'"; } else { # A string should be processed and backreferences converted to matches $orig = $type; @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig); if (scalar(@parts) > 1) { $stripped = ""; while (length($parts[3]) != 0) { if ((length($parts[2])/2) != int(length($parts[2])/2)) { $stripped .= $parts[1]; for ($i = 1; $i < (length($parts[2])); $i++) { $stripped .= '\\'; } $stripped .= '\'.&cleanup($match['.$parts[3].']).\''; $orig = $parts[4]; } else { $stripped .= $parts[1].$parts[2].$parts[3]; $orig = $parts[4]; } @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig); } } else { $stripped = $orig; } $orig = $stripped; @parts = (split /(.*?)(\\+)(\w+)(.*)/, $orig); if (scalar(@parts) > 1) { $stripped = ""; while (length($parts[3]) != 0) { print "@parts\n"; if ((length($parts[2])/2) != int(length($parts[2])/2)) { $stripped .= $parts[1]; for ($i = 1; $i < (length($parts[2])); $i++) { $stripped .= '\\'; } $stripped .= '\'.$'.$parts[3].'_name.\''; $orig = $parts[4]; } else { $stripped .= $parts[1].$parts[2].$parts[3]; $orig = $parts[4]; } @parts = (split /(.*?)(\\+)(\d+)(.*)/, $orig); } } else { $stripped = $orig; } $content = $stripped; } $content .= $type_postfix; } # Filter out empty strings $content =~ s/''\.//g; $content =~ s/\.''//g; $content; } sub createFeedEntry { # Create a properly formatted feed entry, the name of which is gaven as an argument local ($type); eval "\$type = \$$_[0]\[$html_cat_num];"; if ($type) { $command = "print '".&indent(3)."<$_[0]>".&constructMatchedPattern($_[0], $html_cat_num)."<\/$_[0]>'.\"\\n\";"; &createCommand($command, 3); } } sub constructExtraMatchBlock { # Construct an extra, user defined match local ($match_name, $test); $match_name = $_[0]; &createComment("Find $match_name", 1); eval '$test = $'.$match_name.'_search'; &createCommand('if ($line =~ m'.$test.') {', 1); eval "\$test = \$$match_name_start;"; if (defined($test)) { &createComment("Skip to the first occurence of $test", 2); &createCommand("while (\$line !~ m$test) {", 2); if ($fix_html) {$command = '$line = &fixHTML(shift(@lines));'} else {$command = '$line = shift(@lines);'} &createCommand($command, 3); &createCommand("}", 2); } # Create the part which concats the next line untile the closing tag, if needed eval '$test = $'.$match_name.'_end;'; if (defined($test)) { &createComment("Create a line up until $test", 2); &createCommand("while (\$line !~ m$test) {", 2); &createCommand('chomp($line);', 3); if ($fix_html) {$command = '$line .= &fixHTML(shift(@lines));'} else {$command = '$line .= shift(@lines);'} &createCommand($command, 3); &createCommand('}', 2); } eval '$test = $'.$match_name.'_match;'; &createComment('Find the $match_name name', 2); &createCommand('@match = (split'.$test.', $line);', 2); &createCommand('$'.$match_name."_name = '".&constructMatchedPattern($match_name).'\';', 2); &createCommand('}', 1, 1); } sub addExtraMatch { local ($var_name, $var_value, $var_identifier); ($var_name, $var_value) = @_; ($var_identifier = $var_name) =~ s/(.*)(_match|_search|_start|_end|_prefix|_postfix)/\1/; $extra_matches{$var_identifier} = 1; $variables{$var_identifier} = 0b00010101; $variables{$var_identifier.'_search'} = 0b00100101; $variables{$var_identifier.'_match'} = 0b00100101; $variables{$var_identifier.'_start'} = 0b00100001; $variables{$var_identifier.'_end'} = 0b00100001; $variables{$var_identifier.'_prefix'} = 0b00010001; $variables{$var_identifier.'_postfix'} = 0b00010001; &setScalar($var_name, $var_value); } sub raise { # Prints an error to the console and aborts print STDERR "$_[0]\nAborted!\n"; exit; } sub checkVariable { # Check if the variable is not defined twice and formatted properly local ($var_name, $var_value, $already_defined); ($var_name, $var_value) = ($_[0], $_[1]); if ($variables{$var_name}) { # Check for multiple definitions in scalars if (not($variables{$var_name} & 2)) { eval('$already_defined = defined($'.$var_name.');'); if ($already_defined) {&raise("\"$var_name\" is defined twice...")}; } if ($variables{$var_name} & 16) { # Convert double backslashes in strings to single ones $var_value =~ s/[\\][\\]/\\/g; } elsif ($variables{$var_name} & 64) { # Should be a boolean if ($var_value =~ /^(false|no|0)$/i) { $var_value = 0; } elsif ($var_value =~ /^(true|yes|[1-9]*)$/i) { $var_value = 1; } else { &raise("\"$var_name\" should be a boolean..."); } } elsif ($variables{$var_name} & 32) { # Should be a regex if ($var_value !~ /\/.*\/[ig]*/) { raise "$var_name should be a regular expression"; } } elsif ($variables{$var_name} & 128) { # Should be a time string if ($var_value !~ /(\d+w)?(\d+d)?(\d+h)?(\d+m)?/) { raise "$var_name should be a time formatted as XXwXXdXXhXXm"; } else { local ($interval_str); $interval_str = $var_value; $interval_str =~ s/(\d*\D)(.)/\1+\2/g; $interval_str =~ s/w/*10080/; $interval_str =~ s/d/*1440/; $interval_str =~ s/h/*60/; $interval_str =~ s/m//; eval '$var_value = '.$interval_str.';'; } } } $var_value; } sub setScalar { # Sets a scalar from the input file to a given value, after some checks local ($var_value); $var_value = &checkVariable($_[0], $_[1]); if (($variables{$var_name} & 16) || ($variables{$var_name} & 32)) { # If the scalar is an array or a regex eval "\$$_[0] = \$var_value"; } else { eval "\$$_[0] = $var_value"; } } sub setArray { # Adds or sets a value to an array, after some proper checking local ($var_name, $var_value) = @_; $var_value = &checkVariable($var_name, $var_value); if ($variables{$var_name} & 8) { # Belongs to a match eval "\$$var_name\[".($num_match_catagories-1)."\] = \$var_value"; } else { eval "push(\@$var_name, '$var_value')"; } } sub processDescriptionFile { local ($var_name, $var_content); # Process the contents of the output file foreach () { # Strip out comments, leave escaped hashes alone $orig = $_; @parts = (split /(.*?)(\\*)(#+)(.*)/, $orig); $stripped = ""; # While a hash mark is found while (length($parts[3]) == 1) { if ((length($parts[2])/2) != int(length($parts[2])/2)) { # If its prepended by an odd number of backslashes, add it and try to find hashes in de next part $stripped .= $parts[1]; for ($i = 1; $i < (length($parts[2]) - 1); $i++) { $stripped .= '\\'; } $stripped .= $parts[3]; $orig = $parts[4]; @parts = (split /(.*?)(\\*)(#+)(.*)/, $orig); } else { # If it's not escaped, add the first part and throw away the rest $stripped .= $parts[1].$parts[2]; $orig = ""; last; } } $stripped .= $orig; $_ = $stripped; # Find catagories if ($_ =~ /^\s*\[.*\]\s*$/) { $num_match_catagories++; } # Process lines that are in the form name : value elsif ($_ =~ /^.*[^\s]+.*:.*[^\s]+.*$/) { # Extract the variable name and content split(/^\s*(.*?)\s*:\s*(.*)\s*$/, $_); $var_name = $_[1]; $var_content = $_[2]; if ($variables{$var_name}) { if ($variables{$var_name} & 2) { # Array setArray($var_name, $var_content); } else { setScalar($var_name, $var_content); } } else { # The user apparently created an additional match pattern for his own purpose addExtraMatch($var_name, $var_content); } } } # Close the file close(in_file); # Check if all the required parameters are there foreach $var_name (keys(%variables)) { if ($variables{$var_name} & 4) { #Required if ($variables{$var_name} & 2) { #Array if ($variables{$var_name} & 8) { #Belongs to a match eval "\@test = \@$var_name;"; for ($i = 0; $i < $num_match_catagories; $i++) { if (not defined($test[$i])) { #~ print "@test"."\n"; &raise("\"$var_name\" missing in match catagory ".($i + 1)."..."); } } } else { # Is a multiline variable eval "\@test = \@$var_name;"; if (not defined(@test)) { &raise("\"$var_name\" is not defined..."); } } } else { # Is a scalar eval "\$test = \$$var_name;"; if (not defined($test)) { &raise("\"$var_name\" is not defined..."); } } } } # Set unset vars to their default value foreach $var_name (keys(%defaults)) { eval "\$test = \$$var_name;"; if (not defined($test)) { eval "\$$var_name = \$defaults{$var_name};"; } } # TODO: check for defined _prefixes and _postfixes without defined matches and stuff } sub printScript { # Open the script to write if (!defined($out_file_name)) { $out_file_name = $script_name; } if ($out_file_name eq $in_file_name) { &raise("Input and output names are the same. That doesn't look really smart to me..."); } open(out_file, ">$out_file_name"); print "Writing to \"$out_file_name\"\n"; # Print out the credits and comments print out_file "#!/usr/bin/env perl\n"; &createComment(); &createComment("$script_name by $script_author"); &createComment("Generated by script4rss (http://script4rss.sf.net)"); &createComment(); &createComment("This script converts $feed_uri to an RSS feed"); if (@comment) { &createComment(); &createComment("=============================================================================="); foreach (@comment) { &createComment($_); } &createComment("=============================================================================="); } &createComment(); @date = localtime(); &createComment("Copyright(c) ".(@date[5] + 1900)." @script_author"); foreach (@license) { &createComment($_); } skipLines(2); # If HTML should be fixed before it is matched, create the sub to do so if ($fix_html) { print out_file $sub_fixHTML; } # Print out the header defined in a string print out_file $header; # Print out the various parts of the feed based on the information available if (defined($feed_title)) {&createPrintCommand(''.$feed_title.'', "", 0, 2)} if (defined($feed_uri)) {&createPrintCommand(''.$feed_uri.'', "", 0, 2)} if (defined($feed_description)) {&createPrintCommand(''.$feed_description.'', "", 0, 2)} if (defined($feed_interval)) {&createPrintCommand(''.$feed_interval.'', "", 0, 2) } if (defined($feed_image_uri)) { &createPrintCommand('', "", 0, 2); &createPrintCommand(''.$feed_image_uri.'', "", 0, 3); &createPrintCommand('', "", 0, 2) } &skipLines(1); # Print out the main loop, which shifts lines and searches for matches in them &createComment('Keep reading lines until they are all used'); &createCommand('while (scalar(@lines) > 0) {'); $fix_html ? &createCommand('$line = &fixHTML(shift(@lines));', 1, 1) : &createCommand('$line = shift(@lines);', 1, 1); foreach $match_name (keys(%extra_matches)) { &constructExtraMatchBlock($match_name); } # Create a part for each catagory for ($html_cat_num = 0; $html_cat_num < $num_match_catagories; $html_cat_num++) { $command = (($html_cat_num > 0 ) ? 'els': '').'if ($line =~ m'.$search[$html_cat_num].') {'; &createComment('Find the stories', 1); &createCommand($command, 1); # Create the part which searches for the next line, if needed # TODO: include a fixed number skipped lines if (defined($match_start[$html_cat_num])) { &createComment("Skip to the first occurence of $match_start[$html_cat_num]", 2); &createCommand("while (\$line !~ m$match_start[$catagorie_num]) {", 2); if ($fix_html) {$command = '$line = &fixHTML(shift(@lines));'} else {$command = '$line = shift(@lines);'} &createCommand($command, 3); &createCommand("}", 2); } # Create the part which concats the next line untile the closing tag, if needed if (defined($match_end[$html_cat_num])) { &createComment("Create a line up until $match_end[$html_cat_num] is found", 2); &createCommand("while (\$line !~ m$match_end[$html_cat_num]) {", 2); &createCommand('chomp($line);', 3); if ($fix_html) {$command = '$line .= &fixHTML(shift(@lines));'} else {$command = '$line .= shift(@lines);'} &createCommand($command, 3); &createCommand('}', 2); } # Create the main munching code...oh boy oh boy... &createComment('Find the different parts', 2); &createCommand('@match = (split'.$match[$html_cat_num].', $line);', 2); &createComment('Print out the item', 2); &createCommand('if (scalar(@match) > 1) {', 2) &createPrintCommand('', "", 3, 2); &createFeedEntry('title'); &createFeedEntry('link'); &createFeedEntry('description'); &createPrintCommand('<\/item>', "", 3, 2); &createCommand('}', 2); &createCommand('}', 1); } &createCommand('}', 0, 1); print out_file $footer; close(out_file); print "Done\n"; } sub defineStrings { $header = 'sub cleanup { # Convert HTML entries into valid HTML #$_[0] =~ s/&(?!\S*;)/&/g; $_[0] =~ s/&/&/g; $_[0] =~ s//>/g; $_[0]; } # Read lines from stdin @lines = <>; # Print the header print "\n"; print "\n"; print "\n"; print " \n"; '; $footer = '# Print the footer print " <\/channel>\n"; print "<\/rss>\n"; '; $sub_fixHTML = 'sub fixHTML { # Close open tags before new ones are opened # Rewrite everything within < and > with a fixed number of dashes local($clean_string, $line, $in_tag); $in_tag = 0; $clean_string = ""; split/([<>])/, $_[0]; while (scalar(@_)) { $line = shift(@_); if ($line =~ /^<$/) { if ($in_tag) { $clean_string .= ">"; } $in_tag = 1; $clean_string = $clean_string.$line; $line = shift(@_); $line =~ s/\s*=\s*/=/g; $line =~ s/\s+/ /g; } elsif ($line =~ /^>$/) { $in_tag = 0; } $clean_string = $clean_string.$line; }; $clean_string; } '; }