#!/usr/bin/perl
# Used by grok, not a standalone tool.
use re 'eval';
use Memoize;
memoize('strip_code');
memoize('delimiter_score');
$| = 1;
our $opts;
my $config = readconfig($opts->{"f"});
unless (defined($config)) {
debug(0, "Syntax error in config?");
exit(1);
}
setup($config);
# Remove things that are too general
my @skippable = qw(
DATA GREEDYDATA USER USERNAME WORD NOTSPACE PID PROG YEAR
URIHOST URIPARAM URIPATH
);
my $skip_re = "^(?:" . join("|",@skippable) . ')$';
our %MATCH;
my @sorted_patterns = grep {!/$skip_re/} sort { compare_by_complexity($a,$b); } keys(%MATCH);
#print join("\n", @sorted_patterns);
#exit(1);
#for my $i (@sorted_patterns) { print noncapture_length(pattern2regex("%$i%")), " ", $i, "\n"; }; exit;
#for my $i (@sorted_patterns) { print delimiter_score(pattern2regex("%$i%")), " ", $i, "\n"; }; exit;
while (<STDIN>) {
chomp($_);
analyze($_);
}
finish_and_exit();
sub strip_code {
$_[0] =~ s/\(\?\?\{ handle_capture[^}]+\}\)//g;
return $_[0];
}
sub delimiter_score {
my $x = strip_code(shift);
my $score = 0;
my @m;
# spaces are valued
#print "re: $x\n";
@m = $x =~ m/(?:\s|\\s)/g;
$score += 1.5 * scalar(@m);
#print " space: " . scalar(@m) . "\n";
# dots are good too
@m = $x =~ m/(?:[.])/g;
$score += 1 * scalar(@m);
#print " dots: " . scalar(@m) . "\n";
# some other punctuation
@m = $x =~ m/(?:['":_,=+-])/g;
$score += .6 * scalar(@m);
#print " punct: " . scalar(@m) . "\n";
return $score;
}
sub noncapture_length {
my $x = strip_code(shift);
return length($x);
}
# Comparator for sorting regexps by some kind of complexity
# - score first is checked by number of delimiters checked
# - if equal, compare the lengths of the regexps(*)
# (*) length calculated on regexps with any (??{ }) removed..
sub compare_by_complexity {
my ($a,$b) = @_;
my $a_re = pattern2regex($MATCH{$a});
my $b_re = pattern2regex($MATCH{$b});
my $a_delim = delimiter_score($a_re);
my $b_delim = delimiter_score($b_re);
my $a_len = noncapture_length($a_re);
my $b_len = noncapture_length($b_re);
#print "$a_re\n";
#print "-> $a_delim\n";
#print "$a_re\nvs\n$b_re\n\n";
return $b_delim <=> $a_delim or $b_len <=> $a_len
}
sub analyze {
my $line = shift;
#print "Checking '$line'\n";
my $count = 0;
# Counts of uses of each pattern name
my %patcounts;
# list of 2-tuple containing start/end ranges of already performed matches.
my @skip_ranges;
while (++$count <= 10) {
pos($line) = 0;
# Track how many matches were this round.
my $round_matches = 0;
foreach my $name (@sorted_patterns) {
last if (pos($line) == $#line);
#next if ($name =~ m/$skip_re/);
# Skip %FOO% and %FOO:BAR%
#print "str: " . substr($line, pos($line)) . "\n";
#print "skip? " . ($line =~ m/\G(%[A-z0-9:]+%)/) . "\n";
#while ($line =~ m/\G(%[A-z0-9:]+% *)/) {
#pos($line) = $+[1];
##print "Skipping past (endpos $+[1]): $&\n";
#}
# match %FOO~/[^A-z/% -
# Make sure this token is not just a plain word
my $regex = pattern2regex("%$name~/[^A-z0-9]/%");
#print " -> against $name\n";
#print " // $regex\n";
#print " ... " . substr($line, pos($line)) . "\n";
try:
if ($line =~ m/\G.*?($regex)/) {
my $match = $1;
my ($start, $end) = ($-[1], $+[1]);
for my $i (@skip_ranges) {
my ($rstart,$rend) = @$i;
if (($start > $rstart && $start < $rend)
|| ($end > $rstart && $end < $rend)) {
#print "Skipping to pos $rend: match was inside " . substr($line, $rstart, $rend - $rstart) . "\n";
pos($line) = $rend;
goto try;
}
}
$round_matches++;
my $subname = "";
$subname = ":" . ($patcounts{$name} + 1) if ($patcounts{$name});
my $patstr = "%$name$subname%";
#print " -> against $name\n";
#print " -> str " . substr($line, $start) . "\n";
substr($line, $start, $end - $start) = $patstr;
#print " -> $line\n";
pos($line) = $start + length($patstr);
# Shift everything left that's to the right of our match
for my $i (@skip_ranges) {
my ($rstart,$rend) = @$i;
if ($rstart > $end) {
$i->[0] = $rstart - ($end - $start) + length($patstr);
$i->[1] = $rend - ($end - $start) + length($patstr);
}
}
push(@skip_ranges, [$start, pos($line)]);
#print "Pos: " . pos($line) . "\n";
$patcounts{$name}++;
last;
}
}
last if ($round_matches == 0);
}
print "$line\n";
}
1;
syntax highlighted by Code2HTML, v. 0.9.1