#!/usr/bin/perl -I /usr/local/raqdevil/perl
# $Id: split_logs,v 1.5 2005/12/13 20:13:24 dodell Exp $
# Copyright 2000, 2001 Sun Microsystems, Inc., All rights reserved.
#
# Consolidated log distribution by virtual site.
#
# Usage: split_logs <web|ftp|mail|net> [YYYY/MM/DD/] < current-log
#
# Reads the current (combined-format) log on STDIN and, if present, the most
# recently rotated system log.  Every record that falls inside the selected
# day is
#   * fed to a server-wide Analog run that writes <type>.cache under
#     $Htgroup_dir/server/$Logdir/<date>/, and
#   * appended to the owning virtual site's $Logdir/<type>.log (or
#     <type>.daily when a date override is given).
# Afterwards Analog is run once per active site that has statistics enabled.

use strict;

# Global Variables
my $Analog_cmd  = '/usr/local/bin/analog';
my $Htgroup_dir = '/home/.sites';
my $Logdir      = 'logs';    # site logs directory
my %domains;                 # hash of vsite fqdn's
my $curbuf = 0;              # current number of lines buffered
my $maxbuf = 25000;          # maximum number of lines to buffer
my %vsite_dir;               # hash mapping all site fqdns to base directory
my %vsite_fqdn;              # hash mapping all site groups to fqdn
my %run_stats;               # hash mapping all site fqdns to Analog execution boolean
my %cache_log;               # hash mapping active sites to log data string
my %active_sites;            # hash mapping active sites to 1
my %pids;                    # hash mapping all fqdn to per-site file owner pids
my %gids;                    # hash mapping all fqdn to gids
my $custom = 0;              # global flag indicating a date override is used

# Hardcode locale to be english, otherwise 'date' spits out japanese
# strings and that messes up our parsing.
$ENV{'LC_ALL'} = 'en_US';

my $DEBUG = 0;
$DEBUG && warn "\n".`date`."$0\n".join(':', @ARGV)."\n";

use IO::File;
use Base::HomeDir qw(homedir_get_group_dir);
use POSIX qw(strftime mktime);
use CCE;
use File::Path;

# set a sane umask
umask(002);

# make sure $Htgroup_dir exists before going on
if (! -d $Htgroup_dir) {
    mkpath($Htgroup_dir, 0, 0755);
}

my $cce = CCE->new;
$cce->connectuds();

# Fetch "server" log pid and gid
my $server_pid = (getpwnam('admin'))[2];
my $server_gid = (getgrnam('users'))[2];
$DEBUG && warn "Server log pid, gid: $server_pid, $server_gid\n";

# Build a list of virtual site fully qualified domain names,
# excluding fqdn aliases
my (@vsite_oids) = $cce->find('Vsite');
foreach my $oid (@vsite_oids) {
    my ($ok, $vsite) = $cce->get($oid);
    $domains{$vsite->{hostname}} = 1 if ($vsite->{hostname});
    $domains{$vsite->{domain}}   = 1 if ($vsite->{domain});
    $domains{$vsite->{fqdn}}     = 1 if ($vsite->{fqdn});

    $vsite_dir{$vsite->{fqdn}}  = $vsite->{basedir};
    $vsite_fqdn{$vsite->{name}} = $vsite->{fqdn};

    my ($aok, $sitestats) = $cce->get($oid, 'SiteStats');
    $run_stats{$vsite->{fqdn}} = $sitestats->{enabled};
    $pids{$vsite->{fqdn}} = (getpwnam($sitestats->{owner}))[2];
    $gids{$vsite->{fqdn}} = (getgrnam($vsite->{name}))[2];

    $DEBUG && warn 'Found site: '.$vsite->{fqdn}.', group: '.
        $vsite->{hostname}, ', basedir: '.$vsite->{basedir}.
        ', sitestats: '.$sitestats->{enabled},
        ', pid/gid: '.$pids{$vsite->{fqdn}}.'/'.
        $gids{$vsite->{fqdn}}."\n";
}

# used to convert month strings from logs to values to pass to mktime
my %month_convert = ('Jan' => 0, 'Feb' => 1, 'Mar' => 2, 'Apr' => 3,
                     'May' => 4, 'Jun' => 5, 'Jul' => 6, 'Aug' => 7,
                     'Sep' => 8, 'Oct' => 9, 'Nov' => 10, 'Dec' => 11);

# default is to run this for yesterday
my (undef, undef, undef, $day, $month, $year) = localtime(time - 86400);

# figure out the stop and start times that should be processed from the log
# by default we start processing at midnight yesterday
my $first_date = mktime(0, 0, 0, $day, $month, $year);

# by default we stop processing at midnight today (12:00 am)
my $stop_time = $first_date + (24 * 60 * 60);

$DEBUG && print STDERR "current time - stop time is ",
    (time() - $stop_time), "\n";

# adjust time to more human understandable
$year  += 1900;    # localtime returns ( year - 1900 )
$month += 1;       # localtime returns ( month - 1 )

my $date_dir = $year . "/" . $month . "/" . $day . "/";
if ($ARGV[1]) {
    $custom   = 1;
    $date_dir = $ARGV[1];

    # The date ends up in file paths and in shell command lines, so accept
    # nothing but digits and slashes.
    if ($date_dir =~ m{\A(\d+)/(\d+)/(\d+)/?\z}) {
        ($year, $month, $day) = ($1, $2, $3);

        # every user of $date_dir appends a file name directly to it
        $date_dir .= '/' unless ($date_dir =~ m{/\z});

        #
        # if a custom date was passed, start time is 12:00 am of the
        # custom date, and stop time is 12:00 am of the day immediately
        # after the custom date
        #
        $first_date = mktime(0, 0, 0, $day, $month - 1, $year - 1900);
        $stop_time  = $first_date + (24 * 60 * 60);
    }
    else {
        die "Invalid date: $date_dir. Must be YYYY/MM/DD.\n";
    }
}
$DEBUG && warn "date_dir: $date_dir ";

my $log_type = $ARGV[0];
$log_type = '' unless (defined $log_type);
# Anchored: the type is used to build file names and shell commands.
$log_type =~ /\A(?:web|ftp|mail|net)\z/ ||
    die "Invalid log type: $log_type. Must be web, ftp, mail or net.\n";

## Begin SPLIT

# Create the current server-level cache/stats directory
setup_dirs($Htgroup_dir.'/server', $Logdir, $date_dir, 00701,
           $server_pid, $server_gid);
my $output_dir = $Htgroup_dir.'/server/'.$Logdir.'/'.$date_dir;

# clear the existing cache file
$DEBUG && warn 'Clearing '.$output_dir.'/'.$log_type.".cache\n";
unlink($output_dir.'/'.$log_type.'.cache');

my $analog_args = " -C'DEFAULTLOGFORMAT COMBINED' " .
    "-C'CACHEOUTFILE $output_dir/$log_type.cache'";

# Open up main report
$DEBUG && warn "Attempting to invoke $Analog_cmd $analog_args -\n";
open(my $main_report, "| $Analog_cmd $analog_args - > /dev/null")
    || die "Could not execute $Analog_cmd for main site: $!";

my $regexp = get_regexp($log_type);
$DEBUG && warn "Using regex: $regexp\n";
my $double_regexp = qq($regexp $regexp);

my %daily_flush;    # cache unlinking of per-site daily log files

# this specifies how to process system log files for this log type
my %log_process = (
    'mail' => '/usr/local/sbin/maillog2commonlog.pl sendmail |',
    'ftp'  => '/usr/local/sbin/ftplog2commonlog |',
    'net'  => '',
    'web'  => '');

my %old_logs = ();
my $sample   = '';
if (!$custom) {
    #
    # filenames of logs from two days ago
    # being run by logrotate
    # logrotate renames the old log files before running even the
    # prerotate command, so yesterday's log is in the .2.gz file
    #
    %old_logs = ('mail' => '/var/log/maillog.2.gz',
                 'ftp'  => '/var/log/xferlog.2.gz',
                 'net'  => '/var/log/ipacct.2.gz',
                 'web'  => '/var/log/httpd/access.2.gz');

    #
    # no date specified, so need to sample the current log file
    # to see which day we should look for in the old log file.  This is
    # done so stats information doesn't get lost if logrotate doesn't get
    # run some days if the box is down.
    #
    $sample = <STDIN>;
    $sample = '' unless (defined $sample);    # empty input

    # if this is a logrotate run, check the sample for the date to find
    if ($sample ne '') {
        # the start time needs to be bumped to 12:00 am of this day;
        # keep the default when the sample carries no usable timestamp
        my $sample_day = _log_day($sample);
        $first_date = $sample_day if (defined $sample_day);
    }
}
else {
    #
    # running outside of the normal logrotate run
    # use the logs that were rotated out this morning
    # they end in .1.gz
    #
    %old_logs = ('mail' => '/var/log/maillog.1.gz',
                 'ftp'  => '/var/log/xferlog.1.gz',
                 'net'  => '/var/log/ipacct.1.gz',
                 'web'  => '/var/log/httpd/access.1.gz');
}

if ($DEBUG) {
    print STDERR "start time: ", POSIX::ctime($first_date);
    print STDERR "stop time: ", POSIX::ctime($stop_time);
}

# check the old log file if it exists
if (-f $old_logs{$log_type}) {
    $DEBUG && print STDERR "processing old log $old_logs{$log_type}\n";

    # Deliberately a two-argument open: the string is built from the
    # constants above and its trailing '|' makes it a read pipe.
    my $pipe_cmd = "/bin/zcat " . $old_logs{$log_type} . " | " .
        $log_process{$log_type};
    open(my $old_fh, $pipe_cmd) or die "$!\n";
    while (my $line = <$old_fh>) {
        if (!process_line($line, $log_type, $main_report, $regexp,
                          $double_regexp, $custom)) {
            next;
        }

        if ($curbuf >= $maxbuf) {
            flush_buffers($log_type, $custom);
            $curbuf = 0;
        }
    }
    close($old_fh);
}

$DEBUG && print STDERR "about to process most recent log\n";

# if we took a sample above, process that here
my $i = 1;
if ($sample ne '') {
    ($DEBUG > 1) && print STDERR "process line ", $i++, "\n";
    process_line($sample, $log_type, $main_report, $regexp,
                 $double_regexp, $custom);
}

while (my $line = <STDIN>) {
    ($DEBUG > 1) && print STDERR "process line ", $i++, "\n";
    if (!process_line($line, $log_type, $main_report, $regexp,
                      $double_regexp, $custom)) {
        next;
    }

    ($DEBUG > 1) && print STDERR "$curbuf\t$maxbuf\n";

    # Avoid overusing RAM
    if ($curbuf >= $maxbuf) {
        flush_buffers($log_type, $custom);
        $curbuf = 0;
    }
}

# Write out whatever is still buffered after the last line
if ($curbuf) {
    flush_buffers($log_type, $custom);
}

close($main_report);
chown($server_pid, $server_gid, "$output_dir/$log_type.cache");
chmod(0600, "$output_dir/$log_type.cache");

# Invoke analog once for every site that received log data
foreach my $site (keys %active_sites) {
    $DEBUG && warn "Analog for site $site: ".$run_stats{$site}."\n";
    next unless ($run_stats{$site});

    # Set up the environment
    my $basedir  = $vsite_dir{$site};
    my $log_file = "$basedir/$Logdir/$log_type.log";
    $log_file = "$basedir/$Logdir/$log_type.daily" if ($custom);
    next unless (-e $log_file);

    $DEBUG && warn "Analog setup: $site using logfile: ".
        "$log_file\n";

    my $output_dir = "$basedir/$Logdir/$date_dir";
    setup_dirs($basedir, $Logdir, $date_dir, 02770,
               $pids{$site}, $gids{$site});
    next unless (-d $output_dir);

    my $cache_file = "$basedir/$Logdir/$date_dir$log_type.cache";
    unlink($cache_file);
    # fallback if unlink failed; list form keeps the path away from the shell
    system('rm', '-f', $cache_file) if (-e $cache_file);

    #
    # figure out which files to look in for data
    # this is easy, because we know logrotate runs once daily
    # so only need to look in at most two log files (current and
    # most recently rotated)
    #
    my @log_files = ();
    if ($custom) {
        # same day generation, this already parses out dates above
        push @log_files, $log_file;
    }
    else {
        push @log_files, "$log_file.1.gz", $log_file;
    }

    if ($DEBUG) {
        my @start = (localtime($first_date))[3, 4, 5];
        my @end   = (localtime($stop_time))[3, 4, 5];
        print STDERR "searching for lines beginning at midnight ",
            join('/', $start[1] + 1, $start[0], $start[2] + 1900),
            " and ending at midnight ",
            join('/', $end[1] + 1, $end[0], $end[2] + 1900), "\n";
    }

    $DEBUG && warn "Running Analog for site $site, $output_dir\n";

    # Forking open: undef means the fork failed, 0 means we are the child.
    # The two must be told apart, otherwise a failed fork would exec analog
    # in place of this script.
    my $analog_pid = open(my $analog, "|-");
    if (!defined $analog_pid) {
        warn "Could not fork $Analog_cmd for site $site: $!\n";
        next;
    }
    if (!$analog_pid) {
        exec("$Analog_cmd -C'DEFAULTLOGFORMAT COMBINED' ".
             "-C'CACHEOUTFILE $cache_file' ".
             "- >/dev/null 2>&1");
        warn "Could not execute $Analog_cmd for site $site: $!\n";
        POSIX::_exit(1);    # never let the child fall back into this loop
    }

    for my $file (@log_files) {
        if (! -f $file) {
            next;
        }

        my $log_fh;
        if ($file =~ /\.gz$/) {
            # safe pipe read from gzipped file
            my $zcat_pid = open($log_fh, "-|");
            if (!defined $zcat_pid) {
                warn "Could not fork /bin/zcat for $file: $!\n";
                next;
            }
            if (!$zcat_pid) {
                exec('/bin/zcat', $file);
                warn "Could not execute /bin/zcat for $file: $!\n";
                POSIX::_exit(1);
            }
        }
        else {
            # regular file
            open($log_fh, '<', $file) or die "$!\n";
        }

        while (my $line = <$log_fh>) {
            ($DEBUG > 2) && print STDERR $line;

            my $log_time = _log_day($line);
            if (!defined $log_time) {
                # no usable timestamp: never judge a line by the
                # previous line's date
                $DEBUG && print STDERR "Skipping undated record\n";
            }
            elsif (($first_date <= $log_time) && ($log_time < $stop_time)) {
                print {$analog} $line;
            }
            elsif ($DEBUG) {
                $line =~ /(\[[^\]]+\])/;
                print STDERR "Skipping out of range record: $1\n";
            }
        }
        close($log_fh);
    }
    close($analog);

    chown($pids{$site}, $gids{$site}, $cache_file);
    chmod(0660, $cache_file);
}

exit 0;
## End SPLIT

# get_regexp($type)
#
# Returns the pattern (as a string) whose first capture group identifies the
# virtual site a log line belongs to, for the given log type.
sub get_regexp {
    my $type = shift;

    # We need different regular expressions to get which site
    # it's for depending on the log type.

    # Web logs just begin with the full domain name of the site..
    return '^(\S+) (.*)$' if $type eq 'web';

    # Mail logs have the domain name as the referrer or the
    # browser in the log file..
    return qq("http://([^/]+)/") if $type eq 'mail';

    # Ftp we see the full file name of the file it grabbed and work
    # it out from the site name there..
    return '/.sites/\d+/([^/]+)/' if $type eq 'ftp';

    # there are no vsites to match for net... server wide only
    return "/Dontmatchanythingnovsites/" if $type eq 'net';

    # unknown type; the caller validates the type before asking
    return '';
}

# setup_dirs($basedir, $log, $date_dir, $mode, $pid, $gid)
#
# Creates $basedir, $basedir/$log and the three date levels below it
# ($date_dir is "Y/M/D/").  Directories that already exist are left
# untouched; new ones get $mode (default 02770) and, when both ids are
# numeric, the owner $pid and group $gid.
# Returns 0 if $date_dir has fewer than three components, 1 otherwise.
sub setup_dirs {
    my ($basedir, $log, $date_dir, $mode, $pid, $gid) = @_;

    $mode ||= 02770;

    my $chown = 0;
    $chown = 1 if (($pid =~ /^\d+$/) && ($gid =~ /^\d+$/));

    my @dirs = ($basedir, "$basedir/$log");

    my (@dates) = split('/', $date_dir);
    return 0 unless (defined $dates[2] && $dates[2] =~ /./);

    push(@dirs, "$basedir/$log/".$dates[0]);
    push(@dirs, "$basedir/$log/".$dates[0].'/'.$dates[1]);
    push(@dirs, "$basedir/$log/".$dates[0].'/'.$dates[1].'/'.$dates[2]);

    foreach my $dir (@dirs) {
        $DEBUG && warn "Test/setup $dir perms $mode\n";
        unless (-d $dir) {
            mkdir($dir, $mode) || next;
            chown($pid, $gid, $dir) if ($chown);
            chmod($mode, $dir);
        }
    }
    return 1;
}

# _log_day($line)
#
# Returns the epoch time of local midnight on the day named by the first
# "[DD/Mon/YYYY:" timestamp in $line, or undef when the line has no such
# timestamp or names a month that is not in %month_convert.
sub _log_day {
    my ($line) = @_;

    return undef unless (defined $line);
    return undef unless ($line =~ m#\[([^/]+)/([^/]+)/([^\:]+)\:#);

    # copy the captures before anything else can overwrite them
    my ($mday, $mon_name, $full_year) = ($1, $2, $3);
    return undef unless (exists $month_convert{$mon_name});

    return mktime(0, 0, 0, $mday, $month_convert{$mon_name},
                  $full_year - 1900);
}

# process_line($data, $type, $analog_fh, $regexp, $double_regexp, $custom)
#
# Handles one log record:
#   - drops it (returns 0) when it has no timestamp, lies outside
#     [$first_date, $stop_time), is a web line that is not a GET/POST/PUT
#     request, or belongs to a domain that is not in %domains;
#   - otherwise appends it to %cache_log for the owning site(s), bumping
#     $curbuf, writes it to $analog_fh for the server-wide report and
#     returns 1.
# $custom is accepted for call compatibility and is not used here.
sub process_line {
    my ($data, $type, $analog_fh, $regexp, $double_regexp, $custom) = @_;

    my $rawdata = $data;
    my $current_domain;

    # filtering lines out based on date
    my $log_time = _log_day($rawdata);
    if (!defined $log_time) {
        # Without this check a failed match would silently reuse the
        # captures of some earlier match.
        $DEBUG && print STDERR "Skipping undated record\n";
        return (0);
    }

    if ($log_time < $first_date) {
        if ($DEBUG) {
            $data =~ /(\[[^\]]+\])/;
            print STDERR "Skipping old record dated: $1\n";
        }
        return (0);
    }

    # check if this line is after the stop time
    if ($log_time >= $stop_time) {
        if ($DEBUG) {
            $rawdata =~ /(\[[^\]]+\])/;
            print STDERR "Skipping new record dated: $1\n";
        }
        return (0);
    }

    # only include certain web requests
    if ($type eq 'web' && $rawdata !~ /(GET|POST|PUT) \//o) {
        if ($DEBUG > 2) {
            $data =~ /\]\s*(.*)$/;
            print STDERR "Skipping non-web log line: $1\n";
        }
        return (0);
    }

    if ($type eq 'web') {
        # strip the leading site name and fold it into the request path
        ($current_domain, $data) = ($data =~ /^(\S+) (.*$)/o);
        $data .= "\n";    # add the newline back
        $data =~ s:(GET|POST|PUT) /:$1 /$current_domain/:;
        # $DEBUG && warn "Constructed Web data: $data\n";
    }

    if (($type eq 'mail') && ($rawdata =~ m#$double_regexp#o)) {
        my ($first_domain, $second_domain) = ($1, $2);

        # if we don't know about the sending or receiving domain, just
        # skip this one
        unless ($domains{$first_domain} || $domains{$second_domain} ||
                ($second_domain eq 'localhost')) {
            $DEBUG && warn
                "Sending and Receiving domains unknown: $first_domain, $second_domain\n";
            return (0);
        }

        ($DEBUG > 1) && warn "Double: $data";
        if ($domains{$first_domain}) {
            $cache_log{$first_domain} .= $data;
            $curbuf++;
        }

        if ($domains{$second_domain}) {
            $cache_log{$second_domain} .= $data;
            $curbuf++;
        }
    }
    elsif ($rawdata =~ m#$regexp#o) {
        my $matched = $1;

        if (!$domains{$matched} && ($type ne 'ftp')) {
            $DEBUG && warn "Skipping unknown domain: $matched\n";
            return (0);
        }

        ($DEBUG > 1) && warn "Single: $data";
        if ($type eq 'ftp') {
            # ftp paths carry the site group, not the fqdn
            $current_domain = $vsite_fqdn{$matched};
        }
        elsif (!defined $current_domain) {
            # mail lines name the site directly in the match
            $current_domain = $matched;
        }

        # Only buffer for a site we can name; an empty key would later be
        # flushed to "/$Logdir/<type>.log" at the filesystem root.
        if (defined $current_domain && $current_domain ne '') {
            $cache_log{$current_domain} .= $data;
            $curbuf++;
        }
        else {
            $DEBUG && warn "No site known for: $matched\n";
        }
    }

    # the server-wide report sees every accepted line
    if ($type eq 'web' || $type eq 'net') {
        print {$analog_fh} $rawdata;
        ($DEBUG > 1) && warn "Writing: $rawdata\n";
    }
    else {
        print {$analog_fh} $data;
        ($DEBUG > 1) && warn "Writing: $data\n";
    }

    return (1);
}

# flush_buffers($type, $daily)
#
# Appends every buffered site log in %cache_log to that site's
# $Logdir/$type.log, or to $Logdir/$type.daily when $daily is true (the
# daily file is removed the first time it is touched in this run).
# Sites written to are recorded in %active_sites.  Every buffer is released
# whether or not it could be written, so memory use stays bounded.
sub flush_buffers {
    my ($type, $daily) = @_;

    foreach my $site (keys %cache_log) {
        my $buffered = delete($cache_log{$site});
        next unless (defined $buffered && $buffered ne '');

        # %domains also holds bare hostnames and domains that have no entry
        # in %vsite_dir; writing those would land in "/$Logdir".
        my $basedir = $vsite_dir{$site};
        unless (defined $basedir && $basedir ne '') {
            $DEBUG && warn "No base directory for $site, dropping buffer\n";
            next;
        }

        my $file      = "$basedir/$Logdir/$type.log";
        my $daily_log = "$basedir/$Logdir/" . "$type.daily";

        if ($daily) {
            unlink($daily_log) unless ($daily_flush{$daily_log});
            $daily_flush{$daily_log} = 1;
            $file = $daily_log;
        }

        my $site_log;
        unless (open($site_log, '>>', $file)) {
            $DEBUG && warn "Could not append to $file: $!\n";
            next;
        }
        print {$site_log} $buffered;
        close($site_log) or warn "Error writing $file: $!\n";

        $DEBUG && warn "pid, gid for $file: ".
            $pids{$site}.', '.
            $gids{$site}."\n";

        chown($pids{$site}, $gids{$site}, $file);
        chmod(0660, $file);
        $active_sites{$site} = 1;
    }
}