Source Code
extract_mail_stats
  Prev   Next
#!/usr/bin/perl
#
# extract_mail_stats
#
#   Extract sendmail stats from syslog file, processing them and saving 
#   result that is later used by the 'report_mail_stats' script.
#
#
# Typical usage:    extract_mail_stats
#
# flags:
#       -d          debug mode
#       -e          print strange lines to stderr
#       -o <file>   output file
#       -l <file>   log file
#       -T          print entries with a "to=<>" or "from=<>" (missing address)
#       -m          reduce to local mbox if possible
#
#
# Cron entry:
#
# 4 0 * * *                       /usr/local/etc/syswatch/bin/extract_mail_stats
#

require( '/usr/local/etc/syswatch/bin/sw_common.pl' );
#
# Pull in the shared settings.  Called as a plain subroutine call: the
# old "do SUB(LIST)" call syntax is not supported by current Perl releases.
#
set_common_vars();


#
# Default only: handle_args() overwrites this with the state of the -e flag.
#
$PrintStrangeLines  = 1;


#
# Some systems keep the mail log file here:
#
$TheRawSysLog       = "/var/log/mail";
$TheRawSysLog       = "/var/log/maillog"    if( ! -e $TheRawSysLog );   # But could be here, too
$TheRawSysLog       = "/var/adm/SYSLOG"     if( ! -e $TheRawSysLog );   # But could be here, too

$TheDataDay         = '-yesterday';                                     # default to yesterday

$TheDataOutputFile  = '';


#
# Where data goes
#
$DATA_FILE_DIR      = "/var/log/mailtraffic";                       # dir containing data files

#
# Some log entries can be: "... from=<>, size=..."
# Use this string as a placeholder.
#
$NO_FROM_USER       = '<blank \'from\'>';
$NO_TO_USER         = '<blank \'to\'>';


#
# Main part of program:
#
#   1. parse the command line
#   2. optionally load the table of local user names (-m)
#   3. extract the day's sendmail lines into a temp file
#   4. tally them, then write the summary file
#
# The subroutines are called with plain call syntax; the old
# "do SUB(LIST)" form is not supported by current Perl releases.
#
    handle_args();
    hash_passwd() if $ReduceToLocalMailbox;

    $data_file = get_raw_data( $TheRawSysLog );
    gather_stats( $data_file );
    unlink $data_file;                              # temp file is no longer needed

    output_stats( $TheDataOutputFile );

    exit( 0 );
#
# -- end --
#


#
# handle_args
#
# Parse the leading "-..." arguments in @ARGV and set our internal flags.
#
#   -o <file>           sets $TheDataOutputFile
#   -l <file>           sets $TheMailLog
#   -yesterday, -today  sets $TheDataDay
#   -e -m -d -T         single-letter flags; may be combined (e.g. "-em")
#
# An unknown flag letter prints the usage text and exits with -1; giving
# the same flag letter twice is fatal.  Once parsing is done the output
# file name and the log file name are resolved through CalcOutputFile()
# and CalcLogFile().
#
sub handle_args
{
    while( @ARGV && $ARGV[ 0 ] =~ /^-/ )
    {
        my $arg = shift( @ARGV );

        #
        # Options that take a value, or that are whole words
        #
        if( $arg eq '-o' )
        {
            $TheDataOutputFile = shift( @ARGV );
            next;
        }
        if( $arg eq '-l' )
        {
            $TheMailLog = shift( @ARGV );
            next;
        }
        if( $arg eq '-yesterday' || $arg eq '-today' )
        {
            $TheDataDay = $arg;
            next;
        }

        #
        # Everything else is a bundle of single-letter flags
        #
        $arg =~ s/^-//;

        foreach my $flag ( split( //, $arg ) )
        {
            #
            # Compare literally: used as a regex, a flag such as "." would
            # be accepted and one such as "(" would abort with a regex error.
            #
            if( index( 'emdT', $flag ) < 0 )
            {
                print STDERR "unknown flag: $flag\n";
                print "$program [flags]
    -d          debug mode
    -e          print strange lines to stderr
    -o <file>   output file (overrides \"" . CalcOutputFile() . "\")
    -l <file>   log file    (overrides \"" . CalcLogFile()    . "\")
    -T          print entries with a \"to=<>\" or \"from=<>\" (missing address)
    -m          reduce to local mbox is possible
";
                exit -1;
            }
            die "$0: '$flag' flag already set\n" if( $flags{ $flag }++ );
        }
    }

    $ReduceToLocalMailbox       = $flags{ 'm' };
    $PrintStrangeLines          = $flags{ 'e' };
    $PrintLinesWithNoFromOrTo   = $flags{ 'T' };
    $Debug                      = $flags{ 'd' };

    print "Debug mode\n" if( $Debug );

    if( ! $TheDataOutputFile )
    {
        $TheDataOutputFile = CalcOutputFile();
        print "Output file is $TheDataOutputFile\n" if( $Debug );
    }

    $TheRawSysLog = CalcLogFile();
    print "SysLog = $TheRawSysLog\n"    if( $Debug );
}


#
# CalcOutputFile()
#
# Return the name of the data file to write.  An explicitly chosen
# $TheDataOutputFile always wins; otherwise the name is built from the
# month letter and day number that $GET_DATE reports for $TheDataDay.
#
sub CalcOutputFile
{
    if( $TheDataOutputFile )
    {
        return $TheDataOutputFile;
    }

    #
    # No explicit file: ask the date helper, keeping only the first
    # line of output from each command.
    #
    my ( $letter ) = `$GET_DATE $TheDataDay -monthLMod 2`;
    my ( $number ) = `$GET_DATE $TheDataDay -day`;

    return mk_data_file_name( $letter, $number );
}


#
# CalcLogFile()
#
# Return the log file to read.  In debug mode an existing "$SW_HOME/ml"
# is recorded in $TheMailLog first; after that $TheMailLog is used when
# it is set, and $TheRawSysLog otherwise.
#
sub CalcLogFile
{
    my $debug_log = "$SW_HOME/ml";

    if( $Debug && -e $debug_log )
    {
        $TheMailLog = $debug_log;
    }

    return $TheMailLog ? $TheMailLog : $TheRawSysLog;
}


#
# get_raw_data()
#
# Separate the raw data into a temp file.
#
# Takes the path of the syslog file, greps the sendmail lines for the day
# selected by $TheDataDay into "$SW_TMP/eml.<pid>" and returns that file
# name.  The caller is responsible for removing the file.
#
sub get_raw_data
{
    my ( $raw_sys_log ) = @_;

    #
    # Info is logged as:
    #
    #   Jun 13 08:32:45 ptolemy sendmail[22383]: IAA22383: from=<opentpt@lis...
    #   Jun 13 08:32:45 ptolemy sendmail[22384]: IAA22383: to=<matt@kamson.com>...
    #
    # So we need to extract only those lines from the general
    # log file
    #
    my ( $search_date ) = `$GET_DATE $TheDataDay -month -space -spacePaddedDay`;
    chomp( $search_date );      # a trailing newline would break the shell command below

    #
    # Grep out the day's lines to a separate file:
    #
    my $days_mail_entries = "$SW_TMP/eml.$$";
    my $status = system( "$GREP \"^$search_date\" $raw_sys_log | $GREP sendmail > $days_mail_entries" );

    #
    # grep exits with 1 when nothing matched, which is normal on a quiet
    # day.  Anything else non-zero is a real failure worth reporting, but
    # carry on: an empty data file still yields a valid (all zero) report.
    #
    if( $status == -1 )
    {
        warn "$0: can't run '$GREP': $!\n";
    }
    elsif( ( $status >> 8 ) > 1 )
    {
        warn "$0: extracting mail entries from $raw_sys_log failed (status " . ( $status >> 8 ) . ")\n";
    }

    return $days_mail_entries;
}


#
# mk_data_file_name()
#
# Construct the name of the data file.
# They are typically of the form: "/var/log/mailtraffic/A_day.24"
#
# Arguments:
#   $month_letter   '' or 'A', 'B'; when empty, $GET_DATE is asked for it
#   $day            '-yesterday', '-today' or 1, 2, 3, etc
#
# Returns "<dir>/<month_letter>_day.<two digit day>", where <dir> is
# $DATA_FILE_DIR, or $SW_TMP in debug mode.  Prints a message and exits
# with -1 when no day number can be derived from $day.
#
sub mk_data_file_name
{
    my ( $month_letter, $day ) = @_;
    my $day_num;

    $month_letter = `$GET_DATE -monthLMod 2`    if( ! $month_letter );

    #
    # Values that came from backticks may carry a trailing newline,
    # which must not end up inside the file name.
    #
    $month_letter =~ s/\s+$//;
    $day          =~ s/\s+$//;

    if( $day eq '-yesterday' || $day eq '-today' )
    {
        $day_num = `$GET_DATE $day -zeroPaddedDay`;
        $day_num =~ s/\s+$//;
    }
    elsif( $day =~ /^\d+$/ && $day > 0 )
    {
        #
        # Only a real day number is zero padded; a stray single
        # character such as "x" is rejected below, not turned into "0x".
        #
        $day_num = sprintf( "%02d", $day );
    }

    if( ! $day_num )
    {
        print "Bad 'day': $day, (month_let=$month_letter)\n";
        exit -1;
    }

    my $dir = $Debug ? $SW_TMP : $DATA_FILE_DIR;
    return "$dir/${month_letter}_day.$day_num";
}



#
# gather_stats
#
# Read the file of extracted sendmail log lines and classify every line.
# "from=" and "to=" lines are handed to parse_from_line() and
# parse_to_line(); the other kinds of line only update the global counters
# $NErrors, $NBlocked, $NBlockedExpired and $NUnblocked.
#
# The tests below are tried in order and the first one that matches wins,
# so their order matters.
#
# Dies if the file cannot be opened.
#
sub gather_stats
{
    my ( $mail_log_fn ) = @_;

    open( my $mail_log, '<', $mail_log_fn ) || die "Can't open $mail_log_fn: $!";

    #
    # The loop reads into the global $_ on purpose: parse_from_line() and
    # parse_to_line() print the current line from $_ in their messages.
    #
    while( <$mail_log> )
    {
        if( /lost input channel from/
         || /, stat=queued/
         || /stat=Service unavailable/ )
        {
            # ignore these
        }

        #
        # A "from" line?
        #
        #   Jun 13 08:32:45 ptolemy sendmail[22383]: IAA22383: from=<opentpt@list.stairways.com>,
        #    size=7978, class=-60, pri=145978, nrcpts=1, msgid=<35959685@cupid.Dartmouth.EDU>,
        #    proto=ESMTP, relay=angstrom.metawire.com [198.147.96.73]
        #
        # Captures: hour, message id, sender, size
        #
        elsif( / (..):\d\d:\d\d .*: ([A-Z]+\d+): from=(.*), size=(\d+)/ )
        {
            parse_from_line( $1, $2, $3, $4 );
        }

        #
        # Some kind of error?
        #
        # Like:
        #   Jun 17 08:41:34 bandit sendmail[15290]: IAA13746: to=<'sharden@ghproductions.com'>,
        #        ctladdr=<lconners@cyberstudios.com> (10758/101), delay=00:00:03, xdelay=00:00:00,
        #        mailer=smtp8, relay=ghproductions.com', stat=Host unknown (Name server:
        #        ghproductions.com': host not found)
        #
        elsif( /stat=Host unknown/
            || /stat= Host unknown/
            || /: .ost not found/
            || /User unknown/
            || /user address required/
            || /could not send message for past \d+ hours/
            || /Connection reset by/
            || /stat=I\/O error: Error 0/
            || /collect: I\/O error on connection from/
            || /collect: premature EOM: Error 0/
            || /collect: unexpected close on connection from/
            || /timeout waiting for input from/
            || /Local configuration error/
            || /Remote protocol error/ )
        {
            print $_    if $PrintStrangeLines;
            $NErrors++;
        }

        #
        # A "to" line?
        #
        #   Jun 13 08:32:45 ptolemy sendmail[22384]: IAA22383: to=<matt@kamson.com>,
        #    delay=00:00:02, xdelay=00:00:00, mailer=local, stat=Sent
        #
        #   Jun 13 06:02:36 ptolemy sendmail[21730]: GAA21728: to=devnull@metawire.com,
        #    ctladdr=mailcollector (500/12), delay=00:00:36, xdelay=00:00:36, mailer=esmtp,
        #    relay=mail.metawire.com. [198.147.96.73], stat=Sent (FAA05583 Message accepted
        #    for delivery)
        #
        #   Jun 17 00:08:46 bandit sendmail[9468]: PAA10510: to=<adamross@mail2.DELTANET.COM>,
        #    delay=1+08:37:48, xdelay=00:00:00, mailer=smtp8, relay=mail2.deltanet.com.,
        #    stat=Deferred: 418 <>... can't resolve your name, check your DNS
        #
        # Captures: message id, everything after "to="
        #
        elsif( /: ([A-Z]+\d+): to=(.*)/ )
        {
            parse_to_line( $1, $2 );
        }

        #
        # Don't count these as errors
        #
        #   || /aliases/ 
        #
        elsif( /message-id/
            || /locked/ 
            || /alias database (auto|)rebuilt/
            || /rebuilding alias database/ 
            || /Authentication-Warning: .*: nuucp set sender to/ )
        {
            # print "  Noted, didn't count line\n";
        }

        #
        # Is a "blocking" line?
        #
        elsif( / blocking: / )
        {
            if( / has expired/ )
            {
                $NBlockedExpired++;
            }
            else
            {
                $NBlocked++;
            }
        }

        #
        # Is a "blocking" debug line?
        #
        elsif( / blocking - / )
        {
            # ignore
        }

        #
        # or an "unblocking" line?
        #
        elsif( / unblocking: / )
        {
            $NUnblocked++;
        }

        else
        {
            #
            # Count as an error
            #
            $NErrors++;

            #
            # These are counted above but never printed:
            #
            # Jun 18 12:30:17 bandit sendmail[12994]: dropenvelope: MAA03138: q_flags = 6019,
            #   paddr = <tony@sportspin.com>
            # Jun 18 14:55:57 bandit sendmail[14228]: dropenvelope: OAA09328: q_flags = 6019, 
            #   paddr = <thor@oz.com>
            #
            next if( /dropenvelope: .*: q_flags = 6019/ );

            #
            # Don't know what kind of line it is - print it out
            #
            print $_    if $PrintStrangeLines;
        }
    }

    close( $mail_log );
}


#
# parse_from_line()
#
# Record the information carried by a "from" line:
#
#   Jun 13 08:32:45 ptolemy sendmail[22383]: IAA22383: from=<opentpt@list.stairways.com>,
#    size=7978, class=-60, pri=145978, nrcpts=1, msgid=<35959685@cupid.Dartmouth.EDU>,
#    proto=ESMTP, relay=angstrom.metawire.com [198.147.96.73]
#
# Arguments:
#   $hour   two character hour taken from the timestamp
#   $id     the sendmail message id (e.g. IAA22383)
#   $user   the sender, as logged (usually wrapped in <>)
#   $size   the message size
#
# Updates %from_user_size, %from_user_count, %id_size, %Hourly and
# $NMsgsFrom.  An empty sender is filed under $NO_FROM_USER and counted
# in $NErrors.
#
sub parse_from_line
{
    my ( $hour, $id, $user, $size ) = @_;

    print "id=$id, user=$user"                          if( $DetailedDebug );
    $user =~ s/.*<(.*)>/$1/;        # get rid of <>
    print "$user\n"                                     if( $DetailedDebug );
    $user =~ tr/A-Z/a-z/;           # canonical lc
    print "$user\n"                                     if( $DetailedDebug );

    if( ! $user )
    {
        $user = $NO_FROM_USER;
        print STDERR "No 'from' user: $_"   if( $PrintLinesWithNoFromOrTo );
        $NErrors++;
    }

    if( $ReduceToLocalMailbox )
    {
        #
        # Use the bare mailbox name, but only when it is a known local user
        #
        my $local_part = strip( $user );
        $user = $local_part if( $known{ $local_part } );
    }

    #
    # First time we've run into this ID ?
    # Test for existence, not truth: a message logged with size=0 must
    # not be counted as new each time its id shows up again.
    #
    $NMsgsFrom++ if( ! exists $id_size{ $id } );

    #
    # Record other info about it
    #
    $from_user_size { $user } += $size;
    $id_size        { $id   }  = $size;
    $from_user_count{ $user }++;
    $Hourly         { $hour }++;
}


#
# parse_to_line()
#
# Parse a "to=" line, recording the info about it.
# It is somewhat complicated because the commas that
# normally separate fields are also used to separate
# several addresses:
#
#   to=<martha.josephson@ezi.com>,<cathy.anterasian@ezi.com>, delay=2+05:02:33, ...
#      |________________________| |________________________|  |______________|
#                addr 1                     addr 2
#               fld 1                                              fld 2
#
# Arguments:
#   $id     the sendmail message id (e.g. NAA10522)
#   $rest   everything that followed "to=" on the log line
#
# Only lines whose status is Sent, Deferred or queued/Queued are recorded.
# Deferred recipients update $NDeferred, %deferred_count and %deferred_msg;
# all others update %to_user_size, %to_user_count and $NMsgsTo.  An empty
# recipient is filed under $NO_TO_USER and counted in $NErrors.
#
sub parse_to_line
{
    my ( $id, $rest ) = @_;
    my ( $statv, $to, $oto );

    if( $rest =~ /, stat=(Sent)/
     || $rest =~ /, stat=(Deferred)/
     || $rest =~ /, stat=(.ueued)/ )
    {
        $statv = $1;
    }
    else
    {
        # NOTE(review): this prints only when NOT in debug mode, which
        # looks inverted - kept as it was; confirm the intent.
        print "Not a 'Sent', 'Deferred' or 'Queued' to= line: $rest\n"  if( ! $Debug );
        return;
    }

    #
    # $id   = NAA10522
    # $rest = <martha.josephson@ezi.com>,<cathy.anterasian@ezi.com>, 
    #           delay=2+05:02:33, mailer=esmtp, relay=nova.linked.net. [220.1.1.1], 
    #           stat=Deferred: No route to host
    #
    # or:
    #
    # $rest = matt@ptolemy.metawire.com, ctladdr=syswatch (1010/201), 
    #           delay=00:01:15, xdelay=00:01:15, mailer=smtp8, 
    #           relay=ptolemy-mail.metawire.com. [205.219.92.165], 
    #           stat=Deferred: No route to host
    #
    print "  parsing rest...\n"     if( $DetailedDebug );
    while( $rest )
    {
        ($to, $rest) = split( /,/, $rest, 2 );
        print "  split: to=$to, rest=$rest\n"       if( $DetailedDebug );

        #
        # Have we split to the point where "addr" now is "xxx=yyy"?
        # If so then this is the next field
        #
        last if( $to =~ /=/ );

        #
        # Remove enclosing "<" ">", if they are there
        #
        if( $to =~ /.*<(.*)>/ )
        {
            $to = $1;
        }

        print "  to=$to\n"      if( $DetailedDebug );

        if( $ReduceToLocalMailbox )
        {
            #
            # Use the bare mailbox name, but only when it is a known local user
            #
            $oto = $to;
            print "     $to\n"              if( $Debug );
            $to = strip( $to );
            print "      $to\n"             if( $Debug );
            $to = $oto if ! $known{ $to };
            print "       $to\n"            if( $Debug );
            print "       $known{ $to }\n"  if( $Debug );
        }
        $to =~ tr/A-Z/a-z/;

        if( ! $to )
        {
            $to = $NO_TO_USER;
            print STDERR "No 'to' user: $_" if( $PrintLinesWithNoFromOrTo );
            $NErrors++;
        }

        if( $statv eq "Deferred" )
        {
            #
            # $NDeferred counts every deferral.  The per-user count goes
            # up once per (message, recipient) pair: keying on the message
            # id alone would leave the second and later recipients of a
            # deferred message without any count at all.
            #
            $NDeferred++;
            $deferred_count{ $to }++    if( ! $deferred_msg{ $id, $to } );
            $deferred_msg  { $id, $to }++;
            print "Deferred: to=$to, #=$deferred_count{ $to }\n"    if( $DetailedDebug );
        }

        #
        # $statv eq "Sent"
        # $statv eq "queued" || $statv eq "Queued" )
        else
        {
            # printf "  adding %d bytes to %s from %s\n", $id_size{ $id }, $to, $user if( $Debug );
            print "Sent: to=$to\n"      if( $DetailedDebug );

            $to_user_size { $to } += $id_size{ $id };
            $to_user_count{ $to }++;
            $NMsgsTo++;
        }
    }
    print "  done parsing_to_line\n"    if( $DetailedDebug );
}


#
# output_stats
#
# Write the gathered statistics to the given file, tab separated:
#
#   - one "name<TAB>value" line per overall counter
#   - one "Hourly" line holding the 24 per-hour counts
#   - two "#" comment lines heading the per-user table
#   - one line per user: name, msgs to, msgs from, bytes to, bytes from,
#     deferred
#
# Users are listed by descending total of their three counts; users with
# the same total are listed by name so the output order is repeatable.
# Missing values are written as 0.
#
# Dies if the file cannot be opened or cannot be completely written.
#
sub output_stats
{
    my ( $output_file ) = @_;

    open( my $of, '>', $output_file ) || die "Can't open output $output_file: $!";

    #
    # Overall counters
    #
    my @totals = (
        [ 'MsgsFrom',       $NMsgsFrom       ],
        [ 'MsgsTo',         $NMsgsTo         ],
        [ 'Error lines',    $NErrors         ],
        [ 'Deferred',       $NDeferred       ],
        [ 'Blocked',        $NBlocked        ],
        [ 'BlockedExpired', $NBlockedExpired ],
        [ 'Unblocked',      $NUnblocked      ],
    );
    foreach my $total ( @totals )
    {
        print $of $total->[ 0 ] . "\t" . ( $total->[ 1 ] || 0 ) . "\n";
    }

    #
    # Per-hour counts; %Hourly is keyed by the two digit hour ("00".."23")
    #
    my @per_hour = map { $Hourly{ sprintf( "%02d", $_ ) } || 0 } ( 0 .. 23 );
    print $of "Hourly\t" . join( "\t", @per_hour ) . "\n";

    print $of "#\n";
    print $of "# Who\tMsgs To\tMsgs From\tBytes To\tBytes From\tDeferred\n";

    #
    # Merge all together: every user seen in any of the three tables,
    # with the sum of the three counts as the sort key
    #
    my %merged_count;
    foreach my $counts ( \%to_user_count, \%from_user_count, \%deferred_count )
    {
        foreach my $user ( keys %$counts )
        {
            $merged_count{ $user } += $counts->{ $user };
        }
    }

    #
    # And print each to the file
    #
    my @by_activity = sort { $merged_count{ $b } <=> $merged_count{ $a } || $a cmp $b }
                           keys %merged_count;
    foreach my $user ( @by_activity )
    {
        print $of join( "\t", $user,
                              $to_user_count  { $user } || 0,
                              $from_user_count{ $user } || 0,
                              $to_user_size   { $user } || 0,
                              $from_user_size { $user } || 0,
                              $deferred_count { $user } || 0 ) . "\n";
    }

    # Buffered write errors only surface here
    close( $of ) || die "Can't write output $output_file: $!";
}


#
# Sort comparator: orders users by descending %merged_count.
# Reads the sort variables $a and $b.
#
sub bothsort
{
    my $count_a = $merged_count{ $a };
    my $count_b = $merged_count{ $b };

    return $count_b - $count_a;
}


#
# Return the argument, or "0" when it is empty, zero or undefined,
# so that a report column always carries a number.
#
sub numof
{
    my ( $value ) = @_;

    return $value ? $value : "0";
}


#
# Reduce a mail address to a bare, lower-case mailbox name: drops
# everything from the "@" on, everything up to the last "!", and a
# trailing "(...)" comment.
#
sub strip
{
    my ( $address ) = @_;

    for( $address )
    {
        s/@.*//;
        s/.*!//;
        s/\s*\(.*\)//;
        tr/A-Z/a-z/;
    }

    return $address;
} 


#
# hash_passwd()
#
# Fill %known with the local user names, mapping each name to its uid.
# The entries are read from "ypcat passwd" when /bin/domainname reports a
# domain, and from /etc/passwd otherwise.  A uid of 0 is stored as the
# string 'zero' so that every known user tests true in %known.
#
# Dies if the password source cannot be opened.
#
sub hash_passwd
{
    my $yp = '';
    if( -x '/bin/domainname' )
    {
        $yp = `/bin/domainname`;
        chomp( $yp );

        # Some systems print "(none)" when no domain is set
        $yp = '' if( $yp eq '(none)' );
    }

    my ( $mode, $source ) = $yp ? ( '-|', 'ypcat passwd' ) : ( '<', '/etc/passwd' );
    open( my $passwd_fh, $mode, $source ) || die "$program: can't open $source: $!\n";

    while( my $entry = <$passwd_fh> )
    {
        #
        # name:password:uid:...
        # Skip lines that don't parse, so that captures left over from an
        # earlier line are never reused.  Names may contain "." and "-",
        # and the password field may be empty.
        #
        next unless( $entry =~ /^([\w.-]+):[^:]*:(\d+):/ );

        my ( $who, $uid ) = ( $1, $2 );
        $known{ $who } = ( $uid == 0 ) ? 'zero' : $uid;
    }

    close( $passwd_fh );
}