#!/usr/bin/perl -w
#
# spamtune.pl
#
# Make an OOo spreadsheet for tweaking spamcontrol rules.
#
# Copyright (C) 2004 Raj Mathur
#
# Available under the terms of the GNU General Public License version
# 2.
#
# $Id$
#

use strict ;

use Mail::Box::File ;
use Mail::Box::Maildir ;
use Mail::Message ;
use Mail::Address ;
use Pod::Usage ;
use Getopt::Long qw(:config auto_help) ;
use ooolib ;
use Text::CSV ;

=head1 NAME

spamtune - Make an OpenOffice.org spreadsheet from spam and ham mail

=head1 SYNOPSIS

  spamtune [--ham ham-folder] [--spam spam-folder] [--[no]maildir]
           [--sa-config-dir dir] [--sa-local-cf file] output-file

=head1 OPTIONS

spamtune accepts the following command-line options:

=over 4

=item B<--spam> I<folder>

Treat I<folder> as consisting of spam mail.

=item B<--ham> I<folder>

Treat I<folder> as consisting of ham (non-spam) mail.

=back

The B<--spam> and B<--ham> options may be repeated multiple times to
specify multiple ham and spam folders.

=over 4

=item B<--[no]maildir>

[Don't] Treat folders as Maildir's.  The default is to treat folder
names as mbox files.

=item B<--sa-config-dir> I<dir>

Look for SpamAssassin configuration files in I<dir>.  The default is
to look in C</usr/share/spamassassin>.

=item B<--sa-local-cf> I<file>

Get local information from I<file>.  The default is to look in
C</etc/mail/spamassassin/local.cf>.

=item I<output-file>

Name of the output spreadsheet file.  The C<.sxc> extension is added
automatically.

=back

=head1 DESCRIPTION

spamtune makes an OpenOffice.org spreadsheet from given ham and spam
mail folders.  Folders may be in Unix mbox or Maildir (untested)
format.  spamtune permits you to tweak the scoring of messages and
check the effect on SpamAssassin's scoring for each message.

To use spamtune, collect all known spam mail into one (or more)
folder(s) and all known ham (non-spam) mail into one (or more)
folder(s).  Now pass these folder names to spamtune with the
B<--spam> and B<--ham> options respectively.  If the folders are in
Maildir format, use the B<--maildir> option.  By default spamtune
expects folders to be in Unix mbox format.

spamtune will generate an OpenOffice.org spreadsheet that permits you
to change the scoring for various SpamAssassin rules, as well as the
threshold score for tagging spam.  Changing the value of any score
(or the threshold) immediately shows you the percentages of false
positives and false negatives that would be detected by SpamAssassin
with those settings.  It also shows you precisely which messages
would get tagged as spam and ham with that combination of scores.

=head1 PREREQUISITES

C<Mail::Box>
C<Mail::Address>
C<ooolib>
C<Text::CSV>

=head1 SEE ALSO

C<spamassassin(1)>
C<Mail::SpamAssassin::Conf>

=head1 BUGS

=over 4

=item Mail scores

Sometimes the computed mail spam score doesn't match SpamAssassin's
mail score.  The difference is usually minimal.

=item Which score?

spamtune uses the 2nd score (Bayes disabled, network tests enabled) in
the list of 4 SpamAssassin scores for computation.  Need to add a
command-line option for specifying which of the scores to use.

=back

=head1 AUTHOR

Raj Mathur E<lt>raju (at) kandalaya (dot) orgE<gt>

=head1 COPYRIGHT

spamtune is available under the terms of the GNU General Public
License version 2.

=head1 AVAILABILITY

The latest version of this package is likely to be available from:

http://kandalaya.org/

=cut

################################################################
#
# Main program
#
################################################################

#
# Test name => score table, filled by process_file() and read by
# make_scores().  The pseudo-test REQUIRED_HITS holds the spam
# threshold.  Declared before the main code so that it is visibly in
# scope for the functions at the bottom of the file.
my %scores ;

#
# Get command-line parameters
my @spamfolder = () ;
my @hamfolder = () ;
my $maildir = 0 ;
#
# Default directory for SpamAssassin config files.
my $sa_config_dir = '/usr/share/spamassassin' ;
#
# local.cf can be found here.
my $sa_local_cf = '/etc/mail/spamassassin/local.cf' ;

#
# Command-line options override configuration.  An unknown or
# malformed option prints the usage message instead of being ignored.
Getopt::Long::GetOptions
    (
     'spam=s'		=> \@spamfolder ,
     'ham=s'		=> \@hamfolder ,
     'maildir!'		=> \$maildir ,
     'sa-config-dir=s'	=> \$sa_config_dir ,
     'sa-local-cf=s'	=> \$sa_local_cf ,
    ) or pod2usage ( 2 ) ;

#
# Open and process.
#
pod2usage ( 1 ) if @ARGV != 1 ;
my $output_file = $ARGV[0] ;

##
## Print CSV header
##
#my
#  $csvheader = makecsvline
#  (
#   "Message ID" ,
#   "Sender" ,
#   "Sender E-mail" ,
#   "Size" ,
#   "Subject" ,
#   "Time" ,
#   "Recipients" ,
#   "Spam score" ,
#   "Spam status" ,
#   "Spam?" ,
#   "Computed Spam?" ,
#   "Tests" ,
#  ) ;
#print "$csvheader\n" ;

#
# OOo calc header
my $sxc = ooolib -> new ( "sxc" ) ;
$sxc -> oooSet ( 'builddir' , '.' ) ;
$sxc -> oooSet ( 'title' , 'Spam tuner' ) ;
$sxc -> oooSet ( 'author' , 'Raj Mathur' ) ;

my $sxcstartrow = 4 ;		# 2 rows for counts, one blank.
my $sxcrow = $sxcstartrow ;
my $nspam = 0 ;
my $nham = 0 ;

#
# Make the required hits threshold and scores table.  Make
# appropriate range for using later in formulae.
my ( $rhrow , $rhcol , $strow , $nrows , $testcol , $scorecol ) =
    make_scores ( $sa_config_dir , $sa_local_cf , $sxc , 1 , 1 ) ;
my $testcolalpha = makecol ( $testcol ) ;
my $scorecolalpha = makecol ( $scorecol ) ;
my $endrow = $strow + $nrows - 1 ;
my $testrange = "$testcolalpha$strow:$testcolalpha$endrow" ;
my $scorerange = "$scorecolalpha$strow:$scorecolalpha$endrow" ;
my $range = "$testrange;$scorerange" ;

#
# Column layout of the message table.  It starts one blank column to
# the right of the scores table.
my $sxcstartcol = $scorecol + 2 ;
my $ourscorecol = $sxcstartcol + 8 ;	# "Our Score" formula
my $sxcflagcol = $sxcstartcol + 11 ;	# "False +ve/-ve" flag
my $firsttestcol = $sxcstartcol + 12 ;	# first "Scoring tests" cell
my $threshholdcell = makecr ( $rhcol , $rhrow ) ;

#
# Make header for our data.  We start from row $sxcstartrow, column
# $sxcstartcol.
#
# NOTE(review): bold is switched on here and never switched off
# again, exactly as before; whether ooolib then also renders the data
# rows in bold has not been verified.
my @headers =
    (
     'Message ID' ,
     'Sender' ,
     'Sender E-mail' ,
     'Size' ,
     'Subject' ,
     'Time' ,
     'Recipients' ,
     'SA Score' ,
     'Our Score' ,
     'SA computed Spam' ,
     'Our computed Spam' ,
     'False +ve/-ve' ,
     'Scoring tests' ,
    ) ;
$sxc -> oooSet ( 'bold' , 'on' ) ;
{
    my $col = $sxcstartcol ;
    foreach my $header ( @headers )
    {
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $header ) ;
    }
}
$sxcrow++ ;

#
# All spam folders are processed before all ham folders: the totals
# formulae below rely on the spam rows coming first.
my @folders =
    (
     ( map { [ $_ , 1 ] } @spamfolder ) ,
     ( map { [ $_ , 0 ] } @hamfolder ) ,
    ) ;

foreach my $entry ( @folders )
{
    my ( $foldername , $isspam ) = @$entry ;
    my $spammark = $isspam ? "*****" : "" ;

    print STDERR "*** Processing " .
	( $isspam ? "SPAM" : "HAM" ) . " folder $foldername...\n" ;

    my @folderoptions =
	(
	 keep_dups => 1 ,
	 extract => 'LAZY' ,
	 trace => 0
	) ;
    my $folder ;
    if ( $maildir )
    {
	$folder = Mail::Box::Maildir -> new
	    ( folder => $foldername , @folderoptions ) ;
    }
    else
    {
	$folder = Mail::Box::File -> new
	    ( folder => $foldername , @folderoptions ) ;
    }
    if ( !$folder )
    {
	print STDERR "Unable to open mbox/maildir $foldername: $!\n" ;
	next ;
    }

    foreach my $message ( @$folder )
    {
	#
	# Fields we need to extract from the message headers.
	my $messageSize = $message -> size () ;
	my $messageId = $message -> messageId () ;
	my $timestamp = $message -> timestamp () ;
	my $isotimestamp = isodate ( $timestamp ) ;
	my $subject = $message -> subject () ;
	my $sender = $message -> sender () ;
	next if !defined ( $sender ) ;
	my $senderString = $sender -> format () ;
	my $senderId = $sender -> address () ;
	my $to = join ( ", " , map { $_ -> format () } $message -> to () ) ;

	my $saLine = $message -> get ( 'X-Spam-Status' ) ;
	#
	# Sample line looks like this:
	#
	# X-Spam-Status: No, hits=-4.9 required=9.5 tests=BAYES_00 autolearn=ham
	#	version=2.64
	#
	# Mail::Message folds it by removing the newlines, retains the tabs.
	# Hence we can get it into grokable shape by turning every run of
	# whitespace into a single space.  (Deleting the tabs outright
	# could glue "autolearn=ham" and "version=2.64" together.)
	#
	next if !defined ( $saLine ) ;
	$saLine =~ s/\s+/ /g ;

	#
	# SpamAssassin 2.x writes "hits=", 3.x writes "score=".  Skip
	# anything we cannot parse, and do so *before* counting the
	# message so that the row counts used in the totals formulae
	# stay in step with the rows actually written.
	my ( $tempSaStatus , $saScore , $tests ) =
	    $saLine =~ /^\s*(\w+),\s*(?:hits|score)=(-?[0-9.]+)\s+required=\S+\s+tests=(.*)$/ ;
	if ( !defined ( $tests ) )
	{
	    print STDERR "Unable to parse X-Spam-Status of $messageId: $saLine\n" ;
	    next ;
	}
	#
	# Drop everything after the test list (autolearn=..., version=...).
	$tests =~ s/\s+\w+=.*$// ;
	$tests =~ s/^\s+// ;
	$tests =~ s/\s+$// ;

	if ( $isspam )
	{
	    $nspam++ ;
	}
	else
	{
	    $nham++ ;
	}

	#
	# SpamAssassin's own verdict; only used by the (disabled) CSV
	# output below.
	my $saStatus = ( $tempSaStatus eq 'Yes' ) ? 1 : 0 ;
	my @saMatches = sort ( grep { length } split ( /,\s*/ , $tests ) ) ;

#	my
#	  $csvline = makecsvline
#	  (
#	   $messageId ,
#	   $senderString ,
#	   $senderId ,
#	   $messageSize ,
#	   $subject ,
#	   isodate ( $timestamp ) ,
#	   $to ,
#	   $saScore ,
#	   $saStatus ,
#	   $spammark ,
#	   "" ,
#	   @saMatches ,
#	  ) ;
#	print "$csvline\n" ;

	#
	# Make OOo sxc line
	my $col = $sxcstartcol ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $messageId ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $senderString ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $senderId ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-float' , $messageSize ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $subject ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $isotimestamp ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $to ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-float' , $saScore ) ;

	#
	# "Our Score": sum of the current table score of every test
	# that hit, looked up by the test names written further right
	# on this row.  A message with no tests scores 0.
	my $formula = "=0" ;
	if ( @saMatches )
	{
	    $formula = "=" . join
		( "+" ,
		  map { "LOOKUP(" . makecr ( $_ , $sxcrow ) . ";$range)" }
		  ( $firsttestcol .. $firsttestcol + $#saMatches ) ) ;
	}
	my $ourscorecell = makecr ( $ourscorecol , $sxcrow ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-formula' , $formula ) ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $spammark ) ;

	#
	# "Our computed Spam": non-zero marker when our score exceeds
	# the threshold.
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-formula' ,
		   "=IF($ourscorecell>$threshholdcell;88888888;0)" ) ;

	#
	# Flag is 1 when our verdict is wrong: spam not exceeding the
	# threshold (false -ve) or ham exceeding it (false +ve).
	my $flag = $isspam
	    ? "=IF($ourscorecell>$threshholdcell;0;1)"
	    : "=IF($ourscorecell>$threshholdcell;1;0)" ;
	put_cell ( $sxc , $col++ , $sxcrow , 'cell-formula' , $flag ) ;

	foreach my $match ( @saMatches )
	{
	    put_cell ( $sxc , $col++ , $sxcrow , 'cell-text' , $match ) ;
	}
	$sxcrow++ ;
    }
}

#
# Put total counts on top
put_cell ( $sxc , $sxcstartcol , 1 , 'cell-text' , "# Spam Records" ) ;
put_cell ( $sxc , $sxcstartcol , 2 , 'cell-text' , "# Ham Records" ) ;
put_cell ( $sxc , $sxcstartcol + 1 , 1 , 'cell-float' , $nspam ) ;
put_cell ( $sxc , $sxcstartcol + 1 , 2 , 'cell-float' , $nham ) ;

#
# Totals of false marks.  Spam rows occupy the first $nspam data rows,
# ham rows the $nham rows after them.
put_cell ( $sxc , $sxcstartcol + 2 , 1 , 'cell-text' , "# False -ves" ) ;
put_cell ( $sxc , $sxcstartcol + 2 , 2 , 'cell-text' , "# False +ves" ) ;

my $nspamformula = "=0" ;
$nspamformula = "=SUM(" .
    makecr ( $sxcflagcol , $sxcstartrow + 1 ) .
    ":" .
    makecr ( $sxcflagcol , $sxcstartrow + $nspam ) .
    ")"
    if ( $nspam ) ;
put_cell ( $sxc , $sxcstartcol + 3 , 1 , 'cell-formula' , $nspamformula ) ;

my $nhamformula = "=0" ;
$nhamformula = "=SUM(" .
    makecr ( $sxcflagcol , $sxcstartrow + $nspam + 1 ) .
    ":" .
    makecr ( $sxcflagcol , $sxcstartrow + $nspam + $nham ) .
    ")"
    if ( $nham ) ;
put_cell ( $sxc , $sxcstartcol + 3 , 2 , 'cell-formula' , $nhamformula ) ;

#
# Percentages of false marks.
put_cell ( $sxc , $sxcstartcol + 4 , 1 , 'cell-text' , '% False -ves' ) ;
put_cell ( $sxc , $sxcstartcol + 4 , 2 , 'cell-text' , '% False +ves' ) ;

my $pcspamformula = "=0" ;
$pcspamformula = "=" . makecr ( $sxcstartcol + 3 , 1 ) . "/$nspam*100"
    if ( $nspam ) ;
put_cell ( $sxc , $sxcstartcol + 5 , 1 , 'cell-formula' , $pcspamformula ) ;

my $pchamformula = "=0" ;
$pchamformula = "=" . makecr ( $sxcstartcol + 3 , 2 ) . "/$nham*100"
    if ( $nham ) ;
put_cell ( $sxc , $sxcstartcol + 5 , 2 , 'cell-formula' , $pchamformula ) ;

#
# Ready, generate the spreadsheet
$sxc -> oooGenerate ( $output_file ) ;

exit ( 0 ) ;

################################################################
#
# Functions
#
################################################################

#
# put_cell
#
# Write one value into one spreadsheet cell.
#
# Parameters:
#  sheet	OOo spreadsheet object
#  col, row	cell location (both counted from 1)
#  type		ooolib data type: 'cell-text', 'cell-float' or 'cell-formula'
#  value	value to store
#
sub put_cell
{
    my ( $sheet , $col , $row , $type , $value ) = @_ ;

    $sheet -> oooSet ( 'cell-loc' , $col , $row ) ;
    $sheet -> oooData ( $type , $value ) ;
    return ;
}

#
# process_file
#
# Open the given file and insert scores into %scores.  A file that
# cannot be opened is reported on STDERR and skipped.
#
# Parameters:
#  f	path to file to open
#
sub process_file
{
    my $f = shift ;

    my $cffile ;
    unless ( open ( $cffile , '<' , $f ) )
    {
	print STDERR "Unable to open $f: $!.  Skipping...\n" ;
	return ;
    }
    while ( my $line = <$cffile> )
    {
	#
	# Comments would otherwise be mistaken for score values.
	$line =~ s/#.*// ;
	#
	# Special treatment for getting threshold level.  SpamAssassin
	# 3.x calls this setting required_score.
	if ( $line =~ /^\s*required_(?:hits|score)\s+([0-9.-]+)/ )
	{
	    $scores{REQUIRED_HITS} = $1 ;
	    next ;
	}
	next unless $line =~ /^\s*score\s/ ;
	#
	# Of the (up to) four scores on the line use the 2nd one, or
	# the only one if just one is given.
	my ( undef , $test , $sc1 , $sc2 ) = split ( ' ' , $line ) ;
	next unless defined ( $sc1 ) ;
	$scores{$test} = defined ( $sc2 ) ? $sc2 : $sc1 ;
    }
    close ( $cffile ) ;
    return ;
}

#
# make_scores
#
# Scan and parse all the files in the SpamAssassin config directories
# and generate a scores table and get the number of hits required to
# mark spam.
#
# Parameters:
#  config_dir	directory for config files
#  local_cf	local configuration file
#  sxc		OOo spreadsheet object
#  start_row, start_col	row and column to start from
#
# Returns:
#  (rhrow,rhcol)	Location of required_hits cell
#  strow	start row of scores table
#  nrows	number of rows in scores table
#  testcol	column containing test name
#  scorecol	column containing test score
#
sub make_scores
{
    my $config_dir = shift ;
    my $local_cf = shift ;
    my $sxc = shift ;
    my $start_row = shift ;
    my $start_col = shift ;

    #
    # Read the *.cf files in sorted order so that, when a score is set
    # in more than one file, the same file always wins.  local.cf is
    # read last and overrides everything.
    my $cfdir ;
    opendir ( $cfdir , $config_dir ) or
	die "Unable to open $config_dir: $!\n" ;
    my @cf_files = sort ( grep { /\.cf$/ } readdir ( $cfdir ) ) ;
    closedir ( $cfdir ) ;
    foreach my $cf_file ( @cf_files )
    {
	process_file ( "$config_dir/$cf_file" ) ;
    }
    process_file ( $local_cf ) ;

    #
    # Make the headers, threshold line and test/score lines
    my $threshhold = 0 ;
    $threshhold = $scores{REQUIRED_HITS} if exists ( $scores{REQUIRED_HITS} ) ;

    my $row = $start_row ;
    my $col = $start_col ;

    put_cell ( $sxc , $col , $row , 'cell-text' ,
	       "Required hits (Threshhold):" ) ;
    put_cell ( $sxc , $col + 1 , $row , 'cell-float' , $threshhold ) ;
    my $rhrow = $row ;
    my $rhcol = $col + 1 ;
    $row += 2 ;			# Keep one blank line

    put_cell ( $sxc , $col , $row , 'cell-text' , "Test name" ) ;
    put_cell ( $sxc , $col + 1 , $row , 'cell-text' , "Test score" ) ;
    $row++ ;

    my $strow = $row ;
    my $testcol = $col ;
    my $scorecol = $col + 1 ;

    #
    # The test names must be written in sorted order: they are
    # searched with LOOKUP() from the message rows.
    foreach my $test ( sort ( keys ( %scores ) ) )
    {
	put_cell ( $sxc , $col , $row , 'cell-text' , $test ) ;
	put_cell ( $sxc , $col + 1 , $row , 'cell-float' , $scores{$test} ) ;
	$row++ ;
    }
    my $nrows = $row - $strow ;

    return ( $rhrow , $rhcol , $strow , $nrows , $testcol , $scorecol ) ;
}

#
# helpanddie
#
# Print help message and die... what did you expect?
#
# Parameters:
#  exitval	value to return to calling shell
#
sub helpanddie
{
    my $exitval = shift ;

    print "Usage: $0 [flags] folder...\n" ;
    exit ( $exitval ) ;
}

#
# makecsvline
#
# Make a CSV line out of the arguments
#
# Parameters:
#  Whatever you want to put into the line.
#
sub makecsvline
{
    my $csv = Text::CSV -> new () ;

    $csv -> combine ( @_ ) ;
    return ( $csv -> string () ) ;
}

#
# isodate
#
# Make ISO-style (YYYY-MM-DD hh:mm:ss) date from given Unix timestamp.
# The time is rendered in the local time zone.
#
# Parameters:
#  ts	timestamp
#
# Returns:
#  formatted ISO date string
#
sub isodate
{
    my $ts = shift ;

    my ( $ss , $mm , $hh , $DD , $MM , $YY ) = localtime ( $ts ) ;
    return sprintf
	( "%04d-%02d-%02d %02d:%02d:%02d" ,
	  $YY + 1900 , $MM + 1 , $DD , $hh , $mm , $ss ) ;
}

#
# makecol
#
# Make corresponding column letter(s) from column number: 1 => A,
# 26 => Z, 27 => AA, 702 => ZZ, 703 => AAA, ...
#
# Parameters:
#  n	column number, counted from 1
#
# Returns:
#  column letter(s); the empty string if n is less than 1
#
sub makecol
{
    my $n = shift ;

    my $letters = "" ;
    while ( $n > 0 )
    {
	#
	# Column names are a base-26 system without a zero digit,
	# hence the -1 before taking each digit.
	my $digit = ( $n - 1 ) % 26 ;
	$letters = chr ( $digit + ord ( 'A' ) ) . $letters ;
	$n = int ( ( $n - 1 ) / 26 ) ;
    }
    return ( $letters ) ;
}

#
# makecr
#
# Make A1-type reference given column and row
#
# Parameters:
#  c	column
#  r	row
#
# Returns:
#  cell	cell name suitable for use in formula
#
sub makecr
{
    my $c = shift ;
    my $r = shift ;

    return ( makecol ( $c ) . $r ) ;
}