diff --git a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm index eb2d7e1..b28a1fa 100644 --- a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm +++ b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm @@ -1,9 +1,19 @@ -#!/usr/bin/perl - -package Mail::SpamAssassin::KeywordRuleGenerator; - -use strict; -use warnings; +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =head1 NAME @@ -11,11 +21,7 @@ Mail::SpamAssassin::KeywordRuleGenerator - Generate SA rules for keywords =cut -our $VERSION = '0.01'; - -=head1 VERSION - -Version 0.01 +package Mail::SpamAssassin::KeywordRuleGenerator; =head1 SYNOPSIS @@ -23,6 +29,31 @@ Generate SpamAssassin compatible configuration files given lists of keywords. Implemented as a module largely for testing purposes. use Mail::SpamAssassin::KeywordRuleGenerator; + my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new($id); + $kw->readFile('keywords.cf'); + $kw->writeFiles(); + +=head1 DESCRIPTION + +Mail::SpamAssassin::KeywordRuleGenerator does what it says on the tin; it +generates SpamAssassin compatible configuration files to catch keywords that you +specify. Most simply, it can take in one or more properly formatted input files +(see FILES), generate rules for individual files (optional), as well as meta +rules for the counts for each set of keywords (ie. 1 of N hit; 2 of N hit...). +See RULES for more on how the rules are generated. + +The sets of keywords can be broken up into groups (see GROUPS). + +=head1 PREREQUISITES + +Requires C executable and the following Perl modules + + To::Be::Determined + +=cut + +#use strict; +use warnings; =head2 FILES @@ -77,6 +108,14 @@ describe ID_FILE_GROUP_1 1 match in keyword group 'GROUP' meta ID_FILE_GROUP_2 ( __ID_FILE_WORD + __ID_FILE_OTHER ) > 1 describe ID_FILE_GROUP_2 2 matches in keyword group 'GROUP' +=head1 VERSION + +Version 0.01 + +=cut + +our $VERSION = '0.01'; + =head1 EXPORT =cut @@ -89,86 +128,284 @@ our @EXPORT = qw( new ); our %EXPORT_TAGS = ( 'all' => [ 'new', - '' + 'getId', + 'setId', + 'getPriority', + 'setPriority', + 'getOutfile', + 'setOutfile', + 'getScorefile', + 'setScorefile', + 'clearFiles', + 'getFiles', + 'nextFile', + 'readFile', + 'getFile', + 'setFile', + 'joinRules', + 'processMetas', + 'processWords', + 'processGroups', + 'processAll', + 'writeFile', + 'writeAll' ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } ); -=head1 SUBROUTINES/METHODS +=head1 METHODS -=head2 C<$kw = new($id)> +Except for getter methods, all methods make changes directly within the context +object and only return errors. Thus, each call can be verified by checking that +it returned undef: + + die "Failed to run 'method'\n" if($kw->method($arg)); + +or: + + my $err = $kw->method($arg); + die "Received $err\n" if ($err); + +Getter methods will return the attribute or undef if it does not exist. + +=head2 C<$kw = new( %args )> Creates a new C object and returns it. -C<$id> is an optional identifier which will be used to prefix any rule output. -It can be updated at any time via the C<$kw->{id}> value or with the -C<$kw->id($id)> setter; +C<$args> is an optional hash to assign any preliminary attributes or flags to +the object. See various 'get' and 'set' functions. =cut sub new { - my ($class, $id) = @_; + my ($class, $args) = @_; + my $self = $args; - $id = uc($id) || 'KW'; - bless { - id => $id || '', - keywords => {} - } => $class; -} +=head3 Initial attributes -=head2 C<$kw->id()> +While other attributes will exist within the object during processing, those +which make sense to define up-front and which will not be overwritten by any of +the built-in functions (other than their associated getters) are: -Get/Setter for C<$kw->{'id'}> +=head3 id + + id scalar + A short identifier which will be appending to all output files to allow + for easier recognition of the files' sources. Default: 'KW' =cut -sub id + $self->{'id'} //= 'KW'; + +=head3 priority + + priority scalar + Typically a number used to define the priority of the rules. + SpamAssassin will read configuration files in alphabetical order and the + last iteration of a configuration for the same rule will be used. This + means that the files read last will overwrite those read earlier. By + convention the number is used for easy sorting. Any leading aphabetical + character will be ordered after all numbers, and so given high priority. + Default: '50' + +=cut + + $self->{'priority'} //= 50; + +=head3 debug + + debug boolean + Enable (1) or disable (0) debugging output. Default: 0 + +=cut + + $self->{'debug'} //= 0; + +=head3 singleOutfile + + singleOutfile boolean + Indicates whether output rules should all be added to a single file (1), + or to one file per input file (0). Note that this still requires + 'joinScores' to have one file total. Default: 0 + +=cut + + $self->{'singleOutfile'} //= 0; + +=head3 joinScores + + joinScores bootlean + Indicates whether to include the scores in the same file as their + associated rule definitions (1) or in a second file on their own (0). + The second file will simply append '_SCORES' to the file name (prior to + the '.cf'), unless overridden by C<$kw->setScoreFile($path)> or a second + path argument in C<$kw->setOutfile($rulePath, $scorePath)>. Default: 1 + +=cut + + $self->{'joinScores'} //= 1; + + bless $self, $class; + return $self; +} + +=head2 C<$kw->getId()> + +Getter for C<$kw->{'id'}>. 'id' is used for top-level rule names. + +=cut + +sub getId +{ + my $self = shift; + + if (defined($self->{'id'})) { + return $self->{'id'}; + } +} + +=head2 C<$kw->setId()> + +Setter for C<$kw->{'id'}> + +=cut + +sub setId { my $self = shift; my $id = shift; if (defined($id)) { $id = uc($id); - $self->{'id'} = $id; + $self->{'id'} = $id || return "Failed to set $id"; } else { - if (defined($self->{'id'})) { - return $self->{'id'}; - } else { - return undef; - } + return "No ID provided\n"; } } -=head2 C<$kw->file()> +=head2 C<$kw->getPriority()>. -Get/Setter for C<$kw->{'file'}> +Getter for C<$kw->{'priority'}>. 'priority' value used for the output filepath +to indicate the load order. SpamAssassin reads in ascending order with later +iterations overriding earlier. The 'score' file will increment so that it comes +at the end. =cut -sub file +sub getPriority +{ + my $self = shift; + + if (defined($self->{'priority'})) { + return $self->{'priority'}; + } +} + +=head2 C<$kw->setPriority()> + +Setter for C<$kw->{'priority'}>. + +=cut + +sub setPriority +{ + my $self = shift; + my $priority = shift; + + if (defined($priority)) { + $path = uc($priority); + $self->{'priority'} = $priority || return "Failed to set priority: $priority"; + } else { + return "No 'priority' provided\n"; + } +} + +=head2 C<$kw->getFile()> + +Getter for C<$kw->{'file'}>. This is the name of the current output file being +processed. Not to be confused with C or C. + +=cut + +sub getFile +{ + my $self = shift; + + if (defined($self->{'file'})) { + return $self->{'file'}; + } +} + +=head2 C<$kw->setFile()> + +Setter for C<$kw->{'file'}>. + +=cut + +sub setFile { my $self = shift; my $file = shift; if (defined($file)) { - my $name = $file; - $name =~ s/^([a-zA-Z0-9\-_]*)(\..*)?$/$1/; - $name =~ s/-/_/; - $name = uc($name); - die("invalid filename\n") unless ($name =~ m/^[A-Z_]+$/); - $self->{'file'} = $name; + $self->{'file'} = $file; } else { - if (defined($self->{'file'})) { - return $self->{'file'}; + return "No File provided\n"; + } +} + +=head2 C<$kw->getOutfile()> + +Getter for C<$kw->{'outfile'}>. 'outfile' represents the real filepath of the +output file which is currently being processed. + +=cut + +sub getOutfile +{ + my $self = shift; + + if (defined($self->{'outfile'})) { + return $self->{'outfile'}; + } +} + +=head2 C<$kw->setOutfile()> + +Setter for C<$kw->{'outfile'}>. Can be defined manually with a scalar argument, +otherwise the path is constructed from the existing attributes. + +=cut + +sub setOutfile +{ + my $self = shift; + my $path = shift; + + if (defined($path)) { + $self->{'outfile'} = $path || return "Failed to set $path"; + } else { + if ($self->{'singleOutfile'}) { + $self->{'outfile'} = $self->{'priority'} . + '_' . uc($self->{'id'}) . + '.cf'; } else { - return undef; + my $file = $self->{'file'}; + $file =~ s/\//_/g; # Change dir slashes to _ + $file =~ s/(\.[^\.]*)$//g; # Remove extensions + $file = uc($file); # Convert to uppercase for rule names + + $self->{'outfile'} = $self->{'priority'} . + '_' . uc($self->{'id'}) . + '_' . $file . + '.cf'; } } } -=head2 C<$kw->getFile();> +=head2 C<$kw->getFiles($regex);> Simple recursive search for files within a directory. Will validate that each file is readable and return an array of file names. @@ -184,24 +421,63 @@ The regex will be used as a file filter and will only return files that match. sub getFiles { my $self = shift; - my @args = @_; + my $regex = shift; + my $return = ''; foreach (@args) { - die "$_ does not exist\n" unless (-e "$_" || -l "$_"); + $return .= "$_ does not exist\n" unless (-e "$_" || -l "$_"); if (-l $_) { - getFiles(readlink($_)); + $self->getFiles(readlink($_)); } elsif (-d $_) { my @recursive = glob($_."/*"); - getFiles(@recursive); + $self->getFiles(@recursive); } else { - die "$_ is not readable\n" unless (-r "$_"); - push(@{$self->{files_ref}}, $_); + if (defined($regex)) { + if ($_ =~ $regex) { + push(@{$self->{'files_ref'}}, $_); + } else { + next; + } + } else { + $return .= "$_ is not readable\n" unless (-r "$_"); + push(@{$self->{'files_ref'}}, $_); + } } } - return @$self->{files_ref}; + return $return; } -=head2 C<$kw->readFile();> +=head2 C<$kw->nextFile();> + +Shift next message in files_ref queue to current. + +=cut + +sub nextFile +{ + my $self = shift; + if (scalar(@{$self->{'files_ref'}})) { + $self->{'path'} = shift(@{$self->{'files_ref'}}); + } else { + return "End of array\n"; + } + return 0; +} + +=head2 C<$kw->clearFiles();> + +Clear the current file reference queue. + +=cut + +sub clearFiles +{ + my $self = shift; + delete($self->{'rules'}) || return "Failed to delete rules hash\n"; + return 0; +} + +=head2 C<$kw->readFile(%args);> Read in properly formatted keyword list file. The basic format is one keyword per line, an optional score, and an optional list of 'groups'. @@ -218,45 +494,314 @@ When keyword groups are omitted, that keyword defaults to just the 'LOCAL' group Examples with other combinations are: -# word 0 Same as previous -# word 2 Used for LOCAL, not GLOBAL, scores 2 on it's own -# word GROUP GROUP group, not LOCAL, not GLOBAL, no score -# word 0 GROUP Same as previous -# word 1 GROUP GLOBAL LOCAL GROUP group, in GLOBAL, in LOCAL, scores 1 + word 0 # Same as previous + word 2 # Used for LOCAL, not GLOBAL, scores 2 + word GROUP # GROUP group, not LOCAL, not GLOBAL, no score + word 0 GROUP # Same as previous + word 1 GROUP GLOBAL LOCAL # GROUP group, in GLOBAL, in LOCAL, scores 1 + +C<%args> can be used to override the attributes used to index the rules, +including the id, file + +This function intentionally does not iterate through the files queue so that you +can step through and make modifications as you go. =cut -sub readFiles +sub readFile { - my $key_ref = shift; - my @files = @_; + my $self = shift; + my $file = shift || return 'No file provided'; + my %args = @_; + $self->{'file'} = $file; + $self->setOutfile(); + my $n = 0; - foreach my $file (@files) { - my $n = 0; # Track line number for errors - my $name = $file; - $name =~ s/\//_/g; # Change dir slashes to _ - $name =~ s/(\.[^\.]*)*$//g; # Remove extensions - $name = uc($name); # Convert to uppercase for rule names - if (open(my $fh, '<', $file)) { - while (<$fh>) { - $n++; - # Ignore blank lines and comments - if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) { - next; - # Verify formatting - } elsif ($_ =~ m/^([^\s]+)\s+([0-9]+)(?:\s+(.*))?/) { - my ($word, $score, $groups) = ($1, $2, $3); - print "'$word' '$score' " . (join(',',$groups)) . "\n"; - } else { - die "Invalid input in $file, line $n: $_\n"; + if (open(my $fh, '<', $file)) { + my $rules = 0; + while (<$fh>) { + $n++; + # Ignore blank lines and comments + if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) { + next; + # Verify formatting + } elsif (my ($word, $score, $comment, @groups) = $self->readLine($_)) { + if ($self->{'debug'}) { + print "FOUND: '$word' '$score' " . (join(',',@groups)) . "\n"; } + foreach my $group (@groups) { + if ($group eq 'GLOBAL') { + push(@{$self->{'rules'}->{'GLOBAL'}}, $word); + } else { + push(@{$self->{'rules'}->{$self->{'outfile'}}->{$group}}, $word); + } + } + $self->{'rules'}->{$self->{'outfile'}}->{'SCORED'}->{$word} = $score if ($score); + $self->{'rules'}->{$self->{'outfile'}}->{'COMMENTS'}->{$word} = $comment if ($comment); + } elsif ($self->{debug}) { + print STDERR "ERROR: Invalid input in $file, line $n: $_\n"; + } + } + print STDERR "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}} || !$self->{'debug'}); + return "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}}); + } else { + delete($self->{'file'}); + return "Failed to read $file"; + } + return undef; +} + +=head2 C<$kw->readAll();> + +Loops through C for all files in the queue. + +=cut + +sub readAll +{ + my $self = shift; + my @files = ( shift ) || return "No files provided\n"; + while (scalar(@_)) { + push(@files, shift); + } + + my @failed; + foreach my $file (@files) { + if (!-e $file) { + push(@failed, $file); + print STDERR "$file does not exist\n" if ($self->{'debug'}); + } elsif (-l $file) { + # Accept only links within PWD + my $dest = readlink($file); + if ($dest =~ m/^\//) { + $dest =~ s/$ENV{'PWD'}\/// ; + if ($dest eq readlink($file)) { + push(@failed, $file); + print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'}); + next; + } + } + $self->readAll($dest); + } elsif (-f $file) { + push(@failed, "Failed to read $file") if $self->readFile($file); + } elsif (-d $file) { + push(@failed, @_) if $self->readAll(glob($file.'/*')); + # Bash pattern? + } else { + if (my @glob = glob($file)) { + push(@failed, ( @_ )) if $self->readAll(@glob); + } else { + print STDERR "Bad file $file\n" if ($self->{'debug'}); } } } - #print "$_\n" foreach(@$files_ref); + return @failed if (scalar(@failed)); + return undef; } -=head 2 C<$kw->writeFiles($out_dir)> +=head2 C<$kw->readLine($line)> + +Reads a line from a configuration file and returns the relevant values or undef +if the line is not properly formatted. + +=cut + +sub readLine +{ + my $self = shift; + my $line = shift; + my $invalid = shift; + my ($word, $score, $comment, @groups); + if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) { + while (@sections) { + next unless my $section = shift(@sections); + if (defined($comment)) { + $comment .= ' ' if ($comment ne ''); + $comment .= $section; + } elsif (!defined($word) && $section =~ m/^([^\d\s#]\S+)$/) { + $word = $section; + } elsif (defined($word) && $section =~ m/^(\d+(?:\.\d+)?)$/ && !defined($score)) { + $score = $section; + } elsif (defined($word) && $section =~ m/^([^\d\s#]+)$/) { + push(@groups, $section); + } elsif (defined($word) && $section =~ m/^#.*$/ && !defined($comment)) { + $comment = $section; + $comment =~ s/^#\s*//; + } else { + push(@invalid, $section); + } + } + if (scalar(@invalid)) { + if ($self->{'debug'}) { + print("Invalid clauses: '".join("', ", @invalid)."' in '$line'\n"); + } + return undef; + } + return undef unless ($word); + $word = lc($word); + $score //= 0; + $comment //= ''; + @groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups)); + return ( $word, $score, $comment, @groups ); + } + return (); +} + +=head2 C<$kw->joinRules(%rules)> + +Merge rules hash into working output hash. Generally called from C, +but if you want to add rules manually, it will require the following format: + +{ + 'word' => { + 'score' => 0, + 'groups' => ( 'LOCAL' ) + }, + 'other' => { + 'score' => 1, + 'groups' => ( 'GLOBAL', 'group' ) + } +} + +The main context object will provide the working 'ID', 'FILE' and other +attributes necessary to nest these rules in the existing hash. + +=cut + +sub joinRules +{ + my $self = shift; + my %rules = @_; + + if ($self->{'unified'}) { + $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'.cf'; + } else { + $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'_'.$self->{'file'}.'.cf'; + } + foreach my $word (keys(%rules)) { + my $score = $rules{$word}{'score'} || 0; + my @groups = $rules{$word}{'groups'} || ( 'GLOBAL' ); + if ($score) { + $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}->{$word} = $score; + } + foreach my $group (@groups) { + if ($group eq 'GLOBAL') { + if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}))) { + push(@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}, $word); + } else { + $self->{'rules'}->{$self->{'id'}}->{'GLOBAL'} = ( $word ); + } + next; + } + if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}))) { + push(@{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}, $word); + } else { + $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group} = ( $word ); + } + } + } + return 0; +} + +=head2 C<$kw->processMetas($outfile, $file);> + +Create all of the component meta rules for the declared C<$file>. Those that +will be used for the standalone and count rules. Output to C<$outfile>. This +must be run before the other process methods and must be run for 'GLOBAL' first, +otherwise output will be invalid. Meta rules for file-specific words will not be +generated if they are also in the 'GLOBAL' group, instead the meta rules from +the 'GLOBAL' file will be used for the count rules in all other files. This +will prevent duplicates, but also requires that you not rename output files such +that the they appear before the 'GLOBAL' file (without '_C<$file>' at the end). + +=cut + +sub processMetas +{ + my $self = shift; + my $outfile = shift; + my $file = shift; + my $rules = shift; + + my $prefix = $self->{'id'}; + my @words; + if ($file eq 'GLOBAL') { + @words = @{$rules->{'GLOBAL'}}; + } else { + $prefix .= "_".$file; + foreach (keys(%{$rules->{$file}->{'groups'}})) { + next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}); + } + foreach (keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}})) { + next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}); + next if (grep {/^$_$/} @words); + push (@words, $_); + } + } + foreach my $word (@words) { + $self->{'output'}->{$outfile} .= + "body\t__".$prefix."_".uc($word)."_BODY\t/\\b".$word."\\b/\n" . + "header\t__".$prefix."_".uc($word)."_SUBJ\tSubject =~ /\\b".$word."\\b/\n" . + "meta\t__".$prefix."_".uc($word)."\t( __".$prefix."_".uc($word)."_BODY || __".$prefix."_".uc($word)."_SUBJ )\n\n"; + } + +} + +=head2 C<$kw->processWords($outfile,%args);> + +Take a list of words with scores and add them to C<$outfile>. + +=cut + +sub processWords +{ + my $self = shift; + my $out = shift; +} + +=head2 C<$kw->processGroup(%args);> + +Take a single group, including 'GLOBAL' and add it to C<$outfile>. + +=cut + +sub processGroup +{ + my $self = shift; + my $outfile = shift; + + for (my $i = 0; $i < scalar(@all); $i++) { + $files->{$self->{'priority'}.'_'.$self->{'id'}.'cf'} .= + "meta\t".$self->{'id'}."_".$i."\t( ".join(' + ',@all)." ) > $i\n" . + "describe\tMatched ".($i+1)."of keywords: ".join(', ',@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}})."\n\n"; + } +} + +=head2 C<$kw->processAll(%args);> + +Process all groups. + +=cut + +sub processAll +{ + my $self = shift; + my %args = @_; + + $self->{'output'} = {}; + foreach my $id (keys(%{$self->{'rules'}})) { + $self->processMetas($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id}); + $self->processGroups($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id}); + $self->processWords($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id}); + foreach my $file (keys(%{$self->{'rules'}->{$id}})) { + $self->processMetas($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id}); + $self->processGroups($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id}); + $self->processWords($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id}); + } + } +} + +=head2 C<$kw->writeFiles($out_dir)> Output files will use the name of each input file, stripping any extension, and forcing the name to uppercase. Rules in each file will be called: @@ -282,13 +827,60 @@ body for that word. sub writeFiles { my $self = shift; - my $dir = shift; + foreach my $out (keys(%{$self->{'rules'}})) { + unless ($self->{'joinScores'}) { + $file =~ s/(.*)\.cf/$1_SCORES.cf/; + print STDERR $self->{'scores'}; + } + if (open(my $fh, '>', $out)) { + print $fh $self->{'output'}->{$out}; + close($fh); + } else { + print STDERR "Failed to write $out\n"; + } + } - use Data::Dump; - print Data::Dump::dump($self->{keywords}); return 0; } +=head1 MORE + +For discussion of the module and examples, see: + +Ehttps://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator + +=head1 SEE ALSO + +Mail::SpamAssassin +spamassassin + +=head1 BUGS + +Report issues to: + +Ehttps://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issuesE + +=head1 AUTHOR + +John Mertz + +=head1 COPYRIGHT + +Mail::SpamAssassin::KeywordRuleGenerator is distributed under the Apache License +Version 2.0, as described in this file and the file C included with the +distribution. + +=head1 AVAILABILITY + +If possible, the latest version of this library will be made available from CPAN +as well as: + +Ehttps://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGeneratorE + +=cut + +1; + =pod die "Please provide rules file(s) as an argument\n" unless (defined($ARGV[0])); diff --git a/t/02_files.cf b/t/02_files.cf new file mode 100644 index 0000000..4f5b278 --- /dev/null +++ b/t/02_files.cf @@ -0,0 +1 @@ +word diff --git a/t/02_files.dir/02_one.cf b/t/02_files.dir/02_one.cf new file mode 100644 index 0000000..564c438 --- /dev/null +++ b/t/02_files.dir/02_one.cf @@ -0,0 +1,3 @@ +oneone 2 LOCAL +onetwo 0 GLOBAL +onethree three diff --git a/t/02_files.dir/02_two.cf b/t/02_files.dir/02_two.cf new file mode 100644 index 0000000..cfc4ad2 --- /dev/null +++ b/t/02_files.dir/02_two.cf @@ -0,0 +1,3 @@ +twoone 2 LOCAL +twotwo 0 GLOBAL +twothree three diff --git a/t/02_files.lnk b/t/02_files.lnk new file mode 120000 index 0000000..d2114e9 --- /dev/null +++ b/t/02_files.lnk @@ -0,0 +1 @@ +/home/jpm/KeywordRuleGenerator/t/02_files.cf \ No newline at end of file diff --git a/t/02_files.t b/t/02_files.t new file mode 100644 index 0000000..074df91 --- /dev/null +++ b/t/02_files.t @@ -0,0 +1,47 @@ +#!/usr/bin/perl + +# Basic test to ensure files can be found and read. Verification will come later + +use Test::More; + +use lib 'lib/'; +use Mail::SpamAssassin::KeywordRuleGenerator; + +my $id = '02'; + +my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id } ); + +# Read a single file +my $file = 't/02_files.cf'; +ok (!$kw->readFile($file), "Run readFile on $file"); +ok ($kw->getFile($file) == $file, "Get current working file"); +ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules"); +ok ($kw->{'rules'}->{'50_02_T_02_FILES.cf'}, "Loaded correct output name"); +ok (!$kw->clearFiles(), "Clear 'rules' hash"); +ok (!$kw->{'rules'}, "Fetched none after clearing"); + +# Read a directory +my $dir = 't/02_files.dir'; +ok (!$kw->readAll($dir), "Run file on dir"); +ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir"); +ok (!$kw->clearFiles(), "Clear dir files"); + +# Read a symlink +my $link = 't/02_files.lnk'; +ok (!$kw->readAll($link), "Run file on link"); +ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)"); +ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link"); +ok (!$kw->clearFiles(), "Clear link files"); + +# Read multiple mixed +ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed"); +ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all"); +ok (!$kw->clearFiles(), "Clear dir files"); + + +ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file"); +ok (!$kw->getFile(), "Current name undefined for non-existent file"); +ok ($kw->readFile('t/02_unreadable.cf'), "Correctly failed for non-readable file"); +ok (!$kw->getFile(), "Current name undefined for non-readable file"); + +done_testing(); diff --git a/t/03_verify.t b/t/03_verify.t new file mode 100644 index 0000000..e493374 --- /dev/null +++ b/t/03_verify.t @@ -0,0 +1,64 @@ +#!/usr/bin/perl + +# Verify file input line formatting + +use Test::More; + +use lib 'lib/'; +use Mail::SpamAssassin::KeywordRuleGenerator; + +my $id = '03'; + +my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } ); + +my @good = ( + 'Lorem 1 GLOBAL # comment', + 'ipsum 0 GLOBAL # comment', + 'dolor GLOBAL # comment', + 'sit 1 # comment', + 'amet 0 # comment', + 'consectetur # comment', + 'adipiscing 1 GLOBAL', + 'elit 0 GLOBAL', + 'sed GLOBAL', + 'do 1', + 'eiusmod 0', + 'tempor', +); +foreach my $score ( '', 0, 1 ) { + foreach my $group ( '', 'GLOBAL', 'GLOBAL LOCAL' ) { + foreach my $comment ( '', 'TESTING', 'LONGER COMMENT' ) { + my $word = "word"; + my $rule = $word; + if ($score ne '') { + $rule .= ' '.$score; + } + if ($group ne '') { + $rule .= ' '.$group; + } + if ($comment ne '') { + $rule .= ' # '.$comment; + } + my ($rword, $rscore, $rcomment, @rgroups) = $kw->readLine($rule); + ok($rword eq $word, "Word '$word' found for '$rule'"); + ok($rscore eq $score, "Score '$score' found for '$rule'") if ($score ne ''); + ok(join(' ',@rgroups) eq $group, "Groups '$group' found for '$rule'") if ($group ne ''); + ok(join(' ',@rgroups) eq 'LOCAL GLOBAL', "Inferred groups 'LOCAL GLOBAL' found for '$rule'") if ($group eq ''); + ok($rcomment eq $comment, "Comment '$comment' found for '$rule'") if ($comment ne ''); + } + } +} + +# Check for bad formatting + +my %bad = ( + '' => 'Ignore empty line', + '# comment' => "Ignore line starting with comment", + 'word 1 2' => "Ignore line with multiple scores", + '2 bad' => "Ignore line with score first", +); +foreach my $input (keys(%bad)) { + ok(!$kw->readLine($input), "$bad{$input} ($input)"); +} + +done_testing(); diff --git a/t/04_rules.t b/t/04_rules.t new file mode 100644 index 0000000..2680872 --- /dev/null +++ b/t/04_rules.t @@ -0,0 +1,283 @@ +#!/usr/bin/perl + +# Basic test to ensure files can be found and read. Verification will come later + +use Test::More; + +use lib 'lib/'; +use Mail::SpamAssassin::KeywordRuleGenerator; + +my $id = '04'; + +my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } ); + +my @files = ( 't/04_rules0.cf', 't/04_rules1.cf' ); +my @failed = @{$kw->readAll( @files )}; +ok(!scalar(@failed), "Load 'rules' hash with readAll"); + +my $expected = getExpected(); +my ( $missing, $extra, $incorrect ) = 0; +use Data::Dump; +foreach my $file (keys(%{$expected})) { + if ($file eq 'GLOBAL') { + my ($m, $e) = compareGroups( + $expected->{$file}, + $kw->{'rules'}->{$file} + ); + print("Missing $m words\n") if ($m); + print("$e extra words\n") if ($e); + $missing += $m; + $extra += $e; + } else { + foreach my $group (keys(%{$expected->{$file}})) { + if ($group eq 'SCORED') { + my ($m, $e, $i) = compareScores( + $expected->{$file}->{$group}, + $kw->{'rules'}->{$file}->{$group} + ); + $missing += $m; + $extra += $e; + $incorrect += $i; + } elsif ($group eq 'COMMENTS') { + my ($m, $e, $i) = compareScores( + $expected->{$file}->{$group}, + $kw->{'rules'}->{$file}->{$group} + ); + $missing += $m; + $extra += $e; + $incorrect += $i; + } else { + my ($m, $e) = compareGroups( + $expected->{$file}->{$group}, + $kw->{'rules'}->{$file}->{$group} + ); + $missing += $m; + $extra += $e; + } + } + } +} + +ok ($missing == 0, "No expected rules are missing"); +ok ($extra == 0, "No extra rules are found"); +ok ($incorrect == 0, "No incorrect scores are found"); + +done_testing(); + +sub compareGroups +{ + my $expect = shift; + my $loaded = shift; + + my @e = sort(@{$expect}); + my @l = sort(@{$loaded}); + my ($missing, $extra) = (0, 0); + + while (scalar(@e)) { + unless (scalar(@l)) { + print("extra words @l\n"); + $extra += scalar(@e); + last; + } + if ($e[0] eq $l[0]) { + shift(@e); + shift(@l); + next(); + } + if ($e[0] lt $l[0]) { + print("Extra word $e[0]\n"); + $extra++; + shift(@e); + next; + } + if ($e[0] gt $l[0]) { + print("Missing word $l[0]\n"); + $missing++; + shift(@l); + next; + } + } + if (scalar(@e)) { + print("Missing ".scalar(@e)." at the end of parsing\n"); + $missing += scalar(@e); + last; + } + + return ($missing, $extra); +} + +sub compareScores +{ + my $expect = shift; + my $loaded = shift; + + my %remaining = %{$loaded}; + my ($missing, $extra, $incorrect) = (0, 0, 0); + foreach my $word (keys(%$expect)) { + if (!defined($loaded->{$word})) { + print("Missing score assignment for $word\n"); + $missing++; + } elsif ($expect->{$word} != $loaded->{$word}) { + print("Incorrect score assignment for $word\n"); + $incorrect++; + } else { + delete($remaining{$word}); + } + } + $extra = scalar(keys(%remaining)); + print("Extra score assignment for $word\n") foreach (@{$extra}); + + return ($missing, $extra, $incorrect); +} + +sub getExpected +{ + my %expected = ( + 'GLOBAL' => [ + 'lorem', + 'ipsum', + 'dolor', + 'sit', + 'amet', + 'consectetur', + 'adipiscing', + 'elit', + 'sed', + 'do', + 'eiusmod', + 'tempor', + 'minim', + 'veniam', + 'quis', + 'nostrud', + 'exercitation', + 'ullamco', + 'laboris', + 'nisi', + 'ut', + 'aliquip', + 'ex', + 'ea', + 'commodo', + 'consequat', + 'duis', + 'dolore', + 'eu', + 'fugiat', + ], + '50_04_T_04_RULES0.cf' => { + 'SCORED' => { + 'lorem' => 1, + 'sit' => 1, + 'adipiscing' => 1, + 'do' => 1, + 'incididunt' => 1, + 'et' => 1, + 'aliqua' => 1, + 'minim' => 1, + }, + 'COMMENTS' => { + 'lorem' => '1, 1, 1, 1', + 'ipsum' => '1, 1, 1, 0', + 'dolor' => '1, 1, 1, undef', + 'sit' => '1, 1, 0, 1', + 'amet' => '1, 1, 0, 0', + 'consectetur' => '1, 1, 0, undef', + 'adipiscing' => '1, 0, 1, 1', + 'elit' => '1, 0, 1, 0', + 'sed' => '1, 0, 1, undef', + 'do' => '1, 0, 0, 1', + 'eiusmod' => '1, 0, 0, 0', + 'tempor' => '1, 0, 0, undef', + 'incididunt' => '0, 1, 1, 1', + 'ut' => '0, 1, 1, 0', + 'labore' => '0, 1, 1, undef', + 'et' => '0, 1, 0, 1', + 'dolore' => '0, 1, 0, 0', + 'magna' => '0, 1, 0, undef', + 'aliqua' => '0, 0, 1, 1', + 'enim' => '0, 0, 1, 0', + 'ad' => '0, 0, 1, undef', + 'minim' => '0, 0, 0, 1', + 'veniam' => '0, 0, 0, 0', + 'quis' => '0, 0, 0, undef', + }, + 'LOCAL' => [ + 'lorem', + 'ipsum', + 'dolor', + 'sit', + 'amet', + 'consectetur', + 'incididunt', + 'ut', + 'labore', + 'et', + 'dolore', + 'magna', + 'minim', + 'veniam', + 'quis', + ], + 'group' => [ + 'lorem', + 'ipsum', + 'dolor', + 'adipiscing', + 'elit', + 'sed', + 'incididunt', + 'ut', + 'labore', + 'aliqua', + 'enim', + 'ad', + ] + }, + '50_04_T_04_RULES1.cf' => { + 'SCORED' => { + 'nostrud' => 1, + 'laboris' => 1, + 'aliquip' => 1, + 'consequat' => 1, + 'aute' => 1, + 'in' => 1, + 'velit' => 1, + 'dolore' => 1, + }, + 'LOCAL' => [ + 'nostrud', + 'exercitation', + 'ullamco', + 'laboris', + 'nisi', + 'ut', + 'aute', + 'irure', + 'dolor', + 'in', + 'reprehenderit', + 'volupatate', + 'dolore', + 'eu', + 'fugiat', + ], + 'group' => [ + 'nostrud', + 'exercitation', + 'ullamco', + 'aliquip', + 'ex', + 'ea', + 'aute', + 'irure', + 'dolor', + 'velit', + 'esse', + 'cillum', + ] + } + ); + + return \%expected; +} diff --git a/t/04_rules0.cf b/t/04_rules0.cf new file mode 100644 index 0000000..1989c9a --- /dev/null +++ b/t/04_rules0.cf @@ -0,0 +1,28 @@ +# Comments to be ignored. As with following blank line. +# Must test every combination of: +# GLOBAL, LOCAL, GROUP, SCORE + +Lorem 1 GLOBAL LOCAL group # 1, 1, 1, 1 +ipsum 0 GLOBAL LOCAL group # 1, 1, 1, 0 +dolor GLOBAL LOCAL group # 1, 1, 1, undef +sit 1 GLOBAL LOCAL # 1, 1, 0, 1 +amet 0 GLOBAL LOCAL # 1, 1, 0, 0 +consectetur GLOBAL LOCAL # 1, 1, 0, undef +adipiscing 1 GLOBAL group # 1, 0, 1, 1 +elit 0 GLOBAL group # 1, 0, 1, 0 +sed GLOBAL group # 1, 0, 1, undef +do 1 GLOBAL # 1, 0, 0, 1 +eiusmod 0 GLOBAL # 1, 0, 0, 0 +tempor GLOBAL # 1, 0, 0, undef +incididunt 1 LOCAL group # 0, 1, 1, 1 +ut 0 LOCAL group # 0, 1, 1, 0 +labore LOCAL group # 0, 1, 1, undef +et 1 LOCAL # 0, 1, 0, 1 +dolore 0 LOCAL # 0, 1, 0, 0 +magna LOCAL # 0, 1, 0, undef +aliqua 1 group # 0, 0, 1, 1 +enim 0 group # 0, 0, 1, 0 +ad group # 0, 0, 1, undef +minim 1 # 0, 0, 0, 1 +veniam 0 # 0, 0, 0, 0 +quis # 0, 0, 0, undef diff --git a/t/04_rules1.cf b/t/04_rules1.cf new file mode 100644 index 0000000..4d32478 --- /dev/null +++ b/t/04_rules1.cf @@ -0,0 +1,24 @@ +nostrud 1 GLOBAL LOCAL group +exercitation 0 GLOBAL LOCAL group +ullamco GLOBAL LOCAL group +laboris 1 GLOBAL LOCAL +nisi 0 GLOBAL LOCAL +ut GLOBAL LOCAL +aliquip 1 GLOBAL group +ex 0 GLOBAL group +ea GLOBAL group +commodo 0 GLOBAL +consequat 1 GLOBAL +Duis GLOBAL +aute 1 LOCAL group +irure 0 LOCAL group +dolor LOCAL group +in 1 LOCAL +reprehenderit 0 LOCAL +volupatate LOCAL +velit 1 group +esse 0 group +cillum group +dolore 1 +eu 0 +fugiat