Cleanup, fix tests, add script

Script will process all files (recursively) from PWD and write
output to './output'
This commit is contained in:
John Mertz 2023-01-09 23:26:07 -05:00
parent f5b68b2b9a
commit 0aa3153879
Signed by: jpm
GPG Key ID: E9C5EA2D867501AB
8 changed files with 800 additions and 677 deletions

93
Makefile.PL Normal file
View File

@ -0,0 +1,93 @@
use strict;
use warnings;
use 5.006;
use ExtUtils::MakeMaker;
#if ( $^O eq 'MSWin32' ) {
#die "AnyEvent::Sway cannot be used on win32 (unix sockets are missing)";
#}
my %meta = (
name => 'Mail-SpamAssassin-KeywordRuleGenerator',
author => 'John Mertz, C<< <git at john.me.tz> >>',
license => ['apache_2_0'],
'meta-spec' => { version => 2 },
resources => {
repository => {
url => 'git://git.john.me.tz:233/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
type => 'git',
},
bugtracker => {
web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issues',
},
homepage => 'https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator',
license => ['https://www.apache.org/licenses/LICENSE-2.0.html'],
},
);
my %requirements = (
configure_requires => {
'ExtUtils::MakeMaker' => 6.36,
},
build_requires => {
'ExtUtils::MakeMaker' => 6.36
},
runtime_requires => {
'Mail::SpamAssassin' => 0,
},
test_requires => {
'Test::More' => 0.80,
},
);
my %merged_requirements = (
'Mail::SpamAssassin' => 0,
'Test::More' => 0.80,
);
$meta{prereqs}{configure}{requires} = $requirements{configure_requires};
$meta{prereqs}{build}{requires} = $requirements{build_requires};
$meta{prereqs}{runtime}{requires} = $requirements{runtime_requires};
$meta{prereqs}{test}{requires} = $requirements{test_requires};
my %MM_Args = (
AUTHOR => 'John Mertz',
NAME => 'Mail::SpamAssassin::KeywordRuleGenerator',
DISTNAME => 'Mail-SpamAssassin-KeywordRuleGenerator',
EXE_FILES => [],
MIN_PERL_VERSION => '5.006',
VERSION_FROM => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
ABSTRACT_FROM => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
test => {
TESTS => 't/*.t',
},
);
sub is_eumm {
eval { ExtUtils::MakeMaker->VERSION( $_[0] ) };
}
is_eumm(6.30) and $MM_Args{LICENSE} = $meta{license}[0];
is_eumm(6.47_01) or delete $MM_Args{MIN_PERL_VERSION};
is_eumm(6.52)
and $MM_Args{CONFIGURE_REQUIRES} = $requirements{configure_requires};
is_eumm(6.57_02) and !is_eumm(6.57_07) and $MM_Args{NO_MYMETA} = 1;
if ( is_eumm(6.63_03) ) {
%MM_Args = (
%MM_Args,
TEST_REQUIRES => $requirements{test_requires},
BUILD_REQUIRES => $requirements{build_requires},
PREREQ_PM => $requirements{runtime_requires},
);
}
else {
$MM_Args{PREREQ_PM} = \%merged_requirements;
}
unless ( -f 'META.yml' ) {
$MM_Args{META_ADD} = \%meta;
}
WriteMakefile(%MM_Args);

34
bin/generate-keyword-rules Executable file
View File

@ -0,0 +1,34 @@
#!/usr/bin/perl
# Create a directory called 'output' with generated files for all config files in the $PWD
use lib '../lib/';
use Mail::SpamAssassin::KeywordRuleGenerator;
use strict;
use warnings;
my $id = 'MC';
my $dir = 'output';
# setup
my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' =>$id, 'debug' => 0, 'joinScores' => 0 , 'dir' => $dir } );
# clean
unlink(glob("$dir/*"));
rmdir($dir);
# get files in PWD
my @files = glob("./*");
my @clean;
$kw->createDir($dir);
my @failed = $kw->readAll( @files );
die scalar(@failed)." error(s) - ".join(', ', @failed)."\n" if (scalar(@failed));
$kw->writeAll();
use Mail::SpamAssassin;
my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => "./$dir", 'pre_config_text' => "loadplugin Mail::SpamAssassin::Plugin::Check" } );
my $fail = $sa->lint_rules();
die "Failed to verify with SpamAssassin: $fail" if ($fail);

View File

@ -48,20 +48,20 @@ The sets of keywords can be broken up into groups (see GROUPS).
Requires C<spamassassin> executable and the following Perl modules
To::Be::Determined
Mail::SpamAssassin
=cut
#use strict;
use strict;
use warnings;
=head2 FILES
There are built-in functions to ingest formatted list files. See C<readFiles>
There are built-in functions to ingest formatted list files. See C<readFile>
method. By default, the output file name and the rules therein will use a
stripped and capitalized version of those filenames.
$kw->readFiles( 'example.txt' );
$kw->readAll( 'example.cf', 'example.txt' );
$kw->writeAll();
This will creates rules formatted like:
@ -70,20 +70,44 @@ This will creates rules formatted like:
and will output to the file:
70_id_example.cf
70_ID_EXAMPLE.cf
Leading './' or $PWD will be stripped, but other paths will be included with '_'
substituted '/'. For example:
$kw->readAll ( './example.cf', '/to/pwd/example2.cf', 'dir/example3.cf' );
will produce:
70_KW_EXAMPLE.cf
70_KW_EXAMPLE2.cf
70_KW_DIR_EXAMPLE3.cf
If there are any rules in the 'GLOBAL' group, or if in C<singleOutfile> mode,
a file with just the priority and keyword will be produced with the select rules
or all rules, respectively. This will look like:
70_KW.cf
See the C<generate*> methods for more information on this formatting. Also see
the C<new> method for discussion of the 'id'.
Finally, a like file:
Finally, unless in C<joinScores> mode, each configuration will have a matching
scores file like:
71_id_scores.cf
70_KW_EXAMPLE_SCORES.cf
70_KW_SCORES.cf
Will be created with the scores for all of the rules in the prior file(s). The
C<join_scores> variable is true by default, creating the above file. If made
false, then will determine a unique score file will be created for each file.
Alternatively, C<append_scores> can be set to include the scores directly in the
config file with the rule definitions.
In C<joinScores> mode, the scores will be directly appended in the same config
file with the rule definitions.
Note that the global files may have an incremented priority since it requires
all meta rules from the other files be defined first. For example, the above
would be left with 70 for the global priority, however if the C<id> were lower
in the alphabet, the global files would be incremented:
70_AB_EXAMPLES.cf
71_AB.cf
=head2 RULES
@ -145,7 +169,6 @@ our %EXPORT_TAGS = (
'getGlobalOutput',
'getGlobalScoresOutput',
'clearFile',
'getFiles',
'nextFile',
'readFile',
'getFile',
@ -158,7 +181,8 @@ our %EXPORT_TAGS = (
'generateGlobals',
'generateAll',
'writeFile',
'writeAll'
'writeAll',
'verifyOutput'
]
);
@ -326,7 +350,6 @@ sub setPriority
my $priority = shift;
if (defined($priority)) {
$path = uc($priority);
$self->{'priority'} = $priority || return "Failed to set priority: $priority";
} else {
return "No 'priority' provided\n";
@ -504,7 +527,7 @@ alphabetically after the last file with 'meta' rules. C<$file> can be used to
bypass that check, but it might lead to rules that do not work.
Will try to select a filename as close to the existing output files as possible.
First it will simply duplicate a name if it is in C<singleOutput> mode. Then,
First it will simply duplicate a name if it is in C<singleOutfile> mode. Then,
it will try to use the base name without the C<$file> portion. This will
generally not work because the '_' before the file will come after the dot
without:
@ -528,11 +551,11 @@ sub setGlobalOutfile
my $path = shift;
if (defined($path)) {
$self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
$self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
return undef;
}
if ($self->{'singleOutfile'}) {
$self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
$self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
return undef;
}
my $last;
@ -541,7 +564,7 @@ sub setGlobalOutfile
}
my $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId().".cf";
if (defined($last) && $file gt $last) {
$self->{'filename'}->{'GLOBAL'} = $file;
$self->{'filemap'}->{'GLOBAL'} = $file;
return undef;
}
if ($self->getPriority < 99) {
@ -550,7 +573,7 @@ sub setGlobalOutfile
}
$file =~ s/_/__/ unless ($file gt $last);
if ($file gt $last) {
$self->{'filename'}->{'GLOBAL'} = $file;
$self->{'filemap'}->{'GLOBAL'} = $file;
return undef;
}
return ("Cannot determine a valid GLOBAL output file\n");
@ -603,7 +626,7 @@ sub setGlobalScoresOutfile
if (defined($self->{'filemap'}->{'GLOBAL_SCORES'}) && $self->{'filemap'}->{'GLOBAL_SCORES'} gt $self->{'filemap'}->{'GLOBAL'}) {
return undef;
}
my $file = $self->{'filemap'}->{'GLOBAL'};
my $file = $self->getGlobalOutfile() || die "HUH?";
$file =~ s/\.cf$/_SCORES.cf/;
if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
$self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
@ -611,20 +634,20 @@ sub setGlobalScoresOutfile
}
$file = $self->getDir().'/'.$self->getPriority()."_".$self->getId()."_SCORES.cf";
if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
$self->{'filename'}->{'GLOBAL_SCORES'} = $file;
$self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
return undef;
}
my $gpriority = $self->{'filename'}->{'GLOBAL'};
my $gpriority = $self->{'filemap'}->{'GLOBAL'};
$gpriority =~ s/^(\d\d).*/$1/;
$file = $self->getDir().'/'.($gpriority+1)."_".$self->getId()."_SCORES.cf" if ($gpriority < 99);
if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
$self->{'filename'}->{'GLOBAL_SCORES'} = $file;
$self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
return undef;
}
$file = $self->{'filename'}->{'GLOBAL'};
$file = $self->{'filemap'}->{'GLOBAL'};
$file =~ s/_/__/;
if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
$self->{'filename'}->{'GLOBAL_SCORES'} = $file;
$self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
return undef;
}
return ("Cannot determine a valid GLOBAL output file\n");
@ -707,65 +730,6 @@ sub getGlobalScoresOutput
}
}
=head2 C<$kw->getFiles($regex);>
Simple recursive search for files within a directory. Will validate that each
file is readable and return an array of file names.
Expects a single file or directory path scalar as first argument and an optional
regex as the secord. If you have multiple entries to fetch, run separately and
append to your array.
The regex will be used as a file filter and will only return files that match.
=cut
sub getFiles
{
my $self = shift;
my $regex = shift;
my $return = '';
foreach (@args) {
$return .= "$_ does not exist\n" unless (-e "$_" || -l "$_");
if (-l $_) {
$self->getFiles(readlink($_));
} elsif (-d $_) {
my @recursive = glob($_."/*");
$self->getFiles(@recursive);
} else {
if (defined($regex)) {
if ($_ =~ $regex) {
push(@{$self->{'files_ref'}}, $_);
} else {
next;
}
} else {
$return .= "$_ is not readable\n" unless (-r "$_");
push(@{$self->{'files_ref'}}, $_);
}
}
}
return $return;
}
=head2 C<$kw->nextFile();>
Shift next message in files_ref queue to current.
=cut
sub nextFile
{
my $self = shift;
if (scalar(@{$self->{'files_ref'}})) {
$self->{'path'} = shift(@{$self->{'files_ref'}});
} else {
return "End of array\n";
}
return 0;
}
=head2 C<$kw->clearFile();>
Clear the current file reference queue.
@ -780,6 +744,21 @@ sub clearFile
return 0;
}
=head2 C<$kw->clearAll();>
Clear all file reference queues.
=cut
sub clearAll
{
my $self = shift;
foreach my $file (keys(%{$self->{'rules'}})) {
%{$self->{'rules'}->{'DEFAULT'}} = () if ($file eq 'DEFAULT');
$self->clearFile($file);
}
return 0;
}
=head2 C<$kw->readFile(%args);>
Read in properly formatted keyword list file. The basic format is one keyword
@ -816,6 +795,10 @@ sub readFile
my $self = shift;
my $file = shift || return 'No file provided';
my %args = @_;
# Clean to relative path
$file =~ s/^\.\///;
$file =~ s/^$ENV{'PWD'}\///;
$self->getDir();
$self->{'file'} = $file;
$self->setOutfile();
@ -866,15 +849,12 @@ Loops through C<readFile> for all files in the queue.
sub readAll
{
my $self = shift;
my @files = ( shift ) || return "No files provided\n";
while (scalar(@_)) {
push(@files, shift);
}
my @files = @_;
my @failed;
foreach my $file (@files) {
if (!-e $file) {
push(@failed, $file);
push(@failed, "$file does not exist\n");
print STDERR "$file does not exist\n" if ($self->{'debug'});
} elsif (-l $file) {
# Accept only links within PWD
@ -882,7 +862,7 @@ sub readAll
if ($dest =~ m/^\//) {
$dest =~ s/$ENV{'PWD'}\/// ;
if ($dest eq readlink($file)) {
push(@failed, $file);
push(@failed, "Symlink outside of PWD $file\n");
print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
next;
}
@ -891,7 +871,7 @@ sub readAll
} elsif (-f $file) {
push(@failed, "Failed to read $file") if $self->readFile($file);
} elsif (-d $file) {
push(@failed, @_) if $self->readAll(glob($file.'/*'));
push(@failed, $self->readAll(glob($file.'/*')) );
# Bash pattern?
} else {
if (my @glob = glob($file)) {
@ -901,8 +881,7 @@ sub readAll
}
}
}
return @failed if (scalar(@failed));
return undef;
return @failed;
}
=head2 C<$kw->readLine($line)>
@ -916,7 +895,7 @@ sub readLine
{
my $self = shift;
my $line = shift;
my $invalid = shift;
my @invalid;
my ($word, $score, $comment, @groups);
if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
while (@sections) {
@ -947,7 +926,7 @@ sub readLine
$word = lc($word);
$score //= 0;
$comment //= '';
@groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups));
@groups = ( 'LOCAL' ) unless (scalar(@groups));
return ( $word, $score, $comment, @groups );
}
return ();
@ -1276,6 +1255,7 @@ sub writeFile
my $file = shift || return "Requires filename";
return "Nothing generated for $file" unless (defined($self->{'output'}->{$file}));
return unless (scalar(@{$self->{'output'}->{$file}}));
if (open(my $fh, ">", $file)) {
foreach my $line (@{$self->{'output'}->{$file}}) {
print $fh $line;
@ -1302,6 +1282,25 @@ sub writeAll
return ("Errors:\n ".join("\n ", @errors)) if (scalar(@errors));
}
=head2 C<$kw->verifyOutput()>
Process output directory with SpamAssassin linter. Return errors or
undef if successful. An optional C<$path> can be provided to test a
different file or directory.
=cut
sub verifyOutput
{
my $self = shift;
my $path = shift || $self->{'dir'};
use Mail::SpamAssassin;
my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $path } );
return $sa->lint_rules();
}
=head1 MORE
For discussion of the module and examples, see:

View File

@ -16,7 +16,7 @@ my $defaults = {
};
my $args = {
'id' => '01',
'id' => 'TEST',
'priority' => 10,
'debug' => 1,
'singleOutfile' => 1,

View File

@ -1 +0,0 @@
/home/jpm/KeywordRuleGenerator/t/02_files.cf

1
t/02_files.lnk Normal file
View File

@ -0,0 +1 @@
word

View File

@ -18,26 +18,26 @@ ok ($kw->getFile($file) == $file, "Get current working file");
ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules");
ok ($kw->{'rules'}->{$file}, "Loaded correct hash key");
ok ($kw->{'filemap'}->{$file} =~ m/50_02_T_02_FILES.cf$/, "Loaded correct output name");
ok (!$kw->clearFiles(), "Clear 'rules' hash");
ok (!$kw->{'rules'}, "Fetched none after clearing");
ok (!$kw->clearAll(), "Clear 'rules' hash");
ok (!scalar(keys(%{$kw->{'rules'}})), "Fetched none after clearing");
# Read a directory
my $dir = 't/02_files.dir';
ok (!$kw->readAll($dir), "Run file on dir");
ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir");
ok (!$kw->clearFiles(), "Clear dir files");
ok (!$kw->clearAll(), "Clear dir files");
# Read a symlink
my $link = 't/02_files.lnk';
ok (!$kw->readAll($link), "Run file on link");
ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)");
ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link");
ok (!$kw->clearFiles(), "Clear link files");
ok (!$kw->clearAll(), "Clear link files");
# Read multiple mixed
ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed");
ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all");
ok (!$kw->clearFiles(), "Clear dir files");
ok (scalar(keys(%{$kw->{'rules'}})) == 5, "Fetched correct number of files from all");
ok (!$kw->clearAll(), "Clear dir files");
ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file");

View File

@ -17,7 +17,6 @@ ok(!scalar(@failed), "Load 'rules' hash with readAll");
my $expected = getExpected();
my ( $missing, $extra, $incorrect ) = 0;
use Data::Dump;
foreach my $file (keys(%{$expected})) {
if ($file eq 'GLOBAL') {
my ($m, $e) = compareValues(

View File

@ -46,10 +46,8 @@ foreach (@files) {
}
ok (!scalar(keys(%remaining)), "All expected output files found");
use Mail::SpamAssassin;
my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $testdir } );
$failed = $sa->lint_rules();
ok (!$failed, "Verified by spamassassin".($res ? "\n$failed" : ""));
$failed = $kw->verifyOutput();
ok (!$failed, "Verified by spamassassin".($failed ? "\n$failed" : ""));
$kw->cleanDir();