Cleanup, fix tests, add script

Script will process all files (recursively) from PWD and write output to './output'
2023-01-09 23:26:07 -05:00 · 2023-01-09 23:26:07 -05:00 · 0aa3153879
parent f5b68b2b9a
commit 0aa3153879
8 changed files with 800 additions and 677 deletions
--- a/Makefile.PL
+++ b/Makefile.PL
@ -0,0 +1,93 @@
 use strict;
 use warnings;
 use 5.006;
 use ExtUtils::MakeMaker;
 #if ( $^O eq 'MSWin32' ) {
    #die "AnyEvent::Sway cannot be used on win32 (unix sockets are missing)";
 #}
 my %meta = (
    name        => 'Mail-SpamAssassin-KeywordRuleGenerator',
    author      => 'John Mertz, C<< <git at john.me.tz> >>',
    license     => ['apache_2_0'],
    'meta-spec' => { version => 2 },
    resources   => {
        repository => {
            url => 'git://git.john.me.tz:233/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
            web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
            type => 'git',
        },
        bugtracker => {
            web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issues',
        },
        homepage => 'https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator',
        license  => ['https://www.apache.org/licenses/LICENSE-2.0.html'],
    },
 );
 my %requirements = (
    configure_requires => {
        'ExtUtils::MakeMaker' => 6.36,
    },
    build_requires => {
        'ExtUtils::MakeMaker' => 6.36
    },
    runtime_requires => {
        'Mail::SpamAssassin' => 0,
    },
    test_requires => {
        'Test::More' => 0.80,
    },
 );
 my %merged_requirements = (
    'Mail::SpamAssassin' => 0,
    'Test::More' => 0.80,
 );
 $meta{prereqs}{configure}{requires} = $requirements{configure_requires};
 $meta{prereqs}{build}{requires}     = $requirements{build_requires};
 $meta{prereqs}{runtime}{requires}   = $requirements{runtime_requires};
 $meta{prereqs}{test}{requires}      = $requirements{test_requires};
 my %MM_Args = (
    AUTHOR           => 'John Mertz',
    NAME             => 'Mail::SpamAssassin::KeywordRuleGenerator',
    DISTNAME         => 'Mail-SpamAssassin-KeywordRuleGenerator',
    EXE_FILES        => [],
    MIN_PERL_VERSION => '5.006',
    VERSION_FROM     => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
    ABSTRACT_FROM    => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
    test             => {
        TESTS => 't/*.t',
    },
 );
 sub is_eumm {
    eval { ExtUtils::MakeMaker->VERSION( $_[0] ) };
 }
 is_eumm(6.30) and $MM_Args{LICENSE} = $meta{license}[0];
 is_eumm(6.47_01) or delete $MM_Args{MIN_PERL_VERSION};
 is_eumm(6.52)
  and $MM_Args{CONFIGURE_REQUIRES} = $requirements{configure_requires};
 is_eumm(6.57_02) and !is_eumm(6.57_07) and $MM_Args{NO_MYMETA} = 1;
 if ( is_eumm(6.63_03) ) {
    %MM_Args = (
        %MM_Args,
        TEST_REQUIRES  => $requirements{test_requires},
        BUILD_REQUIRES => $requirements{build_requires},
        PREREQ_PM      => $requirements{runtime_requires},
    );
 }
 else {
    $MM_Args{PREREQ_PM} = \%merged_requirements;
 }
 unless ( -f 'META.yml' ) {
    $MM_Args{META_ADD} = \%meta;
 }
 WriteMakefile(%MM_Args);
--- a/bin/generate-keyword-rules
+++ b/bin/generate-keyword-rules
@ -0,0 +1,34 @@
 #!/usr/bin/perl
 # Create a directory called 'output' with generated files for all config files in the $PWD
 use lib '../lib/';
 use Mail::SpamAssassin::KeywordRuleGenerator;
 use strict;
 use warnings;
 my $id = 'MC';
 my $dir = 'output';
 # setup
 my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' =>$id, 'debug' => 0, 'joinScores' => 0 , 'dir' => $dir } );
 # clean
 unlink(glob("$dir/*"));
 rmdir($dir);
 # get files in PWD
 my @files = glob("./*");
 my @clean;
 $kw->createDir($dir);
 my @failed = $kw->readAll( @files );
 die scalar(@failed)." error(s) - ".join(', ', @failed)."\n" if (scalar(@failed));
 $kw->writeAll();
 use Mail::SpamAssassin;
 my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => "./$dir", 'pre_config_text' => "loadplugin Mail::SpamAssassin::Plugin::Check" } );
 my $fail = $sa->lint_rules();
 die "Failed to verify with SpamAssassin: $fail" if ($fail);
--- a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
+++ b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
@ -48,20 +48,20 @@ The sets of keywords can be broken up into groups (see GROUPS).
 Requires C<spamassassin> executable and the following Perl modules
-        To::Be::Determined
+    Mail::SpamAssassin
 =cut
-#use strict;
+use strict;
 use warnings;
 =head2 FILES
-There are built-in functions to ingest formatted list files. See C<readFiles>
+There are built-in functions to ingest formatted list files. See C<readFile>
 method. By default, the output file name and the rules therein will use a
 stripped and capitalized version of those filenames.
-    $kw->readFiles( 'example.txt' );
+    $kw->readAll( 'example.cf', 'example.txt' );
    $kw->writeAll();
 This will creates rules formatted like:
@ -70,20 +70,44 @@ This will creates rules formatted like:
 and will output to the file:
-    70_id_example.cf
+    70_ID_EXAMPLE.cf
 Leading './' or $PWD will be stripped, but other paths will be included with '_'
 substituted '/'. For example:
    $kw->readAll ( './example.cf', '/to/pwd/example2.cf', 'dir/example3.cf' );
 will produce:
    70_KW_EXAMPLE.cf
    70_KW_EXAMPLE2.cf
    70_KW_DIR_EXAMPLE3.cf
 If there are any rules in the 'GLOBAL' group, or if in C<singleOutfile> mode,
 a file with just the priority and keyword will be produced with the select rules
 or all rules, respectively. This will look like:
    70_KW.cf
 See the C<generate*> methods for more information on this formatting. Also see
 the C<new> method for discussion of the 'id'.
-Finally, a like file:
+Finally, unless in C<joinScores> mode, each configuration will have a matching
 scores file like:
-    71_id_scores.cf
+    70_KW_EXAMPLE_SCORES.cf
    70_KW_SCORES.cf
-Will be created with the scores for all of the rules in the prior file(s). The
+In C<joinScores> mode, the scores will be directly appended in the same config
-C<join_scores> variable is true by default, creating the above file. If made
+file with the rule definitions.
-false, then will determine a unique score file will be created for each file.
+
-Alternatively, C<append_scores> can be set to include the scores directly in the
+Note that the global files may have an incremented priority since it requires
-config file with the rule definitions.
+all meta rules from the other files be defined first. For example, the above
 would be left with 70 for the global priority, however if the C<id> were lower
 in the alphabet, the global files would be incremented:
    70_AB_EXAMPLES.cf
    71_AB.cf
 =head2 RULES
@ -145,7 +169,6 @@ our %EXPORT_TAGS = (
        'getGlobalOutput',
        'getGlobalScoresOutput',
        'clearFile',
                'getFiles',
        'nextFile',
        'readFile',
        'getFile',
@ -158,7 +181,8 @@ our %EXPORT_TAGS = (
        'generateGlobals',
        'generateAll',
        'writeFile',
-                'writeAll'
+        'writeAll',
        'verifyOutput'
    ]
 );
@ -326,7 +350,6 @@ sub setPriority
    my $priority = shift;
    if (defined($priority)) {
                $path = uc($priority);
        $self->{'priority'} = $priority || return "Failed to set priority: $priority";
    } else {
        return "No 'priority' provided\n";
@ -504,7 +527,7 @@ alphabetically after the last file with 'meta' rules. C<$file> can be used to
 bypass that check, but it might lead to rules that do not work.
 Will try to select a filename as close to the existing output files as possible.
-First it will simply duplicate a name if it is in C<singleOutput> mode. Then,
+First it will simply duplicate a name if it is in C<singleOutfile> mode. Then,
 it will try to use the base name without the C<$file> portion. This will
 generally not work because the '_' before the file will come after the dot
 without:
@ -528,11 +551,11 @@ sub setGlobalOutfile
    my $path = shift;
    if (defined($path)) {
-                $self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
+        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    if ($self->{'singleOutfile'}) {
-                $self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
+        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    my $last;
@ -541,7 +564,7 @@ sub setGlobalOutfile
    }
    my $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId().".cf";
    if (defined($last) && $file gt $last) {
-                $self->{'filename'}->{'GLOBAL'} = $file;
+        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    if ($self->getPriority < 99) {
@ -550,7 +573,7 @@ sub setGlobalOutfile
    }
    $file =~ s/_/__/ unless ($file gt $last);
    if ($file gt $last) {
-                $self->{'filename'}->{'GLOBAL'} = $file;
+        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
@ -603,7 +626,7 @@ sub setGlobalScoresOutfile
    if (defined($self->{'filemap'}->{'GLOBAL_SCORES'}) && $self->{'filemap'}->{'GLOBAL_SCORES'} gt $self->{'filemap'}->{'GLOBAL'}) {
        return undef;
    }
-        my $file = $self->{'filemap'}->{'GLOBAL'};
+    my $file = $self->getGlobalOutfile() || die "HUH?";
    $file =~ s/\.cf$/_SCORES.cf/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
@ -611,20 +634,20 @@ sub setGlobalScoresOutfile
    }
    $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId()."_SCORES.cf";
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
-        my $gpriority = $self->{'filename'}->{'GLOBAL'};
+    my $gpriority = $self->{'filemap'}->{'GLOBAL'};
    $gpriority =~ s/^(\d\d).*/$1/;
    $file = $self->getDir().'/'.($gpriority+1)."_".$self->getId()."_SCORES.cf" if ($gpriority < 99);
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
-        $file = $self->{'filename'}->{'GLOBAL'};
+    $file = $self->{'filemap'}->{'GLOBAL'};
    $file =~ s/_/__/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
@ -707,65 +730,6 @@ sub getGlobalScoresOutput
    }
 }
 =head2 C<$kw->getFiles($regex);>
 Simple recursive search for files within a directory. Will validate that each
 file is readable and return an array of file names.
 Expects a single file or directory path scalar as first argument and an optional
 regex as the secord. If you have multiple entries to fetch, run separately and
 append to your array.
 The regex will be used as a file filter and will only return files that match.
 =cut
 sub getFiles
 {
        my $self = shift;
        my $regex = shift;
        my $return = '';
        foreach (@args) {
                $return .= "$_ does not exist\n" unless (-e "$_" || -l "$_");
                if (-l $_) {
                        $self->getFiles(readlink($_));
                } elsif (-d $_) {
                        my @recursive = glob($_."/*");
                        $self->getFiles(@recursive);
                } else {
                        if (defined($regex)) {
                                if ($_ =~ $regex) {
                                        push(@{$self->{'files_ref'}}, $_);
                                } else {
                                        next;
                                }
                        } else {
                                $return .= "$_ is not readable\n" unless (-r "$_");
                                push(@{$self->{'files_ref'}}, $_);
                        }
                }
        }
        return $return;
 }
 =head2 C<$kw->nextFile();>
 Shift next message in files_ref queue to current.
 =cut
 sub nextFile
 {
        my $self = shift;
        if (scalar(@{$self->{'files_ref'}})) {
                $self->{'path'} = shift(@{$self->{'files_ref'}});
        } else {
                return "End of array\n";
        }
        return 0;
 }
 =head2 C<$kw->clearFile();>
 Clear the current file reference queue.
@ -780,6 +744,21 @@ sub clearFile
    return 0;
 }
 =head2 C<$kw->clearAll();>
 Clear all file reference queues.
 =cut
 sub clearAll
 {
    my $self = shift;
    foreach my $file (keys(%{$self->{'rules'}})) {
        %{$self->{'rules'}->{'DEFAULT'}} = () if ($file eq 'DEFAULT');
        $self->clearFile($file);
    }
    return 0;
 }
 =head2 C<$kw->readFile(%args);>
 Read in properly formatted keyword list file. The basic format is one keyword
@ -816,6 +795,10 @@ sub readFile
    my $self = shift;
    my $file = shift || return 'No file provided';
    my %args = @_;
    # Clean to relative path
    $file =~ s/^\.\///;
    $file =~ s/^$ENV{'PWD'}\///;
    $self->getDir();
    $self->{'file'} = $file;
    $self->setOutfile();
@ -866,15 +849,12 @@ Loops through C<readFile> for all files in the queue.
 sub readAll
 {
    my $self = shift;
-        my @files = ( shift ) || return "No files provided\n";
+    my @files = @_;
        while (scalar(@_)) {
                push(@files, shift);
        }
    my @failed;
    foreach my $file (@files) {
        if (!-e $file) {
-                        push(@failed, $file);
+            push(@failed, "$file does not exist\n");
            print STDERR "$file does not exist\n" if ($self->{'debug'});
        } elsif (-l $file) {
            # Accept only links within PWD
@ -882,7 +862,7 @@ sub readAll
            if ($dest =~ m/^\//) {
                $dest =~ s/$ENV{'PWD'}\/// ;
                if ($dest eq readlink($file)) {
-                                        push(@failed, $file);
+                    push(@failed, "Symlink outside of PWD $file\n");
                    print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
                    next;
                }
@ -891,7 +871,7 @@ sub readAll
        } elsif (-f $file) {
            push(@failed, "Failed to read $file") if $self->readFile($file);
        } elsif (-d $file) {
-                        push(@failed, @_) if $self->readAll(glob($file.'/*'));
+            push(@failed, $self->readAll(glob($file.'/*')) );
        # Bash pattern?
        } else {
            if (my @glob = glob($file)) {
@ -901,8 +881,7 @@ sub readAll
            }
        }
    }
-        return @failed if (scalar(@failed));
+    return @failed;
        return undef;
 }
 =head2 C<$kw->readLine($line)>
@ -916,7 +895,7 @@ sub readLine
 {
    my $self = shift;
    my $line = shift;
-        my $invalid = shift;
+    my @invalid;
    my ($word, $score, $comment, @groups);
    if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
        while (@sections) {
@ -947,7 +926,7 @@ sub readLine
        $word = lc($word);
        $score //= 0;
        $comment //= '';
-                @groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups));
+        @groups = ( 'LOCAL' ) unless (scalar(@groups));
        return ( $word, $score, $comment, @groups );
    }
    return ();
@ -1276,6 +1255,7 @@ sub writeFile
    my $file = shift || return "Requires filename";
    return "Nothing generated for $file" unless (defined($self->{'output'}->{$file}));
    return unless (scalar(@{$self->{'output'}->{$file}}));
    if (open(my $fh, ">", $file)) {
        foreach my $line (@{$self->{'output'}->{$file}}) {
            print $fh $line;
@ -1302,6 +1282,25 @@ sub writeAll
    return ("Errors:\n ".join("\n ", @errors)) if (scalar(@errors));
 }
 =head2 C<$kw->verifyOutput()>
 Process output directory with SpamAssassin linter. Return errors or
 undef if successful. An optional C<$path> can be provided to test a
 different file or directory.
 =cut
 sub verifyOutput
 {
    my $self = shift;
    my $path = shift || $self->{'dir'};
    use Mail::SpamAssassin;
    my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $path } );
    return $sa->lint_rules();
 }
 =head1 MORE
 For discussion of the module and examples, see:
--- a/t/01_attributes.t
+++ b/t/01_attributes.t
@ -16,7 +16,7 @@ my $defaults = {
 };
 my $args = {
-        'id'            => '01',
+        'id'            => 'TEST',
        'priority'      => 10,
        'debug'         => 1,
        'singleOutfile' => 1,
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -1 +0,0 @@
 /home/jpm/KeywordRuleGenerator/t/02_files.cf
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -0,0 +1 @@
 word
--- a/t/02_files.t
+++ b/t/02_files.t
@ -18,26 +18,26 @@ ok ($kw->getFile($file) == $file, "Get current working file");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules");
 ok ($kw->{'rules'}->{$file}, "Loaded correct hash key");
 ok ($kw->{'filemap'}->{$file} =~ m/50_02_T_02_FILES.cf$/, "Loaded correct output name");
-ok (!$kw->clearFiles(), "Clear 'rules' hash");
+ok (!$kw->clearAll(), "Clear 'rules' hash");
-ok (!$kw->{'rules'}, "Fetched none after clearing");
+ok (!scalar(keys(%{$kw->{'rules'}})), "Fetched none after clearing");
 # Read a directory
 my $dir = 't/02_files.dir';
 ok (!$kw->readAll($dir), "Run file on dir");
 ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir");
-ok (!$kw->clearFiles(), "Clear dir files");
+ok (!$kw->clearAll(), "Clear dir files");
 # Read a symlink
 my $link = 't/02_files.lnk';
 ok (!$kw->readAll($link), "Run file on link");
 ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link");
-ok (!$kw->clearFiles(), "Clear link files");
+ok (!$kw->clearAll(), "Clear link files");
 # Read multiple mixed
 ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed");
-ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all");
+ok (scalar(keys(%{$kw->{'rules'}})) == 5, "Fetched correct number of files from all");
-ok (!$kw->clearFiles(), "Clear dir files");
+ok (!$kw->clearAll(), "Clear dir files");
 ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file");
--- a/t/04_rules.t
+++ b/t/04_rules.t
@ -17,7 +17,6 @@ ok(!scalar(@failed), "Load 'rules' hash with readAll");
 my $expected = getExpected();
 my ( $missing, $extra, $incorrect ) = 0;
 use Data::Dump;
 foreach my $file (keys(%{$expected})) {
        if ($file eq 'GLOBAL') {
                my ($m, $e) = compareValues(
--- a/t/05_write.t
+++ b/t/05_write.t
@ -46,10 +46,8 @@ foreach (@files) {
 }
 ok (!scalar(keys(%remaining)), "All expected output files found");
-use Mail::SpamAssassin;
+$failed = $kw->verifyOutput();
-my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $testdir } );
+ok (!$failed, "Verified by spamassassin".($failed ? "\n$failed" : ""));
 $failed = $sa->lint_rules();
 ok (!$failed, "Verified by spamassassin".($res ? "\n$failed" : ""));
 $kw->cleanDir();
		`@ -1 +0,0 @@`
			`/home/jpm/KeywordRuleGenerator/t/02_files.cf`