Cleanup, fix tests, add script

Script will process all files (recursively) from PWD and write output to './output'
2023-01-09 23:26:07 -05:00 · 2023-01-09 23:26:07 -05:00 · 0aa3153879
parent f5b68b2b9a
commit 0aa3153879
8 changed files with 800 additions and 677 deletions
--- a/Makefile.PL
+++ b/Makefile.PL
@ -0,0 +1,93 @@
+use strict;
+use warnings;
+
+use 5.006;
+use ExtUtils::MakeMaker;
+
+#if ( $^O eq 'MSWin32' ) {
+    #die "AnyEvent::Sway cannot be used on win32 (unix sockets are missing)";
+#}
+
+my %meta = (
+    name        => 'Mail-SpamAssassin-KeywordRuleGenerator',
+    author      => 'John Mertz, C<< <git at john.me.tz> >>',
+    license     => ['apache_2_0'],
+    'meta-spec' => { version => 2 },
+    resources   => {
+        repository => {
+            url => 'git://git.john.me.tz:233/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
+            web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator',
+            type => 'git',
+        },
+        bugtracker => {
+            web => 'https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issues',
+        },
+        homepage => 'https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator',
+        license  => ['https://www.apache.org/licenses/LICENSE-2.0.html'],
+    },
+);
+
+my %requirements = (
+    configure_requires => {
+        'ExtUtils::MakeMaker' => 6.36,
+    },
+    build_requires => {
+        'ExtUtils::MakeMaker' => 6.36
+    },
+    runtime_requires => {
+        'Mail::SpamAssassin' => 0,
+    },
+    test_requires => {
+        'Test::More' => 0.80,
+    },
+);
+
+my %merged_requirements = (
+    'Mail::SpamAssassin' => 0,
+    'Test::More' => 0.80,
+);
+
+$meta{prereqs}{configure}{requires} = $requirements{configure_requires};
+$meta{prereqs}{build}{requires}     = $requirements{build_requires};
+$meta{prereqs}{runtime}{requires}   = $requirements{runtime_requires};
+$meta{prereqs}{test}{requires}      = $requirements{test_requires};
+
+my %MM_Args = (
+    AUTHOR           => 'John Mertz',
+    NAME             => 'Mail::SpamAssassin::KeywordRuleGenerator',
+    DISTNAME         => 'Mail-SpamAssassin-KeywordRuleGenerator',
+    EXE_FILES        => [],
+    MIN_PERL_VERSION => '5.006',
+    VERSION_FROM     => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
+    ABSTRACT_FROM    => 'lib/Mail/SpamAssassin/KeywordRuleGenerator.pm',
+    test             => {
+        TESTS => 't/*.t',
+    },
+);
+
+sub is_eumm {
+    eval { ExtUtils::MakeMaker->VERSION( $_[0] ) };
+}
+
+is_eumm(6.30) and $MM_Args{LICENSE} = $meta{license}[0];
+is_eumm(6.47_01) or delete $MM_Args{MIN_PERL_VERSION};
+is_eumm(6.52)
+  and $MM_Args{CONFIGURE_REQUIRES} = $requirements{configure_requires};
+
+is_eumm(6.57_02) and !is_eumm(6.57_07) and $MM_Args{NO_MYMETA} = 1;
+
+if ( is_eumm(6.63_03) ) {
+    %MM_Args = (
+        %MM_Args,
+        TEST_REQUIRES  => $requirements{test_requires},
+        BUILD_REQUIRES => $requirements{build_requires},
+        PREREQ_PM      => $requirements{runtime_requires},
+    );
+}
+else {
+    $MM_Args{PREREQ_PM} = \%merged_requirements;
+}
+unless ( -f 'META.yml' ) {
+    $MM_Args{META_ADD} = \%meta;
+}
+WriteMakefile(%MM_Args);
--- a/bin/generate-keyword-rules
+++ b/bin/generate-keyword-rules
@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+# Create a directory called 'output' with generated files for all config files in the $PWD
+
+use lib '../lib/';
+use Mail::SpamAssassin::KeywordRuleGenerator;
+
+use strict;
+use warnings;
+
+my $id = 'MC';
+my $dir = 'output';
+
+# setup
+my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' =>$id, 'debug' => 0, 'joinScores' => 0 , 'dir' => $dir } );
+
+# clean
+unlink(glob("$dir/*"));
+rmdir($dir);
+
+# get files in PWD
+my @files = glob("./*");
+my @clean;
+
+$kw->createDir($dir);
+
+my @failed = $kw->readAll( @files );
+die scalar(@failed)." error(s) - ".join(', ', @failed)."\n" if (scalar(@failed));
+$kw->writeAll();
+
+use Mail::SpamAssassin;
+my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => "./$dir", 'pre_config_text' => "loadplugin Mail::SpamAssassin::Plugin::Check" } );
+my $fail = $sa->lint_rules();
+die "Failed to verify with SpamAssassin: $fail" if ($fail);
--- a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
+++ b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
@ -48,20 +48,20 @@ The sets of keywords can be broken up into groups (see GROUPS).

 Requires C<spamassassin> executable and the following Perl modules

-        To::Be::Determined
+    Mail::SpamAssassin

 =cut

-#use strict;
+use strict;
 use warnings;

 =head2 FILES

-There are built-in functions to ingest formatted list files. See C<readFiles>
+There are built-in functions to ingest formatted list files. See C<readFile>
 method. By default, the output file name and the rules therein will use a
 stripped and capitalized version of those filenames.

-    $kw->readFiles( 'example.txt' );
+    $kw->readAll( 'example.cf', 'example.txt' );
    $kw->writeAll();

 This will creates rules formatted like:
@ -70,20 +70,44 @@ This will creates rules formatted like:

 and will output to the file:

-    70_id_example.cf
+    70_ID_EXAMPLE.cf
+
+Leading './' or $PWD will be stripped, but other paths will be included with '_'
+substituted '/'. For example:
+
+    $kw->readAll ( './example.cf', '/to/pwd/example2.cf', 'dir/example3.cf' );
+
+will produce:
+
+    70_KW_EXAMPLE.cf
+    70_KW_EXAMPLE2.cf
+    70_KW_DIR_EXAMPLE3.cf
+
+If there are any rules in the 'GLOBAL' group, or if in C<singleOutfile> mode,
+a file with just the priority and keyword will be produced with the select rules
+or all rules, respectively. This will look like:
+
+    70_KW.cf

 See the C<generate*> methods for more information on this formatting. Also see
 the C<new> method for discussion of the 'id'.

-Finally, a like file:
+Finally, unless in C<joinScores> mode, each configuration will have a matching
+scores file like:

-    71_id_scores.cf
+    70_KW_EXAMPLE_SCORES.cf
+    70_KW_SCORES.cf

-Will be created with the scores for all of the rules in the prior file(s). The
-C<join_scores> variable is true by default, creating the above file. If made
-false, then will determine a unique score file will be created for each file.
-Alternatively, C<append_scores> can be set to include the scores directly in the
-config file with the rule definitions.
+In C<joinScores> mode, the scores will be directly appended in the same config
+file with the rule definitions.
+
+Note that the global files may have an incremented priority since it requires
+all meta rules from the other files be defined first. For example, the above
+would be left with 70 for the global priority, however if the C<id> were lower
+in the alphabet, the global files would be incremented:
+
+    70_AB_EXAMPLES.cf
+    71_AB.cf

 =head2 RULES

@ -145,7 +169,6 @@ our %EXPORT_TAGS = (
        'getGlobalOutput',
        'getGlobalScoresOutput',
        'clearFile',
-                'getFiles',
        'nextFile',
        'readFile',
        'getFile',
@ -158,7 +181,8 @@ our %EXPORT_TAGS = (
        'generateGlobals',
        'generateAll',
        'writeFile',
-                'writeAll'
+        'writeAll',
+        'verifyOutput'
    ]
 );

@ -326,7 +350,6 @@ sub setPriority
    my $priority = shift;

    if (defined($priority)) {
-                $path = uc($priority);
        $self->{'priority'} = $priority || return "Failed to set priority: $priority";
    } else {
        return "No 'priority' provided\n";
@ -504,7 +527,7 @@ alphabetically after the last file with 'meta' rules. C<$file> can be used to
 bypass that check, but it might lead to rules that do not work.

 Will try to select a filename as close to the existing output files as possible.
-First it will simply duplicate a name if it is in C<singleOutput> mode. Then,
+First it will simply duplicate a name if it is in C<singleOutfile> mode. Then,
 it will try to use the base name without the C<$file> portion. This will
 generally not work because the '_' before the file will come after the dot
 without:
@ -528,11 +551,11 @@ sub setGlobalOutfile
    my $path = shift;

    if (defined($path)) {
-                $self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
+        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    if ($self->{'singleOutfile'}) {
-                $self->{'filename'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
+        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    my $last;
@ -541,7 +564,7 @@ sub setGlobalOutfile
    }
    my $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId().".cf";
    if (defined($last) && $file gt $last) {
-                $self->{'filename'}->{'GLOBAL'} = $file;
+        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    if ($self->getPriority < 99) {
@ -550,7 +573,7 @@ sub setGlobalOutfile
    }
    $file =~ s/_/__/ unless ($file gt $last);
    if ($file gt $last) {
-                $self->{'filename'}->{'GLOBAL'} = $file;
+        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
@ -603,7 +626,7 @@ sub setGlobalScoresOutfile
    if (defined($self->{'filemap'}->{'GLOBAL_SCORES'}) && $self->{'filemap'}->{'GLOBAL_SCORES'} gt $self->{'filemap'}->{'GLOBAL'}) {
        return undef;
    }
-        my $file = $self->{'filemap'}->{'GLOBAL'};
+    my $file = $self->getGlobalOutfile() || die "HUH?";
    $file =~ s/\.cf$/_SCORES.cf/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
@ -611,20 +634,20 @@ sub setGlobalScoresOutfile
    }
    $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId()."_SCORES.cf";
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
-        my $gpriority = $self->{'filename'}->{'GLOBAL'};
+    my $gpriority = $self->{'filemap'}->{'GLOBAL'};
    $gpriority =~ s/^(\d\d).*/$1/;
    $file = $self->getDir().'/'.($gpriority+1)."_".$self->getId()."_SCORES.cf" if ($gpriority < 99);
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
-        $file = $self->{'filename'}->{'GLOBAL'};
+    $file = $self->{'filemap'}->{'GLOBAL'};
    $file =~ s/_/__/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
-                $self->{'filename'}->{'GLOBAL_SCORES'} = $file;
+        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
@ -707,65 +730,6 @@ sub getGlobalScoresOutput
    }
 }

-=head2 C<$kw->getFiles($regex);>
-
-Simple recursive search for files within a directory. Will validate that each
-file is readable and return an array of file names.
-
-Expects a single file or directory path scalar as first argument and an optional
-regex as the secord. If you have multiple entries to fetch, run separately and
-append to your array.
-
-The regex will be used as a file filter and will only return files that match.
-
-=cut
-
-sub getFiles
-{
-        my $self = shift;
-        my $regex = shift;
-
-        my $return = '';
-        foreach (@args) {
-                $return .= "$_ does not exist\n" unless (-e "$_" || -l "$_");
-                if (-l $_) {
-                        $self->getFiles(readlink($_));
-                } elsif (-d $_) {
-                        my @recursive = glob($_."/*");
-                        $self->getFiles(@recursive);
-                } else {
-                        if (defined($regex)) {
-                                if ($_ =~ $regex) {
-                                        push(@{$self->{'files_ref'}}, $_);
-                                } else {
-                                        next;
-                                }
-                        } else {
-                                $return .= "$_ is not readable\n" unless (-r "$_");
-                                push(@{$self->{'files_ref'}}, $_);
-                        }
-                }
-        }
-        return $return;
-}
-
-=head2 C<$kw->nextFile();>
-
-Shift next message in files_ref queue to current.
-
-=cut
-
-sub nextFile
-{
-        my $self = shift;
-        if (scalar(@{$self->{'files_ref'}})) {
-                $self->{'path'} = shift(@{$self->{'files_ref'}});
-        } else {
-                return "End of array\n";
-        }
-        return 0;
-}
-
 =head2 C<$kw->clearFile();>

 Clear the current file reference queue.
@ -780,6 +744,21 @@ sub clearFile
    return 0;
 }

+=head2 C<$kw->clearAll();>
+
+Clear all file reference queues.
+
+=cut
+
+sub clearAll
+{
+    my $self = shift;
+    foreach my $file (keys(%{$self->{'rules'}})) {
+        %{$self->{'rules'}->{'DEFAULT'}} = () if ($file eq 'DEFAULT');
+        $self->clearFile($file);
+    }
+    return 0;
+}
 =head2 C<$kw->readFile(%args);>

 Read in properly formatted keyword list file. The basic format is one keyword
@ -816,6 +795,10 @@ sub readFile
    my $self = shift;
    my $file = shift || return 'No file provided';
    my %args = @_;
+    # Clean to relative path
+    $file =~ s/^\.\///;
+    $file =~ s/^$ENV{'PWD'}\///;
+
    $self->getDir();
    $self->{'file'} = $file;
    $self->setOutfile();
@ -866,15 +849,12 @@ Loops through C<readFile> for all files in the queue.
 sub readAll
 {
    my $self = shift;
-        my @files = ( shift ) || return "No files provided\n";
-        while (scalar(@_)) {
-                push(@files, shift);
-        }
+    my @files = @_;

    my @failed;
    foreach my $file (@files) {
        if (!-e $file) {
-                        push(@failed, $file);
+            push(@failed, "$file does not exist\n");
            print STDERR "$file does not exist\n" if ($self->{'debug'});
        } elsif (-l $file) {
            # Accept only links within PWD
@ -882,7 +862,7 @@ sub readAll
            if ($dest =~ m/^\//) {
                $dest =~ s/$ENV{'PWD'}\/// ;
                if ($dest eq readlink($file)) {
-                                        push(@failed, $file);
+                    push(@failed, "Symlink outside of PWD $file\n");
                    print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
                    next;
                }
@ -891,7 +871,7 @@ sub readAll
        } elsif (-f $file) {
            push(@failed, "Failed to read $file") if $self->readFile($file);
        } elsif (-d $file) {
-                        push(@failed, @_) if $self->readAll(glob($file.'/*'));
+            push(@failed, $self->readAll(glob($file.'/*')) );
        # Bash pattern?
        } else {
            if (my @glob = glob($file)) {
@ -901,8 +881,7 @@ sub readAll
            }
        }
    }
-        return @failed if (scalar(@failed));
-        return undef;
+    return @failed;
 }

 =head2 C<$kw->readLine($line)>
@ -916,7 +895,7 @@ sub readLine
 {
    my $self = shift;
    my $line = shift;
-        my $invalid = shift;
+    my @invalid;
    my ($word, $score, $comment, @groups);
    if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
        while (@sections) {
@ -947,7 +926,7 @@ sub readLine
        $word = lc($word);
        $score //= 0;
        $comment //= '';
-                @groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups));
+        @groups = ( 'LOCAL' ) unless (scalar(@groups));
        return ( $word, $score, $comment, @groups );
    }
    return ();
@ -1276,6 +1255,7 @@ sub writeFile
    my $file = shift || return "Requires filename";
    return "Nothing generated for $file" unless (defined($self->{'output'}->{$file}));

+    return unless (scalar(@{$self->{'output'}->{$file}}));
    if (open(my $fh, ">", $file)) {
        foreach my $line (@{$self->{'output'}->{$file}}) {
            print $fh $line;
@ -1302,6 +1282,25 @@ sub writeAll
    return ("Errors:\n ".join("\n ", @errors)) if (scalar(@errors));
 }

+=head2 C<$kw->verifyOutput()>
+
+Process output directory with SpamAssassin linter. Return errors or
+undef if successful. An optional C<$path> can be provided to test a
+different file or directory.
+
+=cut
+
+sub verifyOutput
+{
+    my $self = shift;
+    my $path = shift || $self->{'dir'};
+
+    use Mail::SpamAssassin;
+    my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $path } );
+
+    return $sa->lint_rules();
+}
+
 =head1 MORE

 For discussion of the module and examples, see:
--- a/t/01_attributes.t
+++ b/t/01_attributes.t
@ -16,7 +16,7 @@ my $defaults = {
 };

 my $args = {
-        'id'            => '01',
+        'id'            => 'TEST',
        'priority'      => 10,
        'debug'         => 1,
        'singleOutfile' => 1,
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -1 +0,0 @@
-/home/jpm/KeywordRuleGenerator/t/02_files.cf
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -0,0 +1 @@
+word
--- a/t/02_files.t
+++ b/t/02_files.t
@ -18,26 +18,26 @@ ok ($kw->getFile($file) == $file, "Get current working file");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules");
 ok ($kw->{'rules'}->{$file}, "Loaded correct hash key");
 ok ($kw->{'filemap'}->{$file} =~ m/50_02_T_02_FILES.cf$/, "Loaded correct output name");
-ok (!$kw->clearFiles(), "Clear 'rules' hash");
-ok (!$kw->{'rules'}, "Fetched none after clearing");
+ok (!$kw->clearAll(), "Clear 'rules' hash");
+ok (!scalar(keys(%{$kw->{'rules'}})), "Fetched none after clearing");

 # Read a directory
 my $dir = 't/02_files.dir';
 ok (!$kw->readAll($dir), "Run file on dir");
 ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir");
-ok (!$kw->clearFiles(), "Clear dir files");
+ok (!$kw->clearAll(), "Clear dir files");

 # Read a symlink
 my $link = 't/02_files.lnk';
 ok (!$kw->readAll($link), "Run file on link");
 ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link");
-ok (!$kw->clearFiles(), "Clear link files");
+ok (!$kw->clearAll(), "Clear link files");

 # Read multiple mixed
 ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed");
-ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all");
-ok (!$kw->clearFiles(), "Clear dir files");
+ok (scalar(keys(%{$kw->{'rules'}})) == 5, "Fetched correct number of files from all");
+ok (!$kw->clearAll(), "Clear dir files");


 ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file");
--- a/t/04_rules.t
+++ b/t/04_rules.t
@ -17,7 +17,6 @@ ok(!scalar(@failed), "Load 'rules' hash with readAll");

 my $expected = getExpected();
 my ( $missing, $extra, $incorrect ) = 0;
-use Data::Dump;
 foreach my $file (keys(%{$expected})) {
        if ($file eq 'GLOBAL') {
                my ($m, $e) = compareValues(
--- a/t/05_write.t
+++ b/t/05_write.t
@ -46,10 +46,8 @@ foreach (@files) {
 }
 ok (!scalar(keys(%remaining)), "All expected output files found");

-use Mail::SpamAssassin;
-my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $testdir } );
-$failed = $sa->lint_rules();
-ok (!$failed, "Verified by spamassassin".($res ? "\n$failed" : ""));
+$failed = $kw->verifyOutput();
+ok (!$failed, "Verified by spamassassin".($failed ? "\n$failed" : ""));

 $kw->cleanDir();
				`@ -1 +0,0 @@`
				`/home/jpm/KeywordRuleGenerator/t/02_files.cf`