Additional tests

Locate and open files Validate config file input formatting Check that the hash representation of rules read from files is correct
2022-11-28 21:05:01 -05:00 · 2022-11-28 21:05:01 -05:00 · cfbcee7c7d
parent 944c3eed5e
commit cfbcee7c7d
10 changed files with 1131 additions and 85 deletions
--- a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
+++ b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
@ -1,9 +1,19 @@
-#!/usr/bin/perl
+# <@LICENSE>
-
+# Licensed to the Apache Software Foundation (ASF) under one or more
-package Mail::SpamAssassin::KeywordRuleGenerator;
+# contributor license agreements.  See the NOTICE file distributed with
-
+# this work for additional information regarding copyright ownership.
-use strict;
+# The ASF licenses this file to you under the Apache License, Version 2.0
-use warnings;
+# (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at:
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # </@LICENSE>
 =head1 NAME
@ -11,11 +21,7 @@ Mail::SpamAssassin::KeywordRuleGenerator - Generate SA rules for keywords
 =cut
-our $VERSION = '0.01';
+package Mail::SpamAssassin::KeywordRuleGenerator;
 =head1 VERSION
 Version 0.01
 =head1 SYNOPSIS
@ -23,6 +29,31 @@ Generate SpamAssassin compatible configuration files given lists of keywords.
 Implemented as a module largely for testing purposes.
    use Mail::SpamAssassin::KeywordRuleGenerator;
    my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new($id);
    $kw->readFile('keywords.cf');
    $kw->writeFiles();
 =head1 DESCRIPTION
 Mail::SpamAssassin::KeywordRuleGenerator does what it says on the tin; it
 generates SpamAssassin compatible configuration files to catch keywords that you
 specify. Most simply, it can take in one or more properly formatted input files
 (see FILES), generate rules for individual files (optional), as well as meta
 rules for the counts for each set of keywords (ie. 1 of N hit; 2 of N hit...).
 See RULES for more on how the rules are generated.
 The sets of keywords can be broken up into groups (see GROUPS).
 =head1 PREREQUISITES
 Requires C<spamassassin> executable and the following Perl modules
        To::Be::Determined
 =cut
 #use strict;
 use warnings;
 =head2 FILES
@ -77,6 +108,14 @@ describe        ID_FILE_GROUP_1         1 match in keyword group 'GROUP'
 meta            ID_FILE_GROUP_2         ( __ID_FILE_WORD + __ID_FILE_OTHER ) > 1
 describe        ID_FILE_GROUP_2         2 matches in keyword group 'GROUP'
 =head1 VERSION
 Version 0.01
 =cut
 our $VERSION = '0.01';
 =head1 EXPORT
 =cut
@ -89,86 +128,284 @@ our @EXPORT = qw( new );
 our %EXPORT_TAGS = (
        'all' => [
                'new',
-                ''
+                'getId',
                'setId',
                'getPriority',
                'setPriority',
                'getOutfile',
                'setOutfile',
                'getScorefile',
                'setScorefile',
                'clearFiles',
                'getFiles',
                'nextFile',
                'readFile',
                'getFile',
                'setFile',
                'joinRules',
                'processMetas',
                'processWords',
                'processGroups',
                'processAll',
                'writeFile',
                'writeAll'
        ]
 );
 our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );
-=head1 SUBROUTINES/METHODS
+=head1 METHODS
-=head2 C<$kw = new($id)>
+Except for getter methods, all methods make changes directly within the context
 object and only return errors. Thus, each call can be verified by checking that
 it returned undef:
    die "Failed to run 'method'\n" if($kw->method($arg));
 or:
    my $err = $kw->method($arg);
    die "Received $err\n" if ($err);
 Getter methods will return the attribute or undef if it does not exist.
 =head2 C<$kw = new( %args )>
 Creates a new C<Mail::SpamAssassin::KeywordRuleGenerator> object and returns it.
-C<$id> is an optional identifier which will be used to prefix any rule output.
+C<$args> is an optional hash to assign any preliminary attributes or flags to
-It can be updated at any time via the C<$kw->{id}> value or with the
+the object. See various 'get' and 'set' functions.
 C<$kw->id($id)> setter;
 =cut
 sub new
 {
-        my ($class, $id) = @_;
+        my ($class, $args) = @_;
        my $self = $args;
-        $id = uc($id) || 'KW';
+=head3 Initial attributes
        bless {
                id => $id || '',
                keywords => {}
        } => $class;
 }
-=head2 C<$kw->id()>
+While other attributes will exist within the object during processing, those
 which make sense to define up-front and which will not be overwritten by any of
 the built-in functions (other than their associated getters) are:
-Get/Setter for C<$kw->{'id'}>
+=head3 id
    id scalar
        A short identifier which will be appending to all output files to allow
        for easier recognition of the files' sources. Default: 'KW'
 =cut
-sub id
+        $self->{'id'}                   //= 'KW';
 =head3 priority
    priority scalar
        Typically a number used to define the priority of the rules.
        SpamAssassin will read configuration files in alphabetical order and the
        last iteration of a configuration for the same rule will be used. This
        means that the files read last will overwrite those read earlier. By
        convention the number is used for easy sorting. Any leading aphabetical
        character will be ordered after all numbers, and so given high priority.
        Default: '50'
 =cut
        $self->{'priority'}             //= 50;
 =head3 debug
    debug boolean
        Enable (1) or disable (0) debugging output. Default: 0
 =cut
        $self->{'debug'}                //= 0;
 =head3 singleOutfile
    singleOutfile boolean
        Indicates whether output rules should all be added to a single file (1),
        or to one file per input file (0). Note that this still requires
        'joinScores' to have one file total. Default: 0
 =cut
        $self->{'singleOutfile'}        //= 0;
 =head3 joinScores
    joinScores bootlean
        Indicates whether to include the scores in the same file as their
        associated rule definitions (1) or in a second file on their own (0).
        The second file will simply append '_SCORES' to the file name (prior to
        the '.cf'), unless overridden by C<$kw->setScoreFile($path)> or a second
        path argument in C<$kw->setOutfile($rulePath, $scorePath)>. Default: 1
 =cut
        $self->{'joinScores'}           //= 1;
        bless $self, $class;
        return $self;
 }
 =head2 C<$kw->getId()>
 Getter for C<$kw->{'id'}>. 'id' is used for top-level rule names.
 =cut
 sub getId
 {
        my $self = shift;
        if (defined($self->{'id'})) {
                return $self->{'id'};
        }
 }
 =head2 C<$kw->setId()>
 Setter for C<$kw->{'id'}>
 =cut
 sub setId
 {
        my $self = shift;
        my $id = shift;
        if (defined($id)) {
                $id = uc($id);
-                $self->{'id'} = $id;
+                $self->{'id'} = $id || return "Failed to set $id";
        } else {
-                if (defined($self->{'id'})) {
+                return "No ID provided\n";
                        return $self->{'id'};
                } else {
                        return undef;
                }
        }
 }
-=head2 C<$kw->file()>
+=head2 C<$kw->getPriority()>.
-Get/Setter for C<$kw->{'file'}>
+Getter for C<$kw->{'priority'}>. 'priority' value used for the output filepath
 to indicate the load order. SpamAssassin reads in ascending order with later
 iterations overriding earlier. The 'score' file will increment so that it comes
 at the end.
 =cut
-sub file
+sub getPriority
 {
        my $self = shift;
        if (defined($self->{'priority'})) {
                return $self->{'priority'};
        }
 }
 =head2 C<$kw->setPriority()>
 Setter for C<$kw->{'priority'}>.
 =cut
 sub setPriority
 {
        my $self = shift;
        my $priority = shift;
        if (defined($priority)) {
                $path = uc($priority);
                $self->{'priority'} = $priority || return "Failed to set priority: $priority";
        } else {
                return "No 'priority' provided\n";
        }
 }
 =head2 C<$kw->getFile()>
 Getter for C<$kw->{'file'}>. This is the name of the current output file being
 processed. Not to be confused with C<readFile> or C<getOutfile>.
 =cut
 sub getFile
 {
        my $self = shift;
        if (defined($self->{'file'})) {
                return $self->{'file'};
        }
 }
 =head2 C<$kw->setFile()>
 Setter for C<$kw->{'file'}>.
 =cut
 sub setFile
 {
        my $self = shift;
        my $file = shift;
        if (defined($file)) {
-                my $name = $file;
+                $self->{'file'} = $file;
                $name =~ s/^([a-zA-Z0-9\-_]*)(\..*)?$/$1/;
                $name =~ s/-/_/;
                $name = uc($name);
                die("invalid filename\n") unless ($name =~ m/^[A-Z_]+$/);
                $self->{'file'} = $name;
        } else {
-                if (defined($self->{'file'})) {
+                return "No File provided\n";
-                        return $self->{'file'};
+        }
 }
 =head2 C<$kw->getOutfile()>
 Getter for C<$kw->{'outfile'}>. 'outfile' represents the real filepath of the
 output file which is currently being processed.
 =cut
 sub getOutfile
 {
        my $self = shift;
        if (defined($self->{'outfile'})) {
                return $self->{'outfile'};
        }
 }
 =head2 C<$kw->setOutfile()>
 Setter for C<$kw->{'outfile'}>. Can be defined manually with a scalar argument,
 otherwise the path is constructed from the existing attributes.
 =cut
 sub setOutfile
 {
        my $self = shift;
        my $path = shift;
        if (defined($path)) {
                $self->{'outfile'} = $path || return "Failed to set $path";
        } else {
-                        return undef;
+                if ($self->{'singleOutfile'}) {
                        $self->{'outfile'} = $self->{'priority'} .
                                '_' . uc($self->{'id'}) .
                                '.cf';
                } else {
                        my $file = $self->{'file'};
                        $file =~ s/\//_/g; # Change dir slashes to _
                        $file =~ s/(\.[^\.]*)$//g; # Remove extensions
                        $file = uc($file); # Convert to uppercase for rule names
                        $self->{'outfile'} = $self->{'priority'} .
                                '_' . uc($self->{'id'}) .
                                '_' . $file . 
                                '.cf';
                }
        }
 }
-=head2 C<$kw->getFile();>
+=head2 C<$kw->getFiles($regex);>
 Simple recursive search for files within a directory. Will validate that each
 file is readable and return an array of file names.
@ -184,24 +421,63 @@ The regex will be used as a file filter and will only return files that match.
 sub getFiles
 {
        my $self = shift;
-        my @args = @_;
+        my $regex = shift;
        my $return = '';
        foreach (@args) {
-                die "$_ does not exist\n" unless (-e "$_" || -l "$_");
+                $return .= "$_ does not exist\n" unless (-e "$_" || -l "$_");
                if (-l $_) {
-                        getFiles(readlink($_));
+                        $self->getFiles(readlink($_));
                } elsif (-d $_) {
                        my @recursive = glob($_."/*");
-                        getFiles(@recursive);
+                        $self->getFiles(@recursive);
                } else {
-                        die "$_ is not readable\n" unless (-r "$_");
+                        if (defined($regex)) {
-                        push(@{$self->{files_ref}}, $_);
+                                if ($_ =~ $regex) {
                                        push(@{$self->{'files_ref'}}, $_);
                                } else {
                                        next;
                                }
                        } else {
                                $return .= "$_ is not readable\n" unless (-r "$_");
                                push(@{$self->{'files_ref'}}, $_);
                        }
                }
-        return @$self->{files_ref};
+        }
        return $return;
 }
-=head2 C<$kw->readFile();>
+=head2 C<$kw->nextFile();>
 Shift next message in files_ref queue to current.
 =cut
 sub nextFile
 {
        my $self = shift;
        if (scalar(@{$self->{'files_ref'}})) {
                $self->{'path'} = shift(@{$self->{'files_ref'}});
        } else {
                return "End of array\n";
        }
        return 0;
 }
 =head2 C<$kw->clearFiles();>
 Clear the current file reference queue.
 =cut
 sub clearFiles
 {
        my $self = shift;
        delete($self->{'rules'}) || return "Failed to delete rules hash\n";
        return 0;
 }
 =head2 C<$kw->readFile(%args);>
 Read in properly formatted keyword list file. The basic format is one keyword
 per line, an optional score, and an optional list of 'groups'.
@ -218,45 +494,314 @@ When keyword groups are omitted, that keyword defaults to just the 'LOCAL' group
 Examples with other combinations are:
-# word 0                        Same as previous
+    word 0                      # Same as previous
-# word 2                        Used for LOCAL, not GLOBAL, scores 2 on it's own
+    word 2                      # Used for LOCAL, not GLOBAL, scores 2
-# word GROUP                    GROUP group, not LOCAL, not GLOBAL, no score
+    word GROUP                  # GROUP group, not LOCAL, not GLOBAL, no score
-# word 0 GROUP                  Same as previous
+    word 0 GROUP                # Same as previous
-# word 1 GROUP GLOBAL LOCAL     GROUP group, in GLOBAL, in LOCAL, scores 1
+    word 1 GROUP GLOBAL LOCAL   # GROUP group, in GLOBAL, in LOCAL, scores 1
 C<%args> can be used to override the attributes used to index the rules,
 including the id, file
 This function intentionally does not iterate through the files queue so that you
 can step through and make modifications as you go.
 =cut
-sub readFiles
+sub readFile
 {
-        my $key_ref = shift;
+        my $self = shift;
-        my @files = @_;
+        my $file = shift || return 'No file provided';
        my %args = @_;
        $self->{'file'} = $file;
        $self->setOutfile();
        my $n = 0;
        foreach my $file (@files) {
                my $n = 0; # Track line number for errors
                my $name = $file;
                $name =~ s/\//_/g; # Change dir slashes to _
                $name =~ s/(\.[^\.]*)*$//g; # Remove extensions
                $name = uc($name); # Convert to uppercase for rule names
        if (open(my $fh, '<', $file)) {
                my $rules = 0;
                while (<$fh>) {
                        $n++;
                        # Ignore blank lines and comments
                        if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) {
                                next;
                        # Verify formatting
-                                } elsif ($_ =~ m/^([^\s]+)\s+([0-9]+)(?:\s+(.*))?/) {
+                        } elsif (my ($word, $score, $comment, @groups) = $self->readLine($_)) {
-                                        my ($word, $score, $groups) = ($1, $2, $3);
+                                if ($self->{'debug'}) {
-                                        print "'$word' '$score' " . (join(',',$groups)) . "\n";
+                                        print "FOUND: '$word' '$score' " . (join(',',@groups)) . "\n";
                                }
                                foreach my $group (@groups) {
                                        if ($group eq 'GLOBAL') {
                                                push(@{$self->{'rules'}->{'GLOBAL'}}, $word);
                                        } else {
-                                        die "Invalid input in $file, line $n: $_\n";
+                                                push(@{$self->{'rules'}->{$self->{'outfile'}}->{$group}}, $word);
                                        }
                                }
                                $self->{'rules'}->{$self->{'outfile'}}->{'SCORED'}->{$word} = $score if ($score);
                                $self->{'rules'}->{$self->{'outfile'}}->{'COMMENTS'}->{$word} = $comment if ($comment);
                        } elsif ($self->{debug}) {
                                print STDERR "ERROR: Invalid input in $file, line $n: $_\n";
                        }
                }
-        #print "$_\n" foreach(@$files_ref);
+                print STDERR "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}} || !$self->{'debug'});
                return "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}});
        } else {
                delete($self->{'file'});
                return "Failed to read $file";
        }
        return undef;
 }
-=head 2 C<$kw->writeFiles($out_dir)>
+=head2 C<$kw->readAll();>
 Loops through C<readFile> for all files in the queue.
 =cut
 sub readAll
 {
        my $self = shift;
        my @files = ( shift ) || return "No files provided\n";
        while (scalar(@_)) {
                push(@files, shift);
        }
        my @failed;
        foreach my $file (@files) {
                if (!-e $file) {
                        push(@failed, $file);
                        print STDERR "$file does not exist\n" if ($self->{'debug'});
                } elsif (-l $file) {
                        # Accept only links within PWD
                        my $dest = readlink($file);
                        if ($dest =~ m/^\//) {
                                $dest =~ s/$ENV{'PWD'}\/// ;
                                if ($dest eq readlink($file)) {
                                        push(@failed, $file);
                                        print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
                                        next;
                                }
                        }
                        $self->readAll($dest);
                } elsif (-f $file) {
                        push(@failed, "Failed to read $file") if $self->readFile($file);
                } elsif (-d $file) {
                        push(@failed, @_) if $self->readAll(glob($file.'/*'));
                # Bash pattern?
                } else {
                        if (my @glob = glob($file)) {
                                push(@failed, ( @_ )) if $self->readAll(@glob);
                        } else {
                                print STDERR "Bad file $file\n" if ($self->{'debug'});
                        }
                }
        }
        return @failed if (scalar(@failed));
        return undef;
 }
 =head2 C<$kw->readLine($line)>
 Reads a line from a configuration file and returns the relevant values or undef
 if the line is not properly formatted.
 =cut
 sub readLine
 {
        my $self = shift;
        my $line = shift;
        my $invalid = shift;
        my ($word, $score, $comment, @groups);
        if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
                while (@sections) {
                        next unless my $section = shift(@sections);
                        if (defined($comment)) {
                                $comment .= ' ' if ($comment ne '');
                                $comment .= $section;
                        } elsif (!defined($word) && $section =~ m/^([^\d\s#]\S+)$/) {
                                $word = $section;
                        } elsif (defined($word) && $section =~ m/^(\d+(?:\.\d+)?)$/ && !defined($score)) {
                                $score = $section;
                        } elsif (defined($word) && $section =~ m/^([^\d\s#]+)$/) {
                                push(@groups, $section);
                        } elsif (defined($word) && $section =~ m/^#.*$/ && !defined($comment)) {
                                $comment = $section;
                                $comment =~ s/^#\s*//;
                        } else {
                                push(@invalid, $section);
                        }
                }
                if (scalar(@invalid)) {
                        if ($self->{'debug'}) {
                                print("Invalid clauses: '".join("', ", @invalid)."' in '$line'\n");
                        }
                        return undef;
                }
                return undef unless ($word);
                $word = lc($word);
                $score //= 0;
                $comment //= '';
                @groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups));
                return ( $word, $score, $comment, @groups );
        }
        return ();
 }
 =head2 C<$kw->joinRules(%rules)>
 Merge rules hash into working output hash. Generally called from C<readFile>,
 but if you want to add rules manually, it will require the following format:
 {
        'word' => {
                'score' => 0,
                'groups' => ( 'LOCAL' )
        },
        'other' => {
                'score' => 1,
                'groups' => ( 'GLOBAL', 'group' )
        }
 }
 The main context object will provide the working 'ID', 'FILE' and other
 attributes necessary to nest these rules in the existing hash.
 =cut
 sub joinRules
 {
        my $self = shift;
        my %rules = @_;
        if ($self->{'unified'}) {
                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'.cf';
        } else {
                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'_'.$self->{'file'}.'.cf';
        }
        foreach my $word (keys(%rules)) {
                my $score = $rules{$word}{'score'} || 0;
                my @groups = $rules{$word}{'groups'} || ( 'GLOBAL' );
                if ($score) {
                        $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}->{$word} = $score;
                }
                foreach my $group (@groups) {
                        if ($group eq 'GLOBAL') {
                                if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}))) {
                                        push(@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}, $word);
                                } else {
                                        $self->{'rules'}->{$self->{'id'}}->{'GLOBAL'} = ( $word );
                                }
                                next;
                        }
                        if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}))) {
                                push(@{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}, $word);
                        } else {
                                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group} = ( $word );
                        }
                }
        }
        return 0;
 }
 =head2 C<$kw->processMetas($outfile, $file);>
 Create all of the component meta rules for the declared C<$file>. Those that
 will be used for the standalone and count rules. Output to C<$outfile>. This
 must be run before the other process methods and must be run for 'GLOBAL' first,
 otherwise output will be invalid. Meta rules for file-specific words will not be
 generated if they are also in the 'GLOBAL' group, instead the meta rules from
 the 'GLOBAL' file will be used for the count rules in all other files. This
 will prevent duplicates, but also requires that you not rename output files such
 that the they appear before the 'GLOBAL' file (without '_C<$file>' at the end).
 =cut
 sub processMetas
 {
        my $self = shift;
        my $outfile = shift;
        my $file = shift;
        my $rules = shift;
        my $prefix = $self->{'id'};
        my @words;
        if ($file eq 'GLOBAL') {
                @words = @{$rules->{'GLOBAL'}};
        } else {
                $prefix .= "_".$file;
                foreach (keys(%{$rules->{$file}->{'groups'}})) {
                        next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}});
                }
                foreach (keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}})) {
                        next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}});
                        next if (grep {/^$_$/} @words);
                        push (@words, $_);
                }
        }
        foreach my $word (@words) {
                $self->{'output'}->{$outfile} .=
                        "body\t__".$prefix."_".uc($word)."_BODY\t/\\b".$word."\\b/\n" .
                        "header\t__".$prefix."_".uc($word)."_SUBJ\tSubject =~ /\\b".$word."\\b/\n" .
                        "meta\t__".$prefix."_".uc($word)."\t( __".$prefix."_".uc($word)."_BODY || __".$prefix."_".uc($word)."_SUBJ )\n\n";
        }
 }
 =head2 C<$kw->processWords($outfile,%args);>
 Take a list of words with scores and add them to C<$outfile>.
 =cut
 sub processWords
 {
        my $self = shift;
        my $out = shift;
 }
 =head2 C<$kw->processGroup(%args);>
 Take a single group, including 'GLOBAL' and add it to C<$outfile>.
 =cut
 sub processGroup
 {
        my $self = shift;
        my $outfile = shift;
        for (my $i = 0; $i < scalar(@all); $i++) {
                $files->{$self->{'priority'}.'_'.$self->{'id'}.'cf'} .=
                        "meta\t".$self->{'id'}."_".$i."\t( ".join(' + ',@all)." ) > $i\n" .
                        "describe\tMatched ".($i+1)."of keywords: ".join(', ',@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}})."\n\n";
        }
 }
 =head2 C<$kw->processAll(%args);>
 Process all groups.
 =cut
 sub processAll
 {
        my $self = shift;
        my %args = @_;
        $self->{'output'} = {};
        foreach my $id (keys(%{$self->{'rules'}})) {
                $self->processMetas($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
                $self->processGroups($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
                $self->processWords($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
                foreach my $file (keys(%{$self->{'rules'}->{$id}})) {
                        $self->processMetas($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
                        $self->processGroups($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
                        $self->processWords($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
                }
        }
 }
 =head2 C<$kw->writeFiles($out_dir)>
 Output files will use the name of each input file, stripping any extension, and
 forcing the name to uppercase. Rules in each file will be called:
@ -282,13 +827,60 @@ body for that word.
 sub writeFiles
 {
        my $self = shift;
-        my $dir = shift;
+        foreach my $out (keys(%{$self->{'rules'}})) {
                unless ($self->{'joinScores'}) {
                        $file =~ s/(.*)\.cf/$1_SCORES.cf/;
                        print STDERR $self->{'scores'};
                }
                if (open(my $fh, '>', $out)) {
                        print $fh $self->{'output'}->{$out};
                        close($fh);
                } else {
                        print STDERR "Failed to write $out\n";
                }
        }
        use Data::Dump;
        print Data::Dump::dump($self->{keywords});
        return 0;
 }
 =head1 MORE
 For discussion of the module and examples, see:
 E<lt>https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator<gt>
 =head1 SEE ALSO
 Mail::SpamAssassin
 spamassassin
 =head1 BUGS
 Report issues to:
 E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issuesE<gt>
 =head1 AUTHOR
 John Mertz <git@john.me.tz>
 =head1 COPYRIGHT
 Mail::SpamAssassin::KeywordRuleGenerator is distributed under the Apache License
 Version 2.0, as described in this file and the file C<LICENSE> included with the
 distribution.
 =head1 AVAILABILITY
 If possible, the latest version of this library will be made available from CPAN
 as well as:
 E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGeneratorE<gt>
 =cut
 1;
 =pod
 die "Please provide rules file(s) as an argument\n" unless (defined($ARGV[0]));
--- a/t/02_files.cf
+++ b/t/02_files.cf
@ -0,0 +1 @@
 word
--- a/t/02_files.dir/02_one.cf
+++ b/t/02_files.dir/02_one.cf
@ -0,0 +1,3 @@
 oneone 2 LOCAL
 onetwo 0 GLOBAL
 onethree three
--- a/t/02_files.dir/02_two.cf
+++ b/t/02_files.dir/02_two.cf
@ -0,0 +1,3 @@
 twoone 2 LOCAL
 twotwo 0 GLOBAL
 twothree three
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -0,0 +1 @@
 /home/jpm/KeywordRuleGenerator/t/02_files.cf
--- a/t/02_files.t
+++ b/t/02_files.t
@ -0,0 +1,47 @@
 #!/usr/bin/perl
 # Basic test to ensure files can be found and read. Verification will come later
 use Test::More;
 use lib 'lib/';
 use Mail::SpamAssassin::KeywordRuleGenerator;
 my $id = '02';
 my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id } );
 # Read a single file
 my $file = 't/02_files.cf';
 ok (!$kw->readFile($file), "Run readFile on $file");
 ok ($kw->getFile($file) == $file, "Get current working file");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules");
 ok ($kw->{'rules'}->{'50_02_T_02_FILES.cf'}, "Loaded correct output name");
 ok (!$kw->clearFiles(), "Clear 'rules' hash");
 ok (!$kw->{'rules'}, "Fetched none after clearing");
 # Read a directory
 my $dir = 't/02_files.dir';
 ok (!$kw->readAll($dir), "Run file on dir");
 ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir");
 ok (!$kw->clearFiles(), "Clear dir files");
 # Read a symlink
 my $link = 't/02_files.lnk';
 ok (!$kw->readAll($link), "Run file on link");
 ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)");
 ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link");
 ok (!$kw->clearFiles(), "Clear link files");
 # Read multiple mixed
 ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed");
 ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all");
 ok (!$kw->clearFiles(), "Clear dir files");
 ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file");
 ok (!$kw->getFile(), "Current name undefined for non-existent file");
 ok ($kw->readFile('t/02_unreadable.cf'), "Correctly failed for non-readable file");
 ok (!$kw->getFile(), "Current name undefined for non-readable file");
 done_testing();
--- a/t/03_verify.t
+++ b/t/03_verify.t
@ -0,0 +1,64 @@
 #!/usr/bin/perl
 # Verify file input line formatting
 use Test::More;
 use lib 'lib/';
 use Mail::SpamAssassin::KeywordRuleGenerator;
 my $id = '03';
 my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } );
 my @good = (
        'Lorem 1 GLOBAL # comment',
        'ipsum 0 GLOBAL # comment',
        'dolor GLOBAL # comment',
        'sit 1 # comment',
        'amet 0 # comment',
        'consectetur # comment',
        'adipiscing 1 GLOBAL',
        'elit 0 GLOBAL',
        'sed GLOBAL',
        'do 1',
        'eiusmod 0',
        'tempor',
 );
 foreach my $score ( '', 0, 1 ) {
        foreach my $group ( '', 'GLOBAL', 'GLOBAL LOCAL' ) {
                foreach my $comment ( '', 'TESTING', 'LONGER COMMENT' ) {
                        my $word = "word";
                        my $rule = $word;
                        if ($score ne '') {
                                $rule .= ' '.$score;
                        }
                        if ($group ne '') {
                                $rule .= ' '.$group;
                        }
                        if ($comment ne '') {
                                $rule .= ' # '.$comment;
                        }
                        my ($rword, $rscore, $rcomment, @rgroups) = $kw->readLine($rule);
                        ok($rword eq $word, "Word '$word' found for '$rule'");
                        ok($rscore eq $score, "Score '$score' found for '$rule'") if ($score ne '');
                        ok(join(' ',@rgroups) eq $group, "Groups '$group' found for '$rule'") if ($group ne '');
                        ok(join(' ',@rgroups) eq 'LOCAL GLOBAL', "Inferred groups 'LOCAL GLOBAL' found for '$rule'") if ($group eq '');
                        ok($rcomment eq $comment, "Comment '$comment' found for '$rule'") if ($comment ne '');
                }
        }
 }
 # Check for bad formatting
 my %bad = (
        '' => 'Ignore empty line',
        '# comment' => "Ignore line starting with comment",
        'word 1 2' => "Ignore line with multiple scores",
        '2 bad' => "Ignore line with score first",
 );
 foreach my $input (keys(%bad)) {
        ok(!$kw->readLine($input), "$bad{$input} ($input)");
 }
 done_testing();
--- a/t/04_rules.t
+++ b/t/04_rules.t
@ -0,0 +1,283 @@
 #!/usr/bin/perl
 # Basic test to ensure files can be found and read. Verification will come later
 use Test::More;
 use lib 'lib/';
 use Mail::SpamAssassin::KeywordRuleGenerator;
 my $id = '04';
 my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } );
 my @files = ( 't/04_rules0.cf', 't/04_rules1.cf' );
 my @failed = @{$kw->readAll( @files )};
 ok(!scalar(@failed), "Load 'rules' hash with readAll");
 my $expected = getExpected();
 my ( $missing, $extra, $incorrect ) = 0;
 use Data::Dump;
 foreach my $file (keys(%{$expected})) {
        if ($file eq 'GLOBAL') {
                my ($m, $e) = compareGroups(
                        $expected->{$file},
                        $kw->{'rules'}->{$file}
                );
                print("Missing $m words\n") if ($m);
                print("$e extra words\n") if ($e);
                $missing += $m;
                $extra += $e;
        } else {
                foreach my $group (keys(%{$expected->{$file}})) {
                        if ($group eq 'SCORED') {
                                my ($m, $e, $i) = compareScores(
                                        $expected->{$file}->{$group},
                                        $kw->{'rules'}->{$file}->{$group}
                                );
                                $missing += $m;
                                $extra += $e;
                                $incorrect += $i;
                        } elsif ($group eq 'COMMENTS') {
                                my ($m, $e, $i) = compareScores(
                                        $expected->{$file}->{$group},
                                        $kw->{'rules'}->{$file}->{$group}
                                );
                                $missing += $m;
                                $extra += $e;
                                $incorrect += $i;
                        } else {
                                my ($m, $e) = compareGroups(
                                        $expected->{$file}->{$group},
                                        $kw->{'rules'}->{$file}->{$group}
                                );
                                $missing += $m;
                                $extra += $e;
                        }
                }
        }
 }
 ok ($missing == 0, "No expected rules are missing");
 ok ($extra == 0, "No extra rules are found");
 ok ($incorrect == 0, "No incorrect scores are found");
 done_testing();
 sub compareGroups
 {
        my $expect = shift;
        my $loaded = shift;
        my @e = sort(@{$expect}); 
        my @l = sort(@{$loaded}); 
        my ($missing, $extra) = (0, 0);
        while (scalar(@e)) {
                unless (scalar(@l)) {
                        print("extra words @l\n");
                        $extra += scalar(@e);
                        last;
                }
                if ($e[0] eq $l[0]) {
                        shift(@e);
                        shift(@l);
                        next();
                }
                if ($e[0] lt $l[0]) {
                        print("Extra word $e[0]\n");
                        $extra++;
                        shift(@e);
                        next;
                }
                if ($e[0] gt $l[0]) {
                        print("Missing word $l[0]\n");
                        $missing++;
                        shift(@l);
                        next;
                }
        }
        if (scalar(@e)) {
                print("Missing ".scalar(@e)." at the end of parsing\n");
                $missing += scalar(@e);
                last;
        }
        return ($missing, $extra);
 }
 sub compareScores
 {
        my $expect = shift;
        my $loaded = shift;
        my %remaining = %{$loaded};
        my ($missing, $extra, $incorrect) = (0, 0, 0);
        foreach my $word (keys(%$expect)) {
                if (!defined($loaded->{$word})) {
                        print("Missing score assignment for $word\n");
                        $missing++;
                } elsif ($expect->{$word} != $loaded->{$word}) {
                        print("Incorrect score assignment for $word\n");
                        $incorrect++;
                } else {
                        delete($remaining{$word});
                }
        }
        $extra = scalar(keys(%remaining));
        print("Extra score assignment for $word\n") foreach (@{$extra});
        return ($missing, $extra, $incorrect);
 }
 sub getExpected
 {
        my %expected = (
                'GLOBAL' => [
                        'lorem',
                        'ipsum',
                        'dolor',
                        'sit',
                        'amet',
                        'consectetur',
                        'adipiscing',
                        'elit',
                        'sed',
                        'do',
                        'eiusmod',
                        'tempor',
                        'minim',
                        'veniam',
                        'quis',
                        'nostrud',
                        'exercitation',
                        'ullamco',
                        'laboris',
                        'nisi',
                        'ut',
                        'aliquip',
                        'ex',
                        'ea',
                        'commodo',
                        'consequat',
                        'duis',
                        'dolore',
                        'eu',
                        'fugiat',
                ],
                '50_04_T_04_RULES0.cf' => {
                        'SCORED' => {
                                'lorem' => 1,
                                'sit' => 1,
                                'adipiscing' => 1,
                                'do' => 1,
                                'incididunt' => 1,
                                'et' => 1,
                                'aliqua' => 1,
                                'minim' => 1,
                        },
                        'COMMENTS' => {
                                'lorem' => '1, 1, 1, 1',
                                'ipsum' => '1, 1, 1, 0',
                                'dolor' => '1, 1, 1, undef',
                                'sit' => '1, 1, 0, 1',
                                'amet' => '1, 1, 0, 0',
                                'consectetur' => '1, 1, 0, undef',
                                'adipiscing' => '1, 0, 1, 1',
                                'elit' => '1, 0, 1, 0',
                                'sed' => '1, 0, 1, undef',
                                'do' => '1, 0, 0, 1',
                                'eiusmod' => '1, 0, 0, 0',
                                'tempor' => '1, 0, 0, undef',
                                'incididunt' => '0, 1, 1, 1',
                                'ut' => '0, 1, 1, 0',
                                'labore' => '0, 1, 1, undef',
                                'et' => '0, 1, 0, 1',
                                'dolore' => '0, 1, 0, 0',
                                'magna' => '0, 1, 0, undef',
                                'aliqua' => '0, 0, 1, 1',
                                'enim' => '0, 0, 1, 0',
                                'ad' => '0, 0, 1, undef',
                                'minim' => '0, 0, 0, 1',
                                'veniam' => '0, 0, 0, 0',
                                'quis' => '0, 0, 0, undef',
                        },
                        'LOCAL' => [
                                'lorem',
                                'ipsum',
                                'dolor',
                                'sit',
                                'amet',
                                'consectetur',
                                'incididunt',
                                'ut',
                                'labore',
                                'et',
                                'dolore',
                                'magna',
                                'minim',
                                'veniam',
                                'quis',
                        ],
                        'group' => [
                                'lorem',
                                'ipsum',
                                'dolor',
                                'adipiscing',
                                'elit',
                                'sed',
                                'incididunt',
                                'ut',
                                'labore',
                                'aliqua',
                                'enim',
                                'ad',
                        ]
                },
                '50_04_T_04_RULES1.cf' => {
                        'SCORED' => {
                                'nostrud' => 1,
                                'laboris' => 1,
                                'aliquip' => 1,
                                'consequat' => 1,
                                'aute' => 1,
                                'in' => 1,
                                'velit' => 1,
                                'dolore' => 1,
                        },
                        'LOCAL' => [
                                'nostrud',
                                'exercitation',
                                'ullamco',
                                'laboris',
                                'nisi',
                                'ut',
                                'aute',
                                'irure',
                                'dolor',
                                'in',
                                'reprehenderit',
                                'volupatate',
                                'dolore',
                                'eu',
                                'fugiat',
                        ],
                        'group' => [
                                'nostrud',
                                'exercitation',
                                'ullamco',
                                'aliquip',
                                'ex',
                                'ea',
                                'aute',
                                'irure',
                                'dolor',
                                'velit',
                                'esse',
                                'cillum',
                        ]
                }
        );
        return \%expected;
 }
--- a/t/04_rules0.cf
+++ b/t/04_rules0.cf
@ -0,0 +1,28 @@
 # Comments to be ignored. As with following blank line.
 # Must test every combination of:
 # GLOBAL, LOCAL, GROUP, SCORE
 Lorem 1 GLOBAL LOCAL group # 1, 1, 1, 1
 ipsum 0 GLOBAL LOCAL group # 1, 1, 1, 0
 dolor GLOBAL LOCAL group # 1, 1, 1, undef
 sit 1 GLOBAL LOCAL # 1, 1, 0, 1
 amet 0 GLOBAL LOCAL # 1, 1, 0, 0
 consectetur GLOBAL LOCAL # 1, 1, 0, undef
 adipiscing 1 GLOBAL group # 1, 0, 1, 1
 elit 0 GLOBAL group # 1, 0, 1, 0
 sed GLOBAL group # 1, 0, 1, undef
 do 1 GLOBAL # 1, 0, 0, 1
 eiusmod 0 GLOBAL # 1, 0, 0, 0
 tempor GLOBAL # 1, 0, 0, undef
 incididunt 1 LOCAL group # 0, 1, 1, 1
 ut 0 LOCAL group # 0, 1, 1, 0
 labore LOCAL group # 0, 1, 1, undef
 et 1 LOCAL # 0, 1, 0, 1
 dolore 0 LOCAL # 0, 1, 0, 0
 magna LOCAL # 0, 1, 0, undef
 aliqua 1 group # 0, 0, 1, 1
 enim 0 group # 0, 0, 1, 0
 ad group # 0, 0, 1, undef
 minim 1 # 0, 0, 0, 1
 veniam 0 # 0, 0, 0, 0
 quis # 0, 0, 0, undef
--- a/t/04_rules1.cf
+++ b/t/04_rules1.cf
@ -0,0 +1,24 @@
 nostrud 1 GLOBAL LOCAL group
 exercitation 0 GLOBAL LOCAL group
 ullamco GLOBAL LOCAL group
 laboris 1 GLOBAL LOCAL
 nisi 0 GLOBAL LOCAL
 ut GLOBAL LOCAL
 aliquip 1 GLOBAL group
 ex 0 GLOBAL group
 ea GLOBAL group
 commodo 0 GLOBAL
 consequat 1 GLOBAL
 Duis GLOBAL
 aute 1 LOCAL group
 irure 0 LOCAL group
 dolor LOCAL group
 in 1 LOCAL
 reprehenderit 0 LOCAL
 volupatate LOCAL
 velit 1 group
 esse 0 group
 cillum group
 dolore 1
 eu 0
 fugiat
		`@ -0,0 +1 @@`
							`/home/jpm/KeywordRuleGenerator/t/02_files.cf`