Additional tests

Locate and open files Validate config file input formatting Check that the hash representation of rules read from files is correct
2022-11-28 21:05:01 -05:00 · 2022-11-28 21:05:01 -05:00 · cfbcee7c7d
parent 944c3eed5e
commit cfbcee7c7d
10 changed files with 1131 additions and 85 deletions
--- a/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
+++ b/lib/Mail/SpamAssassin/KeywordRuleGenerator.pm
@ -1,9 +1,19 @@
-#!/usr/bin/perl
-
-package Mail::SpamAssassin::KeywordRuleGenerator;
-
-use strict;
-use warnings;
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>

 =head1 NAME

@ -11,11 +21,7 @@ Mail::SpamAssassin::KeywordRuleGenerator - Generate SA rules for keywords

 =cut

-our $VERSION = '0.01';
-
-=head1 VERSION
-
-Version 0.01
+package Mail::SpamAssassin::KeywordRuleGenerator;

 =head1 SYNOPSIS

@ -23,6 +29,31 @@ Generate SpamAssassin compatible configuration files given lists of keywords.
 Implemented as a module largely for testing purposes.

    use Mail::SpamAssassin::KeywordRuleGenerator;
+    my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new($id);
+    $kw->readFile('keywords.cf');
+    $kw->writeFiles();
+
+=head1 DESCRIPTION
+
+Mail::SpamAssassin::KeywordRuleGenerator does what it says on the tin; it
+generates SpamAssassin compatible configuration files to catch keywords that you
+specify. Most simply, it can take in one or more properly formatted input files
+(see FILES), generate rules for individual files (optional), as well as meta
+rules for the counts for each set of keywords (ie. 1 of N hit; 2 of N hit...).
+See RULES for more on how the rules are generated.
+
+The sets of keywords can be broken up into groups (see GROUPS).
+
+=head1 PREREQUISITES
+
+Requires C<spamassassin> executable and the following Perl modules
+
+        To::Be::Determined
+
+=cut
+
+#use strict;
+use warnings;

 =head2 FILES

@ -77,6 +108,14 @@ describe        ID_FILE_GROUP_1         1 match in keyword group 'GROUP'
 meta            ID_FILE_GROUP_2         ( __ID_FILE_WORD + __ID_FILE_OTHER ) > 1
 describe        ID_FILE_GROUP_2         2 matches in keyword group 'GROUP'

+=head1 VERSION
+
+Version 0.01
+
+=cut
+
+our $VERSION = '0.01';
+
 =head1 EXPORT

 =cut
@ -89,86 +128,284 @@ our @EXPORT = qw( new );
 our %EXPORT_TAGS = (
        'all' => [
                'new',
-                ''
+                'getId',
+                'setId',
+                'getPriority',
+                'setPriority',
+                'getOutfile',
+                'setOutfile',
+                'getScorefile',
+                'setScorefile',
+                'clearFiles',
+                'getFiles',
+                'nextFile',
+                'readFile',
+                'getFile',
+                'setFile',
+                'joinRules',
+                'processMetas',
+                'processWords',
+                'processGroups',
+                'processAll',
+                'writeFile',
+                'writeAll'
        ]
 );

 our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );

-=head1 SUBROUTINES/METHODS
+=head1 METHODS

-=head2 C<$kw = new($id)>
+Except for getter methods, all methods make changes directly within the context
+object and only return errors. Thus, each call can be verified by checking that
+it returned undef:
+
+    die "Failed to run 'method'\n" if($kw->method($arg));
+
+or:
+
+    my $err = $kw->method($arg);
+    die "Received $err\n" if ($err);
+
+Getter methods will return the attribute or undef if it does not exist.
+
+=head2 C<$kw = new( %args )>

 Creates a new C<Mail::SpamAssassin::KeywordRuleGenerator> object and returns it.

-C<$id> is an optional identifier which will be used to prefix any rule output.
-It can be updated at any time via the C<$kw->{id}> value or with the
-C<$kw->id($id)> setter;
+C<$args> is an optional hash to assign any preliminary attributes or flags to
+the object. See various 'get' and 'set' functions.

 =cut

 sub new
 {
-        my ($class, $id) = @_;
+        my ($class, $args) = @_;
+        my $self = $args;

-        $id = uc($id) || 'KW';
-        bless {
-                id => $id || '',
-                keywords => {}
-        } => $class;
-}
+=head3 Initial attributes

-=head2 C<$kw->id()>
+While other attributes will exist within the object during processing, those
+which make sense to define up-front and which will not be overwritten by any of
+the built-in functions (other than their associated getters) are:

-Get/Setter for C<$kw->{'id'}>
+=head3 id
+
+    id scalar
+        A short identifier which will be appending to all output files to allow
+        for easier recognition of the files' sources. Default: 'KW'

 =cut

-sub id
+        $self->{'id'}                   //= 'KW';
+
+=head3 priority
+
+    priority scalar
+        Typically a number used to define the priority of the rules.
+        SpamAssassin will read configuration files in alphabetical order and the
+        last iteration of a configuration for the same rule will be used. This
+        means that the files read last will overwrite those read earlier. By
+        convention the number is used for easy sorting. Any leading aphabetical
+        character will be ordered after all numbers, and so given high priority.
+        Default: '50'
+
+=cut
+
+        $self->{'priority'}             //= 50;
+
+=head3 debug
+
+    debug boolean
+        Enable (1) or disable (0) debugging output. Default: 0
+
+=cut
+
+        $self->{'debug'}                //= 0;
+
+=head3 singleOutfile
+
+    singleOutfile boolean
+        Indicates whether output rules should all be added to a single file (1),
+        or to one file per input file (0). Note that this still requires
+        'joinScores' to have one file total. Default: 0
+
+=cut
+
+        $self->{'singleOutfile'}        //= 0;
+
+=head3 joinScores
+
+    joinScores bootlean
+        Indicates whether to include the scores in the same file as their
+        associated rule definitions (1) or in a second file on their own (0).
+        The second file will simply append '_SCORES' to the file name (prior to
+        the '.cf'), unless overridden by C<$kw->setScoreFile($path)> or a second
+        path argument in C<$kw->setOutfile($rulePath, $scorePath)>. Default: 1
+
+=cut
+
+        $self->{'joinScores'}           //= 1;
+
+        bless $self, $class;
+        return $self;
+}
+
+=head2 C<$kw->getId()>
+
+Getter for C<$kw->{'id'}>. 'id' is used for top-level rule names.
+
+=cut
+
+sub getId
+{
+        my $self = shift;
+
+        if (defined($self->{'id'})) {
+                return $self->{'id'};
+        }
+}
+
+=head2 C<$kw->setId()>
+
+Setter for C<$kw->{'id'}>
+
+=cut
+
+sub setId
 {
        my $self = shift;
        my $id = shift;

        if (defined($id)) {
                $id = uc($id);
-                $self->{'id'} = $id;
+                $self->{'id'} = $id || return "Failed to set $id";
        } else {
-                if (defined($self->{'id'})) {
-                        return $self->{'id'};
-                } else {
-                        return undef;
-                }
+                return "No ID provided\n";
        }
 }

-=head2 C<$kw->file()>
+=head2 C<$kw->getPriority()>.

-Get/Setter for C<$kw->{'file'}>
+Getter for C<$kw->{'priority'}>. 'priority' value used for the output filepath
+to indicate the load order. SpamAssassin reads in ascending order with later
+iterations overriding earlier. The 'score' file will increment so that it comes
+at the end.

 =cut

-sub file
+sub getPriority
+{
+        my $self = shift;
+
+        if (defined($self->{'priority'})) {
+                return $self->{'priority'};
+        }
+}
+
+=head2 C<$kw->setPriority()>
+
+Setter for C<$kw->{'priority'}>.
+
+=cut
+
+sub setPriority
+{
+        my $self = shift;
+        my $priority = shift;
+
+        if (defined($priority)) {
+                $path = uc($priority);
+                $self->{'priority'} = $priority || return "Failed to set priority: $priority";
+        } else {
+                return "No 'priority' provided\n";
+        }
+}
+
+=head2 C<$kw->getFile()>
+
+Getter for C<$kw->{'file'}>. This is the name of the current output file being
+processed. Not to be confused with C<readFile> or C<getOutfile>.
+
+=cut
+
+sub getFile
+{
+        my $self = shift;
+
+        if (defined($self->{'file'})) {
+                return $self->{'file'};
+        }
+}
+
+=head2 C<$kw->setFile()>
+
+Setter for C<$kw->{'file'}>.
+
+=cut
+
+sub setFile
 {
        my $self = shift;
        my $file = shift;

        if (defined($file)) {
-                my $name = $file;
-                $name =~ s/^([a-zA-Z0-9\-_]*)(\..*)?$/$1/;
-                $name =~ s/-/_/;
-                $name = uc($name);
-                die("invalid filename\n") unless ($name =~ m/^[A-Z_]+$/);
-                $self->{'file'} = $name;
+                $self->{'file'} = $file;
        } else {
-                if (defined($self->{'file'})) {
-                        return $self->{'file'};
+                return "No File provided\n";
+        }
+}
+
+=head2 C<$kw->getOutfile()>
+
+Getter for C<$kw->{'outfile'}>. 'outfile' represents the real filepath of the
+output file which is currently being processed.
+
+=cut
+
+sub getOutfile
+{
+        my $self = shift;
+
+        if (defined($self->{'outfile'})) {
+                return $self->{'outfile'};
+        }
+}
+
+=head2 C<$kw->setOutfile()>
+
+Setter for C<$kw->{'outfile'}>. Can be defined manually with a scalar argument,
+otherwise the path is constructed from the existing attributes.
+
+=cut
+
+sub setOutfile
+{
+        my $self = shift;
+        my $path = shift;
+
+        if (defined($path)) {
+                $self->{'outfile'} = $path || return "Failed to set $path";
+        } else {
+                if ($self->{'singleOutfile'}) {
+                        $self->{'outfile'} = $self->{'priority'} .
+                                '_' . uc($self->{'id'}) .
+                                '.cf';
                } else {
-                        return undef;
+                        my $file = $self->{'file'};
+                        $file =~ s/\//_/g; # Change dir slashes to _
+                        $file =~ s/(\.[^\.]*)$//g; # Remove extensions
+                        $file = uc($file); # Convert to uppercase for rule names
+
+                        $self->{'outfile'} = $self->{'priority'} .
+                                '_' . uc($self->{'id'}) .
+                                '_' . $file . 
+                                '.cf';
                }
        }
 }

-=head2 C<$kw->getFile();>
+=head2 C<$kw->getFiles($regex);>

 Simple recursive search for files within a directory. Will validate that each
 file is readable and return an array of file names.
@ -184,24 +421,63 @@ The regex will be used as a file filter and will only return files that match.
 sub getFiles
 {
        my $self = shift;
-        my @args = @_;
+        my $regex = shift;

+        my $return = '';
        foreach (@args) {
-                die "$_ does not exist\n" unless (-e "$_" || -l "$_");
+                $return .= "$_ does not exist\n" unless (-e "$_" || -l "$_");
                if (-l $_) {
-                        getFiles(readlink($_));
+                        $self->getFiles(readlink($_));
                } elsif (-d $_) {
                        my @recursive = glob($_."/*");
-                        getFiles(@recursive);
+                        $self->getFiles(@recursive);
                } else {
-                        die "$_ is not readable\n" unless (-r "$_");
-                        push(@{$self->{files_ref}}, $_);
+                        if (defined($regex)) {
+                                if ($_ =~ $regex) {
+                                        push(@{$self->{'files_ref'}}, $_);
+                                } else {
+                                        next;
+                                }
+                        } else {
+                                $return .= "$_ is not readable\n" unless (-r "$_");
+                                push(@{$self->{'files_ref'}}, $_);
+                        }
                }
        }
-        return @$self->{files_ref};
+        return $return;
 }

-=head2 C<$kw->readFile();>
+=head2 C<$kw->nextFile();>
+
+Shift next message in files_ref queue to current.
+
+=cut
+
+sub nextFile
+{
+        my $self = shift;
+        if (scalar(@{$self->{'files_ref'}})) {
+                $self->{'path'} = shift(@{$self->{'files_ref'}});
+        } else {
+                return "End of array\n";
+        }
+        return 0;
+}
+
+=head2 C<$kw->clearFiles();>
+
+Clear the current file reference queue.
+
+=cut
+
+sub clearFiles
+{
+        my $self = shift;
+        delete($self->{'rules'}) || return "Failed to delete rules hash\n";
+        return 0;
+}
+
+=head2 C<$kw->readFile(%args);>

 Read in properly formatted keyword list file. The basic format is one keyword
 per line, an optional score, and an optional list of 'groups'.
@ -218,45 +494,314 @@ When keyword groups are omitted, that keyword defaults to just the 'LOCAL' group

 Examples with other combinations are:

-# word 0                        Same as previous
-# word 2                        Used for LOCAL, not GLOBAL, scores 2 on it's own
-# word GROUP                    GROUP group, not LOCAL, not GLOBAL, no score
-# word 0 GROUP                  Same as previous
-# word 1 GROUP GLOBAL LOCAL     GROUP group, in GLOBAL, in LOCAL, scores 1
+    word 0                      # Same as previous
+    word 2                      # Used for LOCAL, not GLOBAL, scores 2
+    word GROUP                  # GROUP group, not LOCAL, not GLOBAL, no score
+    word 0 GROUP                # Same as previous
+    word 1 GROUP GLOBAL LOCAL   # GROUP group, in GLOBAL, in LOCAL, scores 1
+
+C<%args> can be used to override the attributes used to index the rules,
+including the id, file
+
+This function intentionally does not iterate through the files queue so that you
+can step through and make modifications as you go.

 =cut

-sub readFiles
+sub readFile
 {
-        my $key_ref = shift;
-        my @files = @_;
+        my $self = shift;
+        my $file = shift || return 'No file provided';
+        my %args = @_;
+        $self->{'file'} = $file;
+        $self->setOutfile();
+        my $n = 0;

-        foreach my $file (@files) {
-                my $n = 0; # Track line number for errors
-                my $name = $file;
-                $name =~ s/\//_/g; # Change dir slashes to _
-                $name =~ s/(\.[^\.]*)*$//g; # Remove extensions
-                $name = uc($name); # Convert to uppercase for rule names
-                if (open(my $fh, '<', $file)) {
-                        while (<$fh>) {
-                                $n++;
-                                # Ignore blank lines and comments
-                                if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) {
-                                        next;
-                                # Verify formatting
-                                } elsif ($_ =~ m/^([^\s]+)\s+([0-9]+)(?:\s+(.*))?/) {
-                                        my ($word, $score, $groups) = ($1, $2, $3);
-                                        print "'$word' '$score' " . (join(',',$groups)) . "\n";
-                                } else {
-                                        die "Invalid input in $file, line $n: $_\n";
+        if (open(my $fh, '<', $file)) {
+                my $rules = 0;
+                while (<$fh>) {
+                        $n++;
+                        # Ignore blank lines and comments
+                        if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) {
+                                next;
+                        # Verify formatting
+                        } elsif (my ($word, $score, $comment, @groups) = $self->readLine($_)) {
+                                if ($self->{'debug'}) {
+                                        print "FOUND: '$word' '$score' " . (join(',',@groups)) . "\n";
                                }
+                                foreach my $group (@groups) {
+                                        if ($group eq 'GLOBAL') {
+                                                push(@{$self->{'rules'}->{'GLOBAL'}}, $word);
+                                        } else {
+                                                push(@{$self->{'rules'}->{$self->{'outfile'}}->{$group}}, $word);
+                                        }
+                                }
+                                $self->{'rules'}->{$self->{'outfile'}}->{'SCORED'}->{$word} = $score if ($score);
+                                $self->{'rules'}->{$self->{'outfile'}}->{'COMMENTS'}->{$word} = $comment if ($comment);
+                        } elsif ($self->{debug}) {
+                                print STDERR "ERROR: Invalid input in $file, line $n: $_\n";
+                        }
+                }
+                print STDERR "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}} || !$self->{'debug'});
+                return "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'outfile'}});
+        } else {
+                delete($self->{'file'});
+                return "Failed to read $file";
+        }
+        return undef;
+}
+
+=head2 C<$kw->readAll();>
+
+Loops through C<readFile> for all files in the queue.
+
+=cut
+
+sub readAll
+{
+        my $self = shift;
+        my @files = ( shift ) || return "No files provided\n";
+        while (scalar(@_)) {
+                push(@files, shift);
+        }
+
+        my @failed;
+        foreach my $file (@files) {
+                if (!-e $file) {
+                        push(@failed, $file);
+                        print STDERR "$file does not exist\n" if ($self->{'debug'});
+                } elsif (-l $file) {
+                        # Accept only links within PWD
+                        my $dest = readlink($file);
+                        if ($dest =~ m/^\//) {
+                                $dest =~ s/$ENV{'PWD'}\/// ;
+                                if ($dest eq readlink($file)) {
+                                        push(@failed, $file);
+                                        print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
+                                        next;
+                                }
+                        }
+                        $self->readAll($dest);
+                } elsif (-f $file) {
+                        push(@failed, "Failed to read $file") if $self->readFile($file);
+                } elsif (-d $file) {
+                        push(@failed, @_) if $self->readAll(glob($file.'/*'));
+                # Bash pattern?
+                } else {
+                        if (my @glob = glob($file)) {
+                                push(@failed, ( @_ )) if $self->readAll(@glob);
+                        } else {
+                                print STDERR "Bad file $file\n" if ($self->{'debug'});
                        }
                }
        }
-        #print "$_\n" foreach(@$files_ref);
+        return @failed if (scalar(@failed));
+        return undef;
 }

-=head 2 C<$kw->writeFiles($out_dir)>
+=head2 C<$kw->readLine($line)>
+
+Reads a line from a configuration file and returns the relevant values or undef
+if the line is not properly formatted.
+
+=cut
+
+sub readLine
+{
+        my $self = shift;
+        my $line = shift;
+        my $invalid = shift;
+        my ($word, $score, $comment, @groups);
+        if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
+                while (@sections) {
+                        next unless my $section = shift(@sections);
+                        if (defined($comment)) {
+                                $comment .= ' ' if ($comment ne '');
+                                $comment .= $section;
+                        } elsif (!defined($word) && $section =~ m/^([^\d\s#]\S+)$/) {
+                                $word = $section;
+                        } elsif (defined($word) && $section =~ m/^(\d+(?:\.\d+)?)$/ && !defined($score)) {
+                                $score = $section;
+                        } elsif (defined($word) && $section =~ m/^([^\d\s#]+)$/) {
+                                push(@groups, $section);
+                        } elsif (defined($word) && $section =~ m/^#.*$/ && !defined($comment)) {
+                                $comment = $section;
+                                $comment =~ s/^#\s*//;
+                        } else {
+                                push(@invalid, $section);
+                        }
+                }
+                if (scalar(@invalid)) {
+                        if ($self->{'debug'}) {
+                                print("Invalid clauses: '".join("', ", @invalid)."' in '$line'\n");
+                        }
+                        return undef;
+                }
+                return undef unless ($word);
+                $word = lc($word);
+                $score //= 0;
+                $comment //= '';
+                @groups = ( 'LOCAL', 'GLOBAL' ) unless (scalar(@groups));
+                return ( $word, $score, $comment, @groups );
+        }
+        return ();
+}
+
+=head2 C<$kw->joinRules(%rules)>
+
+Merge rules hash into working output hash. Generally called from C<readFile>,
+but if you want to add rules manually, it will require the following format:
+
+{
+        'word' => {
+                'score' => 0,
+                'groups' => ( 'LOCAL' )
+        },
+        'other' => {
+                'score' => 1,
+                'groups' => ( 'GLOBAL', 'group' )
+        }
+}
+
+The main context object will provide the working 'ID', 'FILE' and other
+attributes necessary to nest these rules in the existing hash.
+
+=cut
+
+sub joinRules
+{
+        my $self = shift;
+        my %rules = @_;
+
+        if ($self->{'unified'}) {
+                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'.cf';
+        } else {
+                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'out'} = $self->{'priority'}.'_'.$self->{'id'}.'_'.$self->{'file'}.'.cf';
+        }
+        foreach my $word (keys(%rules)) {
+                my $score = $rules{$word}{'score'} || 0;
+                my @groups = $rules{$word}{'groups'} || ( 'GLOBAL' );
+                if ($score) {
+                        $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}->{$word} = $score;
+                }
+                foreach my $group (@groups) {
+                        if ($group eq 'GLOBAL') {
+                                if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}))) {
+                                        push(@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}}, $word);
+                                } else {
+                                        $self->{'rules'}->{$self->{'id'}}->{'GLOBAL'} = ( $word );
+                                }
+                                next;
+                        }
+                        if (scalar(keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}))) {
+                                push(@{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group}}, $word);
+                        } else {
+                                $self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'groups'}->{$group} = ( $word );
+                        }
+                }
+        }
+        return 0;
+}
+
+=head2 C<$kw->processMetas($outfile, $file);>
+
+Create all of the component meta rules for the declared C<$file>. Those that
+will be used for the standalone and count rules. Output to C<$outfile>. This
+must be run before the other process methods and must be run for 'GLOBAL' first,
+otherwise output will be invalid. Meta rules for file-specific words will not be
+generated if they are also in the 'GLOBAL' group, instead the meta rules from
+the 'GLOBAL' file will be used for the count rules in all other files. This
+will prevent duplicates, but also requires that you not rename output files such
+that the they appear before the 'GLOBAL' file (without '_C<$file>' at the end).
+
+=cut
+
+sub processMetas
+{
+        my $self = shift;
+        my $outfile = shift;
+        my $file = shift;
+        my $rules = shift;
+
+        my $prefix = $self->{'id'};
+        my @words;
+        if ($file eq 'GLOBAL') {
+                @words = @{$rules->{'GLOBAL'}};
+        } else {
+                $prefix .= "_".$file;
+                foreach (keys(%{$rules->{$file}->{'groups'}})) {
+                        next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}});
+                }
+                foreach (keys(%{$self->{'rules'}->{$self->{'id'}}->{$self->{'file'}}->{'words'}})) {
+                        next if (grep {/^$_$/} @{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}});
+                        next if (grep {/^$_$/} @words);
+                        push (@words, $_);
+                }
+        }
+        foreach my $word (@words) {
+                $self->{'output'}->{$outfile} .=
+                        "body\t__".$prefix."_".uc($word)."_BODY\t/\\b".$word."\\b/\n" .
+                        "header\t__".$prefix."_".uc($word)."_SUBJ\tSubject =~ /\\b".$word."\\b/\n" .
+                        "meta\t__".$prefix."_".uc($word)."\t( __".$prefix."_".uc($word)."_BODY || __".$prefix."_".uc($word)."_SUBJ )\n\n";
+        }
+
+}
+
+=head2 C<$kw->processWords($outfile,%args);>
+
+Take a list of words with scores and add them to C<$outfile>.
+
+=cut
+
+sub processWords
+{
+        my $self = shift;
+        my $out = shift;
+}
+
+=head2 C<$kw->processGroup(%args);>
+
+Take a single group, including 'GLOBAL' and add it to C<$outfile>.
+
+=cut
+
+sub processGroup
+{
+        my $self = shift;
+        my $outfile = shift;
+
+        for (my $i = 0; $i < scalar(@all); $i++) {
+                $files->{$self->{'priority'}.'_'.$self->{'id'}.'cf'} .=
+                        "meta\t".$self->{'id'}."_".$i."\t( ".join(' + ',@all)." ) > $i\n" .
+                        "describe\tMatched ".($i+1)."of keywords: ".join(', ',@{$self->{'rules'}->{$self->{'id'}}->{'GLOBAL'}})."\n\n";
+        }
+}
+
+=head2 C<$kw->processAll(%args);>
+
+Process all groups.
+
+=cut
+
+sub processAll
+{
+        my $self = shift;
+        my %args = @_;
+
+        $self->{'output'} = {};
+        foreach my $id (keys(%{$self->{'rules'}})) {
+                $self->processMetas($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                $self->processGroups($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                $self->processWords($self->{'priority'}.'_'.$id.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                foreach my $file (keys(%{$self->{'rules'}->{$id}})) {
+                        $self->processMetas($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                        $self->processGroups($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                        $self->processWords($self->{'priority'}.'_'.$id.'_'.$file.'.cf','GLOBAL',$self->{'rules'}->{$id});
+                }
+        }
+}
+
+=head2 C<$kw->writeFiles($out_dir)>

 Output files will use the name of each input file, stripping any extension, and
 forcing the name to uppercase. Rules in each file will be called:
@ -282,13 +827,60 @@ body for that word.
 sub writeFiles
 {
        my $self = shift;
-        my $dir = shift;
+        foreach my $out (keys(%{$self->{'rules'}})) {
+                unless ($self->{'joinScores'}) {
+                        $file =~ s/(.*)\.cf/$1_SCORES.cf/;
+                        print STDERR $self->{'scores'};
+                }
+                if (open(my $fh, '>', $out)) {
+                        print $fh $self->{'output'}->{$out};
+                        close($fh);
+                } else {
+                        print STDERR "Failed to write $out\n";
+                }
+        }

-        use Data::Dump;
-        print Data::Dump::dump($self->{keywords});
        return 0;
 }

+=head1 MORE
+
+For discussion of the module and examples, see:
+
+E<lt>https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator<gt>
+
+=head1 SEE ALSO
+
+Mail::SpamAssassin
+spamassassin
+
+=head1 BUGS
+
+Report issues to:
+
+E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issuesE<gt>
+
+=head1 AUTHOR
+
+John Mertz <git@john.me.tz>
+
+=head1 COPYRIGHT
+
+Mail::SpamAssassin::KeywordRuleGenerator is distributed under the Apache License
+Version 2.0, as described in this file and the file C<LICENSE> included with the
+distribution.
+
+=head1 AVAILABILITY
+
+If possible, the latest version of this library will be made available from CPAN
+as well as:
+
+E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGeneratorE<gt>
+
+=cut
+
+1;
+
 =pod
 die "Please provide rules file(s) as an argument\n" unless (defined($ARGV[0]));

--- a/t/02_files.cf
+++ b/t/02_files.cf
@ -0,0 +1 @@
+word
--- a/t/02_files.dir/02_one.cf
+++ b/t/02_files.dir/02_one.cf
@ -0,0 +1,3 @@
+oneone 2 LOCAL
+onetwo 0 GLOBAL
+onethree three
--- a/t/02_files.dir/02_two.cf
+++ b/t/02_files.dir/02_two.cf
@ -0,0 +1,3 @@
+twoone 2 LOCAL
+twotwo 0 GLOBAL
+twothree three
--- a/t/02_files.lnk
+++ b/t/02_files.lnk
@ -0,0 +1 @@
+/home/jpm/KeywordRuleGenerator/t/02_files.cf
--- a/t/02_files.t
+++ b/t/02_files.t
@ -0,0 +1,47 @@
+#!/usr/bin/perl
+
+# Basic test to ensure files can be found and read. Verification will come later
+
+use Test::More;
+
+use lib 'lib/';
+use Mail::SpamAssassin::KeywordRuleGenerator;
+
+my $id = '02';
+
+my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id } );
+
+# Read a single file
+my $file = 't/02_files.cf';
+ok (!$kw->readFile($file), "Run readFile on $file");
+ok ($kw->getFile($file) == $file, "Get current working file");
+ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files in rules");
+ok ($kw->{'rules'}->{'50_02_T_02_FILES.cf'}, "Loaded correct output name");
+ok (!$kw->clearFiles(), "Clear 'rules' hash");
+ok (!$kw->{'rules'}, "Fetched none after clearing");
+
+# Read a directory
+my $dir = 't/02_files.dir';
+ok (!$kw->readAll($dir), "Run file on dir");
+ok (scalar(keys(%{$kw->{'rules'}})) == 3, "Fetched correct number of files from dir");
+ok (!$kw->clearFiles(), "Clear dir files");
+
+# Read a symlink
+my $link = 't/02_files.lnk';
+ok (!$kw->readAll($link), "Run file on link");
+ok ($kw->getFile() == $file, "Fetched correct name ($file) from link ($link)");
+ok (scalar(keys(%{$kw->{'rules'}})) == 2, "Fetched correct number of files from link");
+ok (!$kw->clearFiles(), "Clear link files");
+
+# Read multiple mixed
+ok (!$kw->readAll($file, $dir, $link), "Run readAll on multiple/mixed");
+ok (scalar(keys(%{$kw->{'rules'}})) == 4, "Fetched correct number of files from all");
+ok (!$kw->clearFiles(), "Clear dir files");
+
+
+ok ($kw->readFile('does_not_exist'), "Correctly failed for non-existent file");
+ok (!$kw->getFile(), "Current name undefined for non-existent file");
+ok ($kw->readFile('t/02_unreadable.cf'), "Correctly failed for non-readable file");
+ok (!$kw->getFile(), "Current name undefined for non-readable file");
+
+done_testing();
--- a/t/03_verify.t
+++ b/t/03_verify.t
@ -0,0 +1,64 @@
+#!/usr/bin/perl
+
+# Verify file input line formatting
+
+use Test::More;
+
+use lib 'lib/';
+use Mail::SpamAssassin::KeywordRuleGenerator;
+
+my $id = '03';
+
+my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } );
+
+my @good = (
+        'Lorem 1 GLOBAL # comment',
+        'ipsum 0 GLOBAL # comment',
+        'dolor GLOBAL # comment',
+        'sit 1 # comment',
+        'amet 0 # comment',
+        'consectetur # comment',
+        'adipiscing 1 GLOBAL',
+        'elit 0 GLOBAL',
+        'sed GLOBAL',
+        'do 1',
+        'eiusmod 0',
+        'tempor',
+);
+foreach my $score ( '', 0, 1 ) {
+        foreach my $group ( '', 'GLOBAL', 'GLOBAL LOCAL' ) {
+                foreach my $comment ( '', 'TESTING', 'LONGER COMMENT' ) {
+                        my $word = "word";
+                        my $rule = $word;
+                        if ($score ne '') {
+                                $rule .= ' '.$score;
+                        }
+                        if ($group ne '') {
+                                $rule .= ' '.$group;
+                        }
+                        if ($comment ne '') {
+                                $rule .= ' # '.$comment;
+                        }
+                        my ($rword, $rscore, $rcomment, @rgroups) = $kw->readLine($rule);
+                        ok($rword eq $word, "Word '$word' found for '$rule'");
+                        ok($rscore eq $score, "Score '$score' found for '$rule'") if ($score ne '');
+                        ok(join(' ',@rgroups) eq $group, "Groups '$group' found for '$rule'") if ($group ne '');
+                        ok(join(' ',@rgroups) eq 'LOCAL GLOBAL', "Inferred groups 'LOCAL GLOBAL' found for '$rule'") if ($group eq '');
+                        ok($rcomment eq $comment, "Comment '$comment' found for '$rule'") if ($comment ne '');
+                }
+        }
+}
+
+# Check for bad formatting
+
+my %bad = (
+        '' => 'Ignore empty line',
+        '# comment' => "Ignore line starting with comment",
+        'word 1 2' => "Ignore line with multiple scores",
+        '2 bad' => "Ignore line with score first",
+);
+foreach my $input (keys(%bad)) {
+        ok(!$kw->readLine($input), "$bad{$input} ($input)");
+}
+
+done_testing();
--- a/t/04_rules.t
+++ b/t/04_rules.t
@ -0,0 +1,283 @@
+#!/usr/bin/perl
+
+# Basic test to ensure files can be found and read. Verification will come later
+
+use Test::More;
+
+use lib 'lib/';
+use Mail::SpamAssassin::KeywordRuleGenerator;
+
+my $id = '04';
+
+my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new( { 'id' => $id, 'debug' => 0 } );
+
+my @files = ( 't/04_rules0.cf', 't/04_rules1.cf' );
+my @failed = @{$kw->readAll( @files )};
+ok(!scalar(@failed), "Load 'rules' hash with readAll");
+
+my $expected = getExpected();
+my ( $missing, $extra, $incorrect ) = 0;
+use Data::Dump;
+foreach my $file (keys(%{$expected})) {
+        if ($file eq 'GLOBAL') {
+                my ($m, $e) = compareGroups(
+                        $expected->{$file},
+                        $kw->{'rules'}->{$file}
+                );
+                print("Missing $m words\n") if ($m);
+                print("$e extra words\n") if ($e);
+                $missing += $m;
+                $extra += $e;
+        } else {
+                foreach my $group (keys(%{$expected->{$file}})) {
+                        if ($group eq 'SCORED') {
+                                my ($m, $e, $i) = compareScores(
+                                        $expected->{$file}->{$group},
+                                        $kw->{'rules'}->{$file}->{$group}
+                                );
+                                $missing += $m;
+                                $extra += $e;
+                                $incorrect += $i;
+                        } elsif ($group eq 'COMMENTS') {
+                                my ($m, $e, $i) = compareScores(
+                                        $expected->{$file}->{$group},
+                                        $kw->{'rules'}->{$file}->{$group}
+                                );
+                                $missing += $m;
+                                $extra += $e;
+                                $incorrect += $i;
+                        } else {
+                                my ($m, $e) = compareGroups(
+                                        $expected->{$file}->{$group},
+                                        $kw->{'rules'}->{$file}->{$group}
+                                );
+                                $missing += $m;
+                                $extra += $e;
+                        }
+                }
+        }
+}
+
+ok ($missing == 0, "No expected rules are missing");
+ok ($extra == 0, "No extra rules are found");
+ok ($incorrect == 0, "No incorrect scores are found");
+
+done_testing();
+
+sub compareGroups
+{
+        my $expect = shift;
+        my $loaded = shift;
+
+        my @e = sort(@{$expect}); 
+        my @l = sort(@{$loaded}); 
+        my ($missing, $extra) = (0, 0);
+
+        while (scalar(@e)) {
+                unless (scalar(@l)) {
+                        print("extra words @l\n");
+                        $extra += scalar(@e);
+                        last;
+                }
+                if ($e[0] eq $l[0]) {
+                        shift(@e);
+                        shift(@l);
+                        next();
+                }
+                if ($e[0] lt $l[0]) {
+                        print("Extra word $e[0]\n");
+                        $extra++;
+                        shift(@e);
+                        next;
+                }
+                if ($e[0] gt $l[0]) {
+                        print("Missing word $l[0]\n");
+                        $missing++;
+                        shift(@l);
+                        next;
+                }
+        }
+        if (scalar(@e)) {
+                print("Missing ".scalar(@e)." at the end of parsing\n");
+                $missing += scalar(@e);
+                last;
+        }
+
+        return ($missing, $extra);
+}
+
+sub compareScores
+{
+        my $expect = shift;
+        my $loaded = shift;
+
+        my %remaining = %{$loaded};
+        my ($missing, $extra, $incorrect) = (0, 0, 0);
+        foreach my $word (keys(%$expect)) {
+                if (!defined($loaded->{$word})) {
+                        print("Missing score assignment for $word\n");
+                        $missing++;
+                } elsif ($expect->{$word} != $loaded->{$word}) {
+                        print("Incorrect score assignment for $word\n");
+                        $incorrect++;
+                } else {
+                        delete($remaining{$word});
+                }
+        }
+        $extra = scalar(keys(%remaining));
+        print("Extra score assignment for $word\n") foreach (@{$extra});
+
+        return ($missing, $extra, $incorrect);
+}
+
+sub getExpected
+{
+        my %expected = (
+                'GLOBAL' => [
+                        'lorem',
+                        'ipsum',
+                        'dolor',
+                        'sit',
+                        'amet',
+                        'consectetur',
+                        'adipiscing',
+                        'elit',
+                        'sed',
+                        'do',
+                        'eiusmod',
+                        'tempor',
+                        'minim',
+                        'veniam',
+                        'quis',
+                        'nostrud',
+                        'exercitation',
+                        'ullamco',
+                        'laboris',
+                        'nisi',
+                        'ut',
+                        'aliquip',
+                        'ex',
+                        'ea',
+                        'commodo',
+                        'consequat',
+                        'duis',
+                        'dolore',
+                        'eu',
+                        'fugiat',
+                ],
+                '50_04_T_04_RULES0.cf' => {
+                        'SCORED' => {
+                                'lorem' => 1,
+                                'sit' => 1,
+                                'adipiscing' => 1,
+                                'do' => 1,
+                                'incididunt' => 1,
+                                'et' => 1,
+                                'aliqua' => 1,
+                                'minim' => 1,
+                        },
+                        'COMMENTS' => {
+                                'lorem' => '1, 1, 1, 1',
+                                'ipsum' => '1, 1, 1, 0',
+                                'dolor' => '1, 1, 1, undef',
+                                'sit' => '1, 1, 0, 1',
+                                'amet' => '1, 1, 0, 0',
+                                'consectetur' => '1, 1, 0, undef',
+                                'adipiscing' => '1, 0, 1, 1',
+                                'elit' => '1, 0, 1, 0',
+                                'sed' => '1, 0, 1, undef',
+                                'do' => '1, 0, 0, 1',
+                                'eiusmod' => '1, 0, 0, 0',
+                                'tempor' => '1, 0, 0, undef',
+                                'incididunt' => '0, 1, 1, 1',
+                                'ut' => '0, 1, 1, 0',
+                                'labore' => '0, 1, 1, undef',
+                                'et' => '0, 1, 0, 1',
+                                'dolore' => '0, 1, 0, 0',
+                                'magna' => '0, 1, 0, undef',
+                                'aliqua' => '0, 0, 1, 1',
+                                'enim' => '0, 0, 1, 0',
+                                'ad' => '0, 0, 1, undef',
+                                'minim' => '0, 0, 0, 1',
+                                'veniam' => '0, 0, 0, 0',
+                                'quis' => '0, 0, 0, undef',
+                        },
+                        'LOCAL' => [
+                                'lorem',
+                                'ipsum',
+                                'dolor',
+                                'sit',
+                                'amet',
+                                'consectetur',
+                                'incididunt',
+                                'ut',
+                                'labore',
+                                'et',
+                                'dolore',
+                                'magna',
+                                'minim',
+                                'veniam',
+                                'quis',
+                        ],
+                        'group' => [
+                                'lorem',
+                                'ipsum',
+                                'dolor',
+                                'adipiscing',
+                                'elit',
+                                'sed',
+                                'incididunt',
+                                'ut',
+                                'labore',
+                                'aliqua',
+                                'enim',
+                                'ad',
+                        ]
+                },
+                '50_04_T_04_RULES1.cf' => {
+                        'SCORED' => {
+                                'nostrud' => 1,
+                                'laboris' => 1,
+                                'aliquip' => 1,
+                                'consequat' => 1,
+                                'aute' => 1,
+                                'in' => 1,
+                                'velit' => 1,
+                                'dolore' => 1,
+                        },
+                        'LOCAL' => [
+                                'nostrud',
+                                'exercitation',
+                                'ullamco',
+                                'laboris',
+                                'nisi',
+                                'ut',
+                                'aute',
+                                'irure',
+                                'dolor',
+                                'in',
+                                'reprehenderit',
+                                'volupatate',
+                                'dolore',
+                                'eu',
+                                'fugiat',
+                        ],
+                        'group' => [
+                                'nostrud',
+                                'exercitation',
+                                'ullamco',
+                                'aliquip',
+                                'ex',
+                                'ea',
+                                'aute',
+                                'irure',
+                                'dolor',
+                                'velit',
+                                'esse',
+                                'cillum',
+                        ]
+                }
+        );
+
+        return \%expected;
+}
--- a/t/04_rules0.cf
+++ b/t/04_rules0.cf
@ -0,0 +1,28 @@
+# Comments to be ignored. As with following blank line.
+# Must test every combination of:
+# GLOBAL, LOCAL, GROUP, SCORE
+
+Lorem 1 GLOBAL LOCAL group # 1, 1, 1, 1
+ipsum 0 GLOBAL LOCAL group # 1, 1, 1, 0
+dolor GLOBAL LOCAL group # 1, 1, 1, undef
+sit 1 GLOBAL LOCAL # 1, 1, 0, 1
+amet 0 GLOBAL LOCAL # 1, 1, 0, 0
+consectetur GLOBAL LOCAL # 1, 1, 0, undef
+adipiscing 1 GLOBAL group # 1, 0, 1, 1
+elit 0 GLOBAL group # 1, 0, 1, 0
+sed GLOBAL group # 1, 0, 1, undef
+do 1 GLOBAL # 1, 0, 0, 1
+eiusmod 0 GLOBAL # 1, 0, 0, 0
+tempor GLOBAL # 1, 0, 0, undef
+incididunt 1 LOCAL group # 0, 1, 1, 1
+ut 0 LOCAL group # 0, 1, 1, 0
+labore LOCAL group # 0, 1, 1, undef
+et 1 LOCAL # 0, 1, 0, 1
+dolore 0 LOCAL # 0, 1, 0, 0
+magna LOCAL # 0, 1, 0, undef
+aliqua 1 group # 0, 0, 1, 1
+enim 0 group # 0, 0, 1, 0
+ad group # 0, 0, 1, undef
+minim 1 # 0, 0, 0, 1
+veniam 0 # 0, 0, 0, 0
+quis # 0, 0, 0, undef
--- a/t/04_rules1.cf
+++ b/t/04_rules1.cf
@ -0,0 +1,24 @@
+nostrud 1 GLOBAL LOCAL group
+exercitation 0 GLOBAL LOCAL group
+ullamco GLOBAL LOCAL group
+laboris 1 GLOBAL LOCAL
+nisi 0 GLOBAL LOCAL
+ut GLOBAL LOCAL
+aliquip 1 GLOBAL group
+ex 0 GLOBAL group
+ea GLOBAL group
+commodo 0 GLOBAL
+consequat 1 GLOBAL
+Duis GLOBAL
+aute 1 LOCAL group
+irure 0 LOCAL group
+dolor LOCAL group
+in 1 LOCAL
+reprehenderit 0 LOCAL
+volupatate LOCAL
+velit 1 group
+esse 0 group
+cillum group
+dolore 1
+eu 0
+fugiat
				`@ -0,0 +1 @@`
				`/home/jpm/KeywordRuleGenerator/t/02_files.cf`