Mail-SpamAssassin-KeywordRu.../lib/Mail/SpamAssassin/KeywordRuleGenerator.pm

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::KeywordRuleGenerator - Generate SA rules for keywords

=cut

package Mail::SpamAssassin::KeywordRuleGenerator;

=head1 SYNOPSIS

Generate SpamAssassin compatible configuration files given lists of keywords.
Implemented as a module largely for testing purposes.

    use Mail::SpamAssassin::KeywordRuleGenerator;
    my $kw = Mail::SpamAssassin::KeywordRuleGenerator->new($id);
    $kw->readFile('keywords.cf');
    $kw->writeAll();

=head1 DESCRIPTION

Mail::SpamAssassin::KeywordRuleGenerator does what it says on the tin; it
generates SpamAssassin compatible configuration files to catch keywords that you
specify. Most simply, it can take in one or more properly formatted input files
(see FILES), generate rules for individual files (optional), as well as meta
rules for the counts for each set of keywords (ie. 1 of N hit; 2 of N hit...).
See RULES for more on how the rules are generated.

The sets of keywords can be broken up into groups (see GROUPS).

=head1 PREREQUISITES

Requires C<spamassassin> executable and the following Perl modules

    Mail::SpamAssassin

=cut

use strict;
use warnings;

=head2 FILES

There are built-in functions to ingest formatted list files. See C<readFile>
method. By default, the output file name and the rules therein will use a
stripped and capitalized version of those filenames.

    $kw->readAll( 'example.cf', 'example.txt' );
    $kw->writeAll();

This will creates rules formatted like:

    ID_EXAMPLE_WORD

and will output to the file:

    70_ID_EXAMPLE.cf

Leading './' or $PWD will be stripped, but other paths will be included with '_'
substituted '/'. For example:

    $kw->readAll ( './example.cf', '/to/pwd/example2.cf', 'dir/example3.cf' );

will produce:

    70_KW_EXAMPLE.cf
    70_KW_EXAMPLE2.cf
    70_KW_DIR_EXAMPLE3.cf

If there are any rules in the 'GLOBAL' group, or if in C<singleOutfile> mode,
a file with just the priority and keyword will be produced with the select rules
or all rules, respectively. This will look like:

    70_KW.cf

See the C<generate*> methods for more information on this formatting. Also see
the C<new> method for discussion of the 'id'.

Finally, unless in C<joinScores> mode, each configuration will have a matching
scores file like:

    70_KW_EXAMPLE_SCORES.cf
    70_KW_SCORES.cf

In C<joinScores> mode, the scores will be directly appended in the same config
file with the rule definitions.

Note that the global files may have an incremented priority since it requires
all meta rules from the other files be defined first. For example, the above
would be left with 70 for the global priority, however if the C<id> were lower
in the alphabet, the global files would be incremented:

    70_AB_EXAMPLES.cf
    71_AB.cf

=head2 RULES

Two types of rules are created. One is a set of standalone keyword rules when a
score is provided for those words. This will create a meta rule for a simple
match in either the headers or body

header      __ID_FILE_WORD_H    /\bword\b/i
body        __ID_FILE_WORD_B    /\bword\b/i
meta        __ID_FILE_WORD      ( __ID_FILE_WORD_H || __ID_FILE_WORD_B )
meta        ID_FILE_WORD        __ID_FILE_WORD
describe    ID_FILE_WORD        Keyword 'word' found
score       ID_FILE_WORD        1

The other is a set of counters for each group. These will add the same first
three component rules (or co-opt the ones already created for the standalone
rules). It will then add a rule for each number of possible matches within that
group:

meta        ID_FILE_GROUP_1     ( __ID_FILE_WORD + __ID_FILE_OTHER ) > 0
describe    ID_FILE_GROUP_1     1 match in keyword group 'GROUP'
meta        ID_FILE_GROUP_2     ( __ID_FILE_WORD + __ID_FILE_OTHER ) > 1
describe    ID_FILE_GROUP_2     2 matches in keyword group 'GROUP'

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';

=head1 EXPORT

=cut

use Exporter qw(import);
use base 'Exporter';

our @EXPORT = qw( new );

our %EXPORT_TAGS = (
    'all' => [
        'new',
        'getId',
        'setId',
        'getPriority',
        'setPriority',
        'getOutfile',
        'setOutfile',
        'getScoresOutfile',
        'setScoresOutfile',
        'getGlobalOutfile',
        'setGlobalOutfile',
        'getGlobalScoresOutfile',
        'setGlobalScoresOutfile',
        'getOutput',
        'getScoresOutput',
        'getGlobalOutput',
        'getGlobalScoresOutput',
        'clearFile',
        'nextFile',
        'readFile',
        'getFile',
        'setFile',
        'createDir',
        'cleanDir',
        'generateMetas',
        'generateScored',
        'generateGroups',
        'generateGlobals',
        'generateAll',
        'writeFile',
        'writeAll',
        'verifyOutput'
    ]
);

our @EXPORT_OK = ( @{ $EXPORT_TAGS{all} } );

=head1 METHODS

Except for getter methods, all methods make changes directly within the context
object and only return errors. Thus, each call can be verified by checking that
it returned undef:

    die "Failed to run 'method'\n" if($kw->method($arg));

or:

    my $err = $kw->method($arg);
    die "Received $err\n" if ($err);

Getter methods will return the attribute or undef if it does not exist.

=head2 C<$kw = new( %args )>

Creates a new C<Mail::SpamAssassin::KeywordRuleGenerator> object and returns it.

C<$args> is an optional hash to assign any preliminary attributes or flags to
the object. See various 'get' and 'set' functions.

=cut

sub new
{
    my ($class, $args) = @_;
    my $self = $args;

=head3 Initial attributes

While other attributes will exist within the object during processing, those
which make sense to define up-front and which will not be overwritten by any of
the built-in functions (other than their associated getters) are:

=head3 id

    id scalar
    A short identifier which will be appending to all output files to allow
    for easier recognition of the files' sources. Default: 'KW'

=cut

    $self->{'id'}           //= 'KW';

=head3 priority

    priority scalar
    Typically a number used to define the priority of the rules.
    SpamAssassin will read configuration files in alphabetical order and the
    last iteration of a configuration for the same rule will be used. This
    means that the files read last will overwrite those read earlier. By
    convention the number is used for easy sorting. Any leading aphabetical
    character will be ordered after all numbers, and so given high priority.
    Default: '50'

=cut

    $self->{'priority'}         //= 50;

=head3 debug

    debug boolean
    Enable (1) or disable (0) debugging output. Default: 0

=cut

    $self->{'debug'}        //= 0;

=head3 singleOutfile

    singleOutfile boolean
    Indicates whether output rules should all be added to a single file (1),
    or to one file per input file (0). Note that this still requires
    'joinScores' to have one file total. Default: 0

=cut

    $self->{'singleOutfile'}    //= 0;

=head3 joinScores

    joinScores bootlean
    Indicates whether to include the scores in the same file as their
    associated rule definitions (1) or in a second file on their own (0).
    The second file will simply append '_SCORES' to the file name (prior to
    the '.cf'), unless overridden by C<$kw->setScoreFile($path)> or a second
    path argument in C<$kw->setOutfile($rulePath, $scorePath)>. Default: 1

=cut

    $self->{'joinScores'}       //= 1;

    bless $self, $class;
    return $self;
}

=head2 C<$kw->getId()>

Getter for C<$kw->{'id'}>. 'id' is used for top-level rule names.

=cut

sub getId
{
    my $self = shift;

    if (defined($self->{'id'})) {
        return $self->{'id'};
    }
}

=head2 C<$kw->setId()>

Setter for C<$kw->{'id'}>

=cut

sub setId
{
    my $self = shift;
    my $id = shift;

    if (defined($id)) {
        $id = uc($id);
        return "Must start with a letter" unless ($id =~ m/^[A-Z]/);
        $self->{'id'} = $id || return "Failed to set $id";
    } else {
        return "No ID provided\n";
    }
}

=head2 C<$kw->getPriority()>.

Getter for C<$kw->{'priority'}>. 'priority' value used for the output filepath
to indicate the load order. SpamAssassin reads in ascending order with later
iterations overriding earlier. The 'score' file will increment so that it comes
at the end.

=cut

sub getPriority
{
    my $self = shift;

    if (defined($self->{'priority'})) {
        return $self->{'priority'};
    }
}

=head2 C<$kw->setPriority()>

Setter for C<$kw->{'priority'}>.

=cut

sub setPriority
{
    my $self = shift;
    my $priority = shift;

    if (defined($priority)) {
        $self->{'priority'} = $priority || return "Failed to set priority: $priority";
    } else {
        return "No 'priority' provided\n";
    }
}

=head2 C<$kw->getFile()>

Getter for C<$kw->{'file'}>. This is the name of the current output file being
processed. Not to be confused with C<readFile> or C<getOutfile>.

=cut

sub getFile
{
    my $self = shift;

    if (defined($self->{'file'})) {
        return $self->{'file'};
    }
    return undef;
}

=head2 C<$kw->setFile()>

Setter for C<$kw->{'file'}>.

=cut

sub setFile
{
    my $self = shift;
    my $file = shift;

    if (defined($file)) {
        $self->{'file'} = $file;
        $self->setOutfile();
        $self->setScoresOutfile();
    } else {
        delete($self->{'file'});
    }
}

=head2 C<$kw->getOutfile()>

Getter for C<$kw->{'file'}>. 'file' represents the real filepath of the
output file. With a single argument, the output file for that input file will be
returned. Without an argument, the current working output file will be returned,
if available).

=cut

sub getOutfile
{
    my $self = shift;
    my $file = shift || $self->{'file'};

    if (defined($self->{'filemap'}->{$file})) {
        return $self->{'outfile'};
    }
}

=head2 C<$kw->setOutfile()>

Setter for C<$kw->{'outfile'}>. Can be defined manually with a scalar argument,
otherwise the path is constructed from the existing attributes.

=cut

sub setOutfile
{
    my $self = shift;
    my $path = shift;

    if (defined($path)) {
        $self->{'filemap'}->{$self->{'file'}} = $path || return "Failed to set $path";
    } else {
        if ($self->{'singleOutfile'}) {
            $self->{'filemap'}->{$self->{'file'}} = $self->{'dir'}."/".$self->{'priority'} .
                '_' . uc($self->{'id'}) .
                '.cf';
        } else {
            my $file = $self->{'file'};
            $file =~ s/\//_/g; # Change dir slashes to _
            $file =~ s/(\.[^\.]*)$//g; # Remove extensions
            $file = uc($file); # Convert to uppercase for rule names

            $self->{'filemap'}->{$self->{'file'}} = $self->{'dir'}."/".$self->{'priority'} .
                '_' . uc($self->{'id'}) .
                '_' . $file .
                '.cf';
        }
    }
}

=head2 C<$kw->getScoresOutfile()>

Getter for the output path for scores for the current C<$kw->{'file'}>.

=cut

sub getScoresOutfile
{
    my $self = shift;
    my $file = shift || $self->{'file'};

    $self->setScoresOutfile() unless (defined($self->{'filemap'}->{$file."_SCORES"}));
    return $self->{'filemap'}->{$file."_SCORES"};
}

=head2 C<$kw->setScoresOutfile()>

Setter for the output path for scores for the current C<$kw->{'outfile'}>. Can
be defined manually with a scalar argument, otherwise the path is constructed
from the existing attributes.

=cut

sub setScoresOutfile
{
    my $self = shift;
    my $path = shift;

    if (defined($path)) {
        $self->{'filemap'}->{$self->{'file'}."_SCORES"} = $path || return "Failed to set $path";
        return undef;
    }
    if ($self->{'joinScores'}) {
        if ($self->{'singleOutfile'}) {
            $self->setGlobalOutfile() unless (defined($self->{'filemap'}->{'GLOBAL'}));
            $self->{'filemap'}->{$self->{'file'}."_SCORES"} = $self->{'filemap'}->{'GLOBAL'};
        } else {
            $self->setOutfile() unless (defined($self->{'filemap'}->{$self->{'file'}}));
            $self->{'filemap'}->{$self->{'file'}."_SCORES"} = $self->{'filemap'}->{$self->{'file'}};
        }
        return undef;
    } else {
        if ($self->{'singleOutfile'}) {
            $self->setGlobalScoresOutfile() unless (defined($self->{'filemap'}->{'GLOBAL_SCORES'}));
            $self->{'filemap'}->{$self->{'file'}."_SCORES"} = $self->{'filemap'}->{'GLOBAL_SCORES'};
            return undef;
        }
    }
    my $file = $self->{'file'};
    $file =~ s/\//_/g; # Change dir slashes to _
    $file =~ s/(\.[^\.]*)$//g; # Remove extensions
    $file = uc($file); # Convert to uppercase for rule names
    $self->{'filemap'}->{$self->{'file'}."_SCORES"} = $self->{'dir'}."/".$self->{'priority'} .
        '_' . uc($self->{'id'}) .
        '_' . $file .
        '_SCORES' .
        '.cf';
}

=head2 C<$kw->getGlobalOutfile();>

Return the full path of the output file used for GLOBAL rules. If it is not yet
defined, then C<setGlobalOutfile()> will be run first to try to set it. If this
fails, then nothing will be returned.

=cut

sub getGlobalOutfile
{
    my $self = shift;
    return $self->{'filemap'}->{'GLOBAL'} if (defined($self->{'filemap'}->{'GLOBAL'}));
    my $ret = $self->setGlobalOutfile();
    return $self->{'filemap'}->{'GLOBAL'} unless ($ret);
}

=head2 C<$kw->setGlobalOutfile($file);>

Set the output file for global rules. This file must be either the same or
alphabetically after the last file with 'meta' rules. C<$file> can be used to
bypass that check, but it might lead to rules that do not work.

Will try to select a filename as close to the existing output files as possible.
First it will simply duplicate a name if it is in C<singleOutfile> mode. Then,
it will try to use the base name without the C<$file> portion. This will
generally not work because the '_' before the file will come after the dot
without:

    99_KW.cf
    99_KW_FILE.cf
    99_KW_FILE_SCORES.cf

Next, if the priority is less than 99, it will simply increment that. Finally,
if will try to double the first '_':

    99__KW.cf

If none of these techniques work, it will return an error.

=cut

sub setGlobalOutfile
{
    my $self = shift;
    my $path = shift;

    if (defined($path)) {
        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    if ($self->{'singleOutfile'}) {
        $self->{'filemap'}->{'GLOBAL'} = $self->{'filemap'}->{(keys(%{$self->{'filemap'}}))[0]};
        return undef;
    }
    my $last;
    foreach my $file (keys(%{$self->{'filemap'}})) {
        $last = $file if (!defined($last) || $file gt $last);
    }
    my $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId().".cf";
    if (defined($last) && $file gt $last) {
        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    if ($self->getPriority < 99) {
        $self->{'filemap'}->{'GLOBAL'} = $self->getDir().'/'.($self->getPriority()+1)."_".$self->getId().".cf";
        return undef;
    }
    $file =~ s/_/__/ unless ($file gt $last);
    if ($file gt $last) {
        $self->{'filemap'}->{'GLOBAL'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
}

=head2 C<$kw->getGlobalScoresOutfile();>

Return the full path of the output file used for GLOBAL rules. If it is not yet
defined, then C<setGlobalOutfile()> will be run first to try to set it. If this
fails, then nothing will be returned.

=cut
sub getGlobalScoresOutfile
{
    my $self = shift;
    return $self->{'filemap'}->{'GLOBAL_SCORES'} if (defined($self->{'filemap'}->{'GLOBAL_SCORES'}));
    my $ret = $self->setGlobalScoresOutfile();
    return $self->{'filemap'}->{'GLOBAL_SCORES'} unless ($ret);
}

=head2 C<$kw->setGlobalScoresOutfile($file);>

Set the output file for the scores associated with global rules. This file must
be either the same or alphabetically after the global rules file. C<$file> can
be used to bypass that check, but it might lead to rules that do not work.

Will try to select a filename as close to the existing output files as possible.
First it will simply copy the global rules file name if it is in C<joinScores>
mode. Then, it will try to simply append '_SCORES' to the base file name prior
to the extension. Next, if the priority is less than 99, it will simply
increment that. Finally, if will try to double the first '_'. If none of these
techniques work, it will return an error.

=cut

sub setGlobalScoresOutfile
{
    my $self = shift;
    my $path = shift;

    if (defined($path)) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $path;
        return undef;
    }
    $self->setGlobalOutfile() unless (defined($self->{'filemap'}->{'GLOBAL'}));
    if ($self->{'joinScores'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $self->{'filemap'}->{'GLOBAL'};
        return undef;
    }
    if (defined($self->{'filemap'}->{'GLOBAL_SCORES'}) && $self->{'filemap'}->{'GLOBAL_SCORES'} gt $self->{'filemap'}->{'GLOBAL'}) {
        return undef;
    }
    my $file = $self->getGlobalOutfile() || die "HUH?";
    $file =~ s/\.cf$/_SCORES.cf/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    $file = $self->getDir().'/'.$self->getPriority()."_".$self->getId()."_SCORES.cf";
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    my $gpriority = $self->{'filemap'}->{'GLOBAL'};
    $gpriority =~ s/^(\d\d).*/$1/;
    $file = $self->getDir().'/'.($gpriority+1)."_".$self->getId()."_SCORES.cf" if ($gpriority < 99);
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    $file = $self->{'filemap'}->{'GLOBAL'};
    $file =~ s/_/__/;
    if ($file gt $self->{'filemap'}->{'GLOBAL'}) {
        $self->{'filemap'}->{'GLOBAL_SCORES'} = $file;
        return undef;
    }
    return ("Cannot determine a valid GLOBAL output file\n");
}

=head2 C<$kw->getOutput()>

Returns a reference to the output array buffer for the current C<$file>.

=cut

sub getOutput
{
    my $self = shift;
    my $file = shift || $self->{'file'};

    if ($self->{'singleOutfile'}) {
        return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL'}}};
    } elsif (defined($self->{'filemap'}->{$file})) {
        return \@{$self->{'output'}->{$self->{'filemap'}->{$file}}};
    }
}

=head2 C<$kw->getScoresOutput()>

Returns a reference to the score output array buffer for the current C<$file>.

=cut

sub getScoresOutput
{
    my $self = shift;
    my $file = shift || $self->{'file'};

    $self->setScoreOutfile() unless (defined($self->{'filemap'}->{$file."_SCORES"}));
    if ($self->{'joinScores'}) {
        if ($self->{'singleOutfile'}) {
            return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL'}}};
        } elsif (defined($self->{'filemap'}->{$file})) {
            return \@{$self->{'output'}->{$self->{'filemap'}->{$file}}};
        }
    } else {
        if ($self->{'singleOutfile'}) {
            return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL_SCORES'}}};
        } elsif (defined($self->{'filemap'}->{$file})) {
            return \@{$self->{'output'}->{$self->{'filemap'}->{$file."_SCORES"}}};
        }
    }
}

=head2 C<$kw->getGlobalOutput()>

Returns a reference to the output array buffer for the GLOBAL file.

=cut

sub getGlobalOutput
{
    my $self = shift;

    $self->setGlobalOutfile() unless (defined($self->{'filemap'}->{'GLOBAL'}));
    return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL'}}};
}

=head2 C<$kw->getGlobalScoresOutput()>

Returns a reference to the output array buffer for the GLOBAL scores file.

=cut

sub getGlobalScoresOutput
{
    my $self = shift;

    $self->setGlobalScoresOutfile unless (defined($self->{'filemap'}->{'GLOBAL_SCORES'}));
    if ($self->{'joinScores'}) {
        return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL'}}};
    } else {
        return \@{$self->{'output'}->{$self->{'filemap'}->{'GLOBAL_SCORES'}}};
    }
}

=head2 C<$kw->clearFile();>

Clear the current file reference queue.

=cut

sub clearFile
{
    my $self = shift;
    my $file = shift || $self->{'file'};
    delete($self->{'rules'}->{$file}) || return "Failed to delete rules hash\n";
    return 0;
}

=head2 C<$kw->clearAll();>

Clear all file reference queues.

=cut

sub clearAll
{
    my $self = shift;
    foreach my $file (keys(%{$self->{'rules'}})) {
        %{$self->{'rules'}->{'DEFAULT'}} = () if ($file eq 'DEFAULT');
        $self->clearFile($file);
    }
    return 0;
}
=head2 C<$kw->readFile(%args);>

Read in properly formatted keyword list file. The basic format is one keyword
per line, an optional score, and an optional list of 'groups'.

So, the minimum is just one word per line:

    word

When the score is omitted, it will not have a standalone score. It will be used
solely as part of a keyword group.

When keyword groups are omitted, that keyword defaults to just the 'LOCAL' group
(see GROUPS).

Examples with other combinations are:

    word 0              # Same as previous
    word 2              # Used for LOCAL, not GLOBAL, scores 2
    word GROUP          # GROUP group, not LOCAL, not GLOBAL, no score
    word 0 GROUP        # Same as previous
    word 1 GROUP GLOBAL LOCAL   # GROUP group, in GLOBAL, in LOCAL, scores 1

C<%args> can be used to override the attributes used to index the rules,
including the id, file

This function intentionally does not iterate through the files queue so that you
can step through and make modifications as you go.

=cut

sub readFile
{
    my $self = shift;
    my $file = shift || return 'No file provided';
    my %args = @_;
    # Clean to relative path
    $file =~ s/^\.\///;
    $file =~ s/^$ENV{'PWD'}\///;

    $self->getDir();
    $self->{'file'} = $file;
    $self->setOutfile();

    my $n = 0;
    if (open(my $fh, '<', $file)) {
        my $rules = 0;
        while (<$fh>) {
            $n++;
            # Ignore blank lines and comments
            if ($_ =~ m/^\s*$/ || $_ =~ m/^#/) {
                next;
            # Verify formatting
            } elsif (my ($word, $score, $comment, @groups) = $self->readLine($_)) {
                if ($self->{'debug'}) {
                    print "FOUND: '$word' '$score' " . (join(',',@groups)) . "\n";
                }
                foreach my $group (@groups) {
                    # Global rules must references the file where the component rule is located
                    if ($group eq 'GLOBAL') {
                        $self->{'rules'}->{'GLOBAL'}->{$word} = $self->{'file'};
                    # Local rules have enough information from context to determine the component rule
                    } else {
                        push(@{$self->{'rules'}->{$self->{'file'}}->{$group}}, $word);
                    }
                }
                $self->{'rules'}->{$self->{'file'}}->{'SCORED'}->{$word} = $score if ($score);
                $self->{'rules'}->{$self->{'file'}}->{'COMMENTS'}->{$word} = $comment if ($comment);
            } elsif ($self->{debug}) {
                print STDERR "ERROR: Invalid input in $file, line $n: $_\n";
            }
        }
        print STDERR "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'file'}} || !$self->{'debug'});
        return "No rules found in $file\n" unless ($self->{'rules'}->{$self->{'file'}});
    } else {
        delete($self->{'file'});
        return "Failed to read $file";
    }
    return undef;
}

=head2 C<$kw->readAll();>

Loops through C<readFile> for all files in the queue.

=cut

sub readAll
{
    my $self = shift;
    my @files = @_;

    my @failed;
    foreach my $file (@files) {
        if (!-e $file) {
            push(@failed, "$file does not exist\n");
            print STDERR "$file does not exist\n" if ($self->{'debug'});
        } elsif (-l $file) {
            # Accept only links within PWD
            my $dest = readlink($file);
            if ($dest =~ m/^\//) {
                $dest =~ s/$ENV{'PWD'}\/// ;
                if ($dest eq readlink($file)) {
                    push(@failed, "Symlink outside of PWD $file\n");
                    print STDERR "Symlink outside of PWD $file\n" if ($self->{'debug'});
                    next;
                }
            }
            $self->readAll($dest);
        } elsif (-f $file) {
            push(@failed, "Failed to read $file") if $self->readFile($file);
        } elsif (-d $file) {
            push(@failed, $self->readAll(glob($file.'/*')) );
        # Bash pattern?
        } else {
            if (my @glob = glob($file)) {
                push(@failed, ( @_ )) if $self->readAll(@glob);
            } else {
                print STDERR "Bad file $file\n" if ($self->{'debug'});
            }
        }
    }
    return @failed;
}

=head2 C<$kw->readLine($line)>

Reads a line from a configuration file and returns the relevant values or undef
if the line is not properly formatted.

=cut

sub readLine
{
    my $self = shift;
    my $line = shift;
    my @invalid;
    my ($word, $score, $comment, @groups);
    if (my @sections = $line =~ m/(?:^|\s+)(?:([^\d\s#]\S+)|(\d+(?:\.\d+)?\b)|([^\d\s#]+)|(#.*$))/g) {
        while (@sections) {
            next unless my $section = shift(@sections);
            if (defined($comment)) {
                $comment .= ' ' if ($comment ne '');
                $comment .= $section;
            } elsif (!defined($word) && $section =~ m/^([^\d\s#]\S+)$/) {
                $word = $section;
            } elsif (defined($word) && $section =~ m/^(\d+(?:\.\d+)?)$/ && !defined($score)) {
                $score = $section;
            } elsif (defined($word) && $section =~ m/^([^\d\s#]+)$/) {
                push(@groups, uc($section));
            } elsif (defined($word) && $section =~ m/^#.*$/ && !defined($comment)) {
                $comment = $section;
                $comment =~ s/^#\s*//;
            } else {
                push(@invalid, $section);
            }
        }
        if (scalar(@invalid)) {
            if ($self->{'debug'}) {
                print("Invalid clauses: '".join("', ", @invalid)."' in '$line'\n");
            }
            return undef;
        }
        return undef unless ($word);
        $word = lc($word);
        $score //= 0;
        $comment //= '';
        @groups = ( 'LOCAL' ) unless (scalar(@groups));
        return ( $word, $score, $comment, @groups );
    }
    return ();
}


=head2 C<$kw->getPrefix($file);>

Return a standardized rule prefix using C<id> and C<file>.

=cut

sub getPrefix
{
    my $self = shift;
    my $file = shift || $self->getFile();
    my $id = $self->getId();
    $file =~ s/(?:.*\/)*(.*)\.cf/$1/ if (defined($file));
    return uc($id.(defined($file) ? "_$file" : ''));
}

=head2 C<$kw->generateMetas($file);>

Write component rules for file in the current file, or one set by C<$file> to
make them available to all other rule types.

These are a match for each word across all groups and 'SCORED' in the body and
subject header, then a 'meta' rule to connect them.

    body    __KW_FILE_WORD_BODY /\bword\b/
    header  __KW_FILE_WORD_SUBJ Subject =~ /\bword\b/
    meta    __KW_FILE_WORD ( __KW_FILE_WORD_BODY || __KW_FILE_WORD_SUBJ )

=cut

sub generateMetas
{
    my $self = shift;
    my $words = shift;
    my $file = $self->getFile();
    my $prefix = $self->getPrefix();

    my $output = $self->getOutput();
    my $scoresoutput = $self->getScoresOutput();

    if ($self->{'debug'}) {
        print STDERR "Writing Metas for $file\n";
    }
    if ($self->{'singleOutfile'}) {
        push(@{$output},
            "############".('#'*length($prefix))."#\n".
            "# Metas for $prefix\n".
            "############".('#'*length($prefix))."#\n\n"
        );
    }
    foreach my $word (keys(%{$words})) {
        push(@{$output}, "# ".$words->{$word}."\n") if ($words->{$word});
        push(@{$output},
            "body    __${prefix}_".uc($word).
            "_BODY /\\b${word}\\b/\n",
            "header      __${prefix}_".uc($word).
            "_SUBJ Subject =~ /\\b${word}\\b/\n",
            "meta    __${prefix}_".uc($word).
            " ( __${prefix}_".uc($word)."_BODY || __${prefix}_".
            uc($word)."_SUBJ )\n\n"
        );
    }
}

=head2 C<$kw->generateScored($file);>

Write 'SCORED' word rules for file in the current file, or one set by C<$file>.

These are simply a 'meta' rule for only the existing component rule (the same
rule with a '__' prefix).

    meta    KW_FILE_WORD ( __KW_FILE_WORD )
    score       KW_FILE_WORD 1.0

=cut

sub generateScored
{
    my $self = shift;
    my $file = shift || $self->getFile();
    my $prefix = $self->getPrefix();

    my $output = $self->getOutput();
    my $scoreoutput = $self->getScoresOutput();
    if ($self->{'singleOutfile'}) {
        push(@{$output},
            "############".('#'*length($prefix))."#\n".
            "# Scored words for $prefix\n".
            "############".('#'*length($prefix))."#\n\n"
        );
    }
    foreach ( keys(%{$self->{'rules'}->{$file}->{'SCORED'}}) ) {
        push(@{$output}, "meta    ${prefix}_".uc($_).
            " ( __${prefix}_".uc($_)." )\n"
        );
        if (defined($self->{'rules'}->{$file}->{'COMMENTS'}->{$_})) {
            push(@{$scoreoutput}, "describe    ${prefix}_".uc($_)." ".
                $self->{'rules'}->{$file}->{'COMMENTS'}->{$_}.
                "\n"
            );
        }
        push(@{$scoreoutput},
            "score       ${prefix}_".uc($_)." ".
            $self->{'rules'}->{$file}->{'SCORED'}->{$_}."\n\n"
        );
    }
}

=head2 C<$kw->generateGroups($file);>

Write group rules for file in the current file, or one set by C<$file>. These
are a list of 'meta' rules where the component rules are all of the 'meta's in
the group for each of the available match counts.

    meta    KW_FILE_GROUP_1 ( __KW_WORD1 + __KW_WORD2 ) >= 1
    describe    KW_FILE_GROUP_1 Found 1 word(s) from GROUP
    score       KW_FILE_GROUP_1 0.01

    meta    KW_FILE_GROUP_2 ( __KW_WORD1 + __KW_WORD2 ) >= 2
    describe    KW_FILE_GROUP_2 Found 2 word(s) from GROUP
    score       KW_FILE_GROUP_2 0.01

=cut

sub generateGroups
{
    my $self = shift;
    my $file = shift || $self->getFile();
    my $prefix = $self->getPrefix();

    my $output = $self->getOutput();
    my $scoreoutput = $self->getScoresOutput();
    if ($self->{'singleOutfile'}) {
        push(@{$output},
            "#############".('#'*length($prefix))."#\n".
            "# Groups for $prefix\n".
            "#############".('#'*length($prefix))."#\n\n"
        );
    }
    foreach my $group ( keys(%{$self->{'rules'}->{$file}}) ) {
        next if ( $group eq 'COMMENTS' || $group eq 'SCORED' );
        push(@{$output}, "# $group\n");
        my $gprefix = $prefix;
        unless ($group eq 'LOCAL') {
            $gprefix .= '_'.$group;
        }
        my $start = "meta    ${gprefix}_";
        my $words = " ( ";
        foreach my $word ( @{$self->{'rules'}->{$file}->{$group}} ) {
            $words .= "__${prefix}_".uc($word)." + ";
        }
        $words =~ s/\+ $/\) >= /;
        for (my $i = 1; $i <= scalar(@{$self->{'rules'}->{$file}->{$group}}); $i++) {
            push(@{$output}, $start.$i.$words.$i."\n\n");
            push(@{$scoreoutput},
                "describe    ${gprefix}_$i Found $i $group word"
                . ($i>1 ? 's' : '')." from $prefix\n",
                "score       ${gprefix}_$i 0.01\n\n"
            );
        }
    }
}

=head2 C<$kw->generateGlobals();>

Write 'GLOBAL' group rules. Similar to C<generateGroups> except that component
rules must be included from external files.

    meta    KW_1 ( ( __KW_FILE1_WORD + __KW_FILE2_WORD ) >= 1 )
    describe    KW_1 Found 1 GLOBAL word(s) from KW
    score       KW_1 0.01

    meta    KW_2 ( ( __KW_FILE1_WORD + __KW_FILE2_WORD ) >= 2 )
    describe    KW_2 Found 2 GLOBAL word(s) from KW
    score       KW_2 0.01

=cut

sub generateGlobals
{
    my $self = shift;

    $self->setGlobalOutfile() unless (defined($self->{'filemap'}->{'GLOBAL'}));
    $self->setGlobalScoresOutfile() unless (defined($self->{'filemap'}->{'GLOBAL_SCORES'}));
    my $output = $self->getGlobalOutput();
    my $scoreoutput = $self->getGlobalScoresOutput();
    if ($self->{'singleOutfile'}) {
        push(@{$output},
            "##########\n".
            "# Globals\n".
            "##########\n\n"
        );
    }
    $self->setFile(undef);
    my $gprefix = $self->getPrefix();
    my $start = "meta    ${gprefix}_";
    my $words = " ( ";
    my $prefix;
    foreach my $word ( keys(%{$self->{'rules'}->{'GLOBAL'}}) ) {
        $prefix = $self->getPrefix($self->{'rules'}->{'GLOBAL'}->{$word});
        $words .= "__${prefix}_" . uc($word) . " + ";
    }
    $words =~ s/\+ $/\) >= /;
    for (my $i = 1; $i <= scalar(keys(%{$self->{'rules'}->{'GLOBAL'}})); $i++) {
        my $line = $start.$i.$words.$i."\n";
        push(@{$output}, $line.($self->{'joinScores'} ? '' : "\n"));
        push(@{$scoreoutput}, "describe    ${gprefix}_${i} Found $i ".
            "GLOBAL word(s) from ${gprefix}\n",
            "score       ${gprefix}_${i} 0.01\n\n"
        );
    }
}

sub getDir
{
    my $self = shift;
    my $dir = $self->{'dir'} || $self->setDir();
    return $dir;
}

sub setDir
{
    my $self = shift;
    my $dir = shift || ("$ENV{'PWD'}/$self->{'id'}");
    $dir =~ m/([^\0]+)/;
    $self->{'dir'} = $1;
    return undef;
}

sub createDir
{
    my $self = shift;
    my $dir = shift || $self->getDir();
    unless (-d $dir) {
        mkdir($dir) || return "Failed to mkdir '$dir'";
    }
    return undef;
}

sub cleanDir
{
    my $self = shift;
    my @files = @_;
    unless (scalar(@files)) {
        my %uniq;
        foreach (keys(%{$self->{'filemap'}})) {
            $uniq{$_} = 1;
        }
        @files = keys(%uniq);
    }
    foreach my $file (@files) {
        if (-e $self->{'filemap'}->{$file}) {
            if ($self->{'debug'}) {
                print STDERR "Removing old file ".$self->{'filemap'}->{$file}."\n";
            }
            unlink($self->{'filemap'}->{$file}) || die "Output file '".$self->{'filemap'}->{$file}."' exists and could not be deleted\n";
        }
    }
}

=head2 C<$kw->generateAll($dir)>

Output files will use the name of each input file, stripping any extension, and
forcing the name to uppercase. Rules in each file will be called:

      ID_FILENAME_WORD
      ID_FILENAME_GROUP_1

where

ID      - Meaningful identifier. From C<$kw->new($id)>, or with C<$kw->id($id)>.
FILENAME- Trimmed input file name. Override with C<$kw->file($file)>. Absent
      for GLOBAL.
WORD    - The individual keyword. Used only if it has a independent score.
GROUP   - The group name. Absent for 'LOCAL'.
1       - The count for hits in that group.

For each scoring rule above, there will be constituent meta rules for each
keyword, as well as further consituent rules to match both the subject and the
body for that word.

=cut

sub generateAll
{
    my $self = shift;
    my @written;

    $self->cleanDir() if (-d $self->getDir());
    $self->createDir() unless (-d $self->getDir());

    foreach my $file (keys(%{$self->{'rules'}})) {
        # Reserve GLOBAL for last
        next if ($file eq 'GLOBAL');
        $self->setFile($file);
        my %all;
        foreach my $group (keys(%{$self->{'rules'}->{$file}})) {
            if ($group eq 'SCORED' || $group eq 'COMMENTS') {
                next;
            } else {
                foreach my $word (@{$self->{'rules'}->{$file}->{$group}}) {
                    $all{$word} = $self->{'rules'}->{$file}->{'COMMENTS'}->{$word} || '';
                }
            }
        }
        $self->generateMetas(\%all);
        $self->generateGroups();
        $self->generateScored() unless ($self->{joinScores});
    }
    $self->generateGlobals();
    return 0;
}

=head2 C<$kw->writeFile($file)>

=cut

sub writeFile
{
    my $self = shift;
    my $file = shift || return "Requires filename";
    return "Nothing generated for $file" unless (defined($self->{'output'}->{$file}));

    return unless (scalar(@{$self->{'output'}->{$file}}));
    if (open(my $fh, ">", $file)) {
        foreach my $line (@{$self->{'output'}->{$file}}) {
            print $fh $line;
        }
        close($fh);
    } else {
        return "Could not open $file for writing";
    }
}

=head2 C<$kw->writeAll()>

=cut

sub writeAll
{
    my $self = shift;
    $self->generateAll() unless (defined($self->{'output'}));
    my @errors;
    foreach my $file (keys(%{$self->{'output'}})) {
        my $ret = $self->writeFile($file);
        push(@errors, $ret) if (defined($ret));
    }
    return ("Errors:\n ".join("\n ", @errors)) if (scalar(@errors));
}

=head2 C<$kw->verifyOutput()>

Process output directory with SpamAssassin linter. Return errors or
undef if successful. An optional C<$path> can be provided to test a
different file or directory.

=cut

sub verifyOutput
{
    my $self = shift;
    my $path = shift || $self->{'dir'};

    use Mail::SpamAssassin;
    my $sa = Mail::SpamAssassin->new( { 'site_rules_filename' => $path } );

    return $sa->lint_rules();
}

=head1 MORE

For discussion of the module and examples, see:

E<lt>https://john.me.tz/projects/article.php?topic=Mail-SpamAssassin-KeywordRuleGenerator<gt>

=head1 SEE ALSO

Mail::SpamAssassin
spamassassin

=head1 BUGS

Report issues to:

E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGenerator/issuesE<gt>

=head1 AUTHOR

John Mertz <git@john.me.tz>

=head1 COPYRIGHT

Mail::SpamAssassin::KeywordRuleGenerator is distributed under the Apache License
Version 2.0, as described in this file and the file C<LICENSE> included with the
distribution.

=head1 AVAILABILITY

If possible, the latest version of this library will be made available from CPAN
as well as:

E<lt>https://git.john.me.tz/jpm/Mail-SpamAssassin-KeywordRuleGeneratorE<gt>

=cut

1;