#!/usr/bin/perl

use strict;
use warnings;
use vars qw#$DEBUG#;

use Carp;
use DBI;
use Getopt::Long;

our $DB_NAME = 'obmf';
our $DB_HOST = 'localhost';
our $DB_USER = 'obmf_user';
our $DB_PASS = '';

=head1 NAME

octo's bayesian mail filter - version 0.05

parse mail with a little help from Thomas B.

=head1 USAGE

obmf.pl <mode>

 Modes:
  --filter	Use as a filter
  --spam	Mark the mail from STDIN as spam
  --good	Mark the mail from STDIN as good
  --score	Display a messages score

obmf expects to read a mail from STDIN and prints it's output to STDOUT.

=cut

our $DBH;
our %STH = ();

$DEBUG = 0;
my $filter = 0;
my $spam = 0;
my $good = 0;
my $score = 0;

GetOptions (filter => \$filter, spam => \$spam, good => \$good,
	score => \$score, debug => \$DEBUG)
	or die "Unable to parse arguments";

if (!($filter || $spam || $good || $score))
{
	print STDERR "Usage: ", $0, " <mode>", $/,
		"Run `perldoc ", $0, "' for the documentation.", $/;
	exit (1);
}

$DBH = DBI->connect ("DBI:mysql:database=$DB_NAME;host=$DB_HOST", $DB_USER, $DB_PASS)
	or die DBI->errstr ();

while (1)
{
	my ($head, $content) = read_stdin ();
	last unless ($head && $content);
	my $msgid = get_message_id ($head);
	my $text = remove_non_text ($head, $content);
	my @tokens = get_tokens_from_text ($text);

=head1 OPTIONS

=over 4

=item --filter

If you want to use this program as a filter:
The mail is read from STDIN, processed, a X-Spam-Probability is appended
and the tokens are, eventually, added to the database. After
that the mail is written to STDOUT again for further processing. This is
what you want to put into your .procmailrc

=cut

	if ($filter)
	{
		my @probs = map { calculate_probability_for_token ($_) } (@tokens);
		my ($m_tokens, $m_probs) = get_most_interesting (\@tokens, \@probs);
		my $prob = calculate_total_probability (@$m_probs);

		print STDOUT $head, $/;
		printf STDOUT ("X-Spam-Probability: %02.2f%%\n\n", $prob * 100);
		print STDOUT $content;
	
		if (!mail_in_db ($msgid))
		{
			if ($prob > 0.5)
			{
				mark_mail ('spam', $msgid);
				mark_tokens ('spam', @tokens);
			}
			elsif ($prob < 0.5)
			{
				mark_mail ('good', $msgid);
				mark_tokens ('good', @tokens);
			}
		}
	}

=item --spam

Reads the mail from STDIN and adds the tokens to the database, maked as
"spam". If the mail has been processed before (will be recognized by the
"Message-ID" header field) and the mail was not marked "spam" before it
will now be and the tokens are updated. The mail is not altered in any
way, so if you pipe a mail that has been processed using the --filter
option, the "X-Spam-Probability" header field will B<not> change!

=cut

	elsif ($spam)
	{
		if (mail_in_db ($msgid))
		{
			my $type = get_mail_type ($msgid);
			
			if ($type eq 'good')
			{
				change_mail   ('spam', $msgid);
				change_tokens ('spam', @tokens);
				print STDOUT "Message changed to `spam'", $/;
			}
			else
			{
				print STDOUT "Message already marked as `spam'", $/;
			}
		}
		else
		{
			mark_mail   ('spam', $msgid);
			mark_tokens ('spam', @tokens);
			print STDOUT "Message marked as `spam'", $/;
		}
	}

=item --good

Same as --spam, but vice versa..

=cut

	elsif ($good)
	{
		if (mail_in_db ($msgid))
		{
			my $type = get_mail_type ($msgid);
			
			if ($type eq 'spam')
			{
				change_mail   ('good', $msgid);
				change_tokens ('good', @tokens);
				print STDOUT "Message changed to `good'", $/;
			}
			else
			{
				print STDOUT "Message already marked as `good'", $/;
			}
		}
		else
		{
			mark_mail   ('good', $msgid);
			mark_tokens ('good', @tokens);
			print STDOUT "Message marked as `good'", $/;
		}
	}

=item --score

Display a message's probability of being spam

=cut

	elsif ($score)
	{
		my @probs = map { calculate_probability_for_token ($_); } (@tokens);
		my ($m_tokens, $m_probs) = get_most_interesting (\@tokens, \@probs);
		my $prob = calculate_total_probability (@$m_probs);
		
		while (@$m_tokens && @$m_probs)
		{
			my $t = shift (@$m_tokens);
			my $p = shift (@$m_probs);
			
			if (length ($t) > 20) { $t = substr ($t, 0, 20); }
			printf ("  %20s - %02.2f%%\n", $t, $p * 100);
		}
	
		printf STDOUT ("Score: %02.2f%%\n", $prob * 100);
	}
							
	else { die; }
}

=item --debug

Turns on debugging (as you might have guessed..)

=back

=cut

$DBH->disconnect ();

exit (0);

=head1 DATABASE LAYOUT

There are two tables which are used to store information about the mails
and one for the tokens:

=head2 Table "mails"

=over 4

=item messageid

The message ID (B<primary key>).

=item user

The username this mail belongs to.

=item type

"good" or "spam".

=item date

The date this entry was made. May be used by a cronjob to clean up
database.

=back

=head2 Table "tokens"

=over 4

=item token

The token itself (B<primary key>).

=item good

Number of occurences in good mails.

=item spam

Number of occurences in spam mails.

=item lastseen

The date this token has been seen the last time. May be used by
a cronjob to clean up database.

=back

=head1 INTERNALS

=head2 calculate_probability_for_token ($token);

Returns the probability of the mail containing this token being spam, which
calculated using Bayes' law. If the token did not yet appear in spam-
and/or regular mail, one occurance is being assumed. This is neccessary to
not force the result to become zero. However, the result is forced to be
somewhere between 0.01 and 0.99; again, for mathematical reasons.

=cut

sub calculate_probability_for_token
{
	my $token = shift;

	my $sth_mails;
	my $sth_tokens;
	
	if (defined ($STH{'calculate_probability_for_token__tokens'}))
	{
		$sth_tokens = $STH{'calculate_probability_for_token__tokens'};
	}
	else
	{
		$sth_tokens = $DBH->prepare ('SELECT good, spam FROM tokens WHERE token = ?')
			or die ('prepare: ' . $DBH->errstr ());
		$STH{'calculate_probability_for_token__tokens'} = $sth_tokens;
	}

	if (defined ($STH{'calculate_probability_for_token__mails'}))
	{
		$sth_mails = $STH{'calculate_probability_for_token__mails'};
	}
	else
	{
		$sth_mails = $DBH->prepare ('SELECT COUNT(*) FROM mails WHERE type = ?')
			or die ('prepare: ' . $DBH->errstr ());
		$STH{'calculate_probability_for_token__mails'} = $sth_mails;
	}
	
	$sth_mails->execute ('good')
		or die ('execute: ' . $sth_mails->errstr ());
	my ($total_good_mails) = $sth_mails->fetchrow_array ();
	$sth_mails->finish ();

	$sth_mails->execute ('spam')
		or die ('execute: ' . $sth_mails->errstr ());
	my ($total_spam_mails) = $sth_mails->fetchrow_array ();
	$sth_mails->finish ();

	$total_good_mails = 1
		unless (defined ($total_good_mails) and $total_good_mails);

	$total_spam_mails = 1
		unless (defined ($total_spam_mails) and $total_spam_mails);

	my $total_mails = $total_good_mails + $total_spam_mails;

	$sth_tokens->execute ($token)
		or die ('execute: ' . $sth_tokens->errstr ());
	my ($total_good_token, $total_spam_token) = $sth_tokens->fetchrow_array ();
	$sth_tokens->finish ();

	$total_good_token = 1
		unless (defined ($total_good_token) and $total_good_token);
	$total_spam_token = 1
		unless (defined ($total_spam_token) and $total_spam_token);

# FIXME ?
#	if (($total_good_token + $total_spam_token) < 5)
#	{
#		print STDERR "Not enough occurences of '$token', ",
#			"returning 0.50\n" if $DEBUG;
#		return (0.50);
#	}
	
	if ($total_good_token > $total_good_mails)
	{
		confess ('$total_good_token > $total_good_mails');
	}

	if ($total_spam_token > $total_spam_mails)
	{
		confess ('$total_spam_token > $total_spam_mails');
	}
	
	my $prob_good_total = $total_good_mails / $total_mails;
	my $prob_spam_total = $total_spam_mails / $total_mails;

	my $prob_good_token = $total_good_token / $total_good_mails;
	my $prob_spam_token = $total_spam_token / $total_spam_mails;

	my $prob_this_token_is_spam =
		($prob_spam_total * $prob_spam_token) /
		(($prob_spam_total * $prob_spam_token) +
			($prob_good_total * $prob_good_token));
		
	$prob_this_token_is_spam = 0.001
		unless ($prob_this_token_is_spam > 0.001);
	$prob_this_token_is_spam = 0.999
		unless ($prob_this_token_is_spam < 0.999);
		
	printf (STDERR "P(\"%s\") = %02.2f;\n", $token, $prob_this_token_is_spam) if ($DEBUG);
	return ($prob_this_token_is_spam);
}

=head2 calculate_total_probability (@probs)

Calculates the average probability. This is probably mathematically
incorrecty, but it works well.

=cut

# from http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
#sub calculate_total_probability_gary_robinson
sub calculate_total_probability
{
	my $count = 0;
	my $pro = 1;
	my $con = 1;

	for (@_)
	{
		$pro *= (1 - $_);
		$con *= $_;
		$count++;
	}
	
	return (0) unless ($count);

	$pro = 1 - $pro ** (1 / $count);
	$con = 1 - $con ** (1 / $count);

	return ((1 + (($pro - $con) / ($pro + $con))) / 2);
}

sub calculate_total_probability_octo
{
	my $sum_prob = 0;
	my $cnt_prob = 0;
	
	for (@_)
	{
		$sum_prob += $_;
		$cnt_prob++;
	}

	return ($sum_prob / ($cnt_prob ? $cnt_prob : 1));
}

sub calculate_total_probability_paul_graham
{
	my $pro = 100;
	my $con = 100;

	for (@_)
	{
		$pro *= $_;
		$con *= (1 - $_);

		if ($pro < 1 and $con < 1)
		{
			$pro *= 100;
			$con *= 100;
		}
	}
	
	return ($pro / ($pro + $con));
}

sub mark_tokens
{
	my $class = shift;
	my @tokens = @_;
	
	my $sql_check  = 'SELECT COUNT(*) FROM tokens WHERE token = ?';
	my $sql_update = "UPDATE tokens SET $class = $class + 1, lastseen = NOW() WHERE token = ?";
	my $sql_insert = "INSERT INTO tokens (token, $class, lastseen) VALUES (?, 1, NOW())";
	
	my $sth_check  = $DBH->prepare ($sql_check)
		or die ('prepare: ' . $DBH->errstr ());
	my $sth_update = $DBH->prepare ($sql_update)
		or die ('prepare: ' . $DBH->errstr ());
	my $sth_insert = $DBH->prepare ($sql_insert)
		or die ('prepare: ' . $DBH->errstr ());

	for (@tokens)
	{
		my $token = $_;
		my $count;
		
		$sth_check->execute ($token)
			or die ('execute: ' . $sth_check->errstr ());
		($count) = $sth_check->fetchrow_array ();

		if ($count)
		{
			$sth_update->execute ($token);
		}
		else
		{
			$sth_insert->execute ($token);
		}
	}

	$sth_check->finish ();
	$sth_update->finish ();
	$sth_insert->finish ();
}

sub change_tokens
{
	my $class = shift;
	my @tokens = @_;
	my $other_class = ($class eq 'good' ? 'spam' : 'good');
	
	my $sql_update = "UPDATE tokens SET $other_class = $other_class - 1 WHERE $other_class > 1 AND token = ?";
	my $sth_update = $DBH->prepare ($sql_update)
		or die ('prepare: ' . $DBH->errstr ());
	
	for (@tokens)
	{
		$sth_update->execute ($_);
	}
	
	$sth_update->finish ();
	
	mark_tokens ($class, @tokens);
}

sub mark_mail
{
	my $class = shift;
	my $message_id = shift;
	my $sth = $DBH->prepare ('INSERT INTO mails (messageid, user, type, date) VALUES (?, ?, ?, NOW())')
		or die ('prepare: ' . $DBH->errstr ());

	my $user = 'unknown';
	if (defined ($ENV{'USER'}))
	{
		$user = $ENV{'USER'};
	}

	$sth->execute ($message_id, $user, $class)
		or die ('execute: ' . $sth->errstr ());

	$sth->finish ();
}

sub change_mail
{
	my $class = shift;
	my $message_id = shift;
	my $sth = $DBH->prepare ("UPDATE mails SET type = ? WHERE messageid = ?")
		or die ('prepare: ' . $DBH->errstr ());

	$sth->execute ($class, $message_id)
		or die ('execute: ' . $sth->errstr ());

	$sth->finish ();
}

=head2 mail_in_db ($message_id)

Returns true if the message ID is in the database, false otherwise.

=cut

sub mail_in_db
{
	my $message_id = shift;
	my $sth;
	my $retval = 0;

	if (defined ($STH{'mail_in_db'}))
	{
		$sth = $STH{'mail_in_db'};
	}
	else
	{
		$sth = $DBH->prepare ('SELECT COUNT(*) FROM mails WHERE messageid = ?')
			or die ('prepare: ' . $DBH->errstr ());
		$STH{'mail_in_db'} = $sth;
	}

	$sth->execute ($message_id)
		or die ('execute: ' . $sth->errstr ());
	
	($retval) = $sth->fetchrow_array ();
	$sth->finish (); # don't fetch more data

	if (!defined ($retval)) { $retval = 0; }
	return ($retval);
}

=head2 get_mail_type ($message_id)

Returns the mail's status or "none" if the mail isn't in the database.

=cut

sub get_mail_type
{
	my $message_id = shift;
	my $type;
	my $sth = $DBH->prepare ('SELECT type FROM mails WHERE messageid = ?')
		or die ('prepare: ' . $DBH->errstr ());

	$sth->execute ($message_id)
		or die ('execute: ' . $sth->errstr ());

	($type) = $sth->fetchrow_array ();
	$sth->finish ();

	if (!defined ($type)) { $type = 'none'; }
	return ($type);
}

=head2 get_message_id ($header)

Extracts the Message-ID header field from the mail's header. If it does
not exist it will die. This sucks when you (try to) use the --filter
option, but I don't feel like doing anything else for now. Your patch
will be most welcome ;)

=cut

sub get_message_id
{
	my $header = shift;
	my $msgid;

	if ($header =~ m/^Message-ID: <([^>]+)>/mi)
	{
		$msgid = $1;
	}
	else
	{
		print STDERR "--- BEGIN HEADER DUMP ---", $/;
		print STDERR $header, $/;
		print STDERR "--- END HEADER DUMP ---", $/;
		die "No Message-ID found";
	}
	
	return ($msgid);
}

=head2 get_most_interesting ([token0, token1, ...], [prop0, prop1, ...]);

Returns two array references with the 15 most interesting (i.e. the
greatest difference from 50.0%) tokens and their propabilities (for being
spam).

=cut

sub get_most_interesting
{
	my @tokens = @{$_[0]};
	my @probs = @{$_[1]};
	my %probs = ();
	my $num = 15; # FIXME
	my @sorted = ();

	while (@tokens && @probs)
	{
		my $t = shift (@tokens);
		my $p = shift (@probs);

		$probs{$t} = $p;
	}

	if (@tokens || @probs)
	{
		die "\@tokens and \@probs not equally long";
	}

	@sorted = sort
	{
		abs (0.50 - $probs{$b})
			<=>
		abs (0.50 - $probs{$a})
	} keys %probs;

	for (; $num && @sorted; $num--)
	{
		my $t = shift (@sorted);

		push (@tokens, $t);
		push (@probs, $probs{$t});
	}

	return (\@tokens, \@probs);
}
	
=head2 get_tokens_from_text (@text)

Extracts "tokens" from the text line(s) given. This is done by breaking
the line at any character, except alphanumeric characters, the dollar
sign, dashes and apostrophes. After that all tokens that are shorter than
five characters or which entirely consist of non-alphanumeric characters
are discarded. By the way: Each token is returned only once and case does
does not matter.

=cut

sub get_tokens_from_text
{
	my $text = join (' ', @_);
	my %tokens = ();

	$text =~ s/[^\w\$\-\']+/ /sg;
	
	for (split (m/\s+/, $text))
	{
		next if (length ($_) < 5);
		next if (length ($_) > 30);
		next if ($_ !~ m/\w/);

		$tokens{lc ($_)}++;
	}

	return (keys %tokens);
}

{
	my $buf = '';
	
	sub read_stdin
	{
		my $text;
		my $mail = '';
		my $firstline = 1;
		
		if ($buf)
		{
			$mail = $buf;
			$buf = '';
		}
		
		while ($text = <STDIN>)
		{
			if ($firstline)
			{
				$mail .= $text;
				$firstline = 0;
			}
			elsif ($text =~ m/^From\s/)
			{
				$buf = $text;
				last;
			}
			else
			{
				$mail .= $text;
			}
		}

		$mail =~ s/\r//;
		my ($head, $content) = split (m/\n\n/s, $mail, 2);

		return ($head, $content);
	}
}

=head2 remove_non_text ($header, $content)

Parses the mail and removes any non-text parts of the mail using it's
Content-Type header. If there is none "plain/text" will be assumed and
the entire mail will be read. Multipart messages are being broken appart
and each part is handled seperately. message/rfc822 are also recognized
and being acted upon.

=cut

sub remove_non_text
{
	my $header = shift;
	my $content = shift;
	my $text = '';

	# remove multi_line_ header fields
	$header =~ s/\n\s+/ /gs;
	if ($header =~ s/^X-Spam-Probability:.+$//m) { $header =~ s/\n\n+/\n/s; }

	if ($header =~ m#^Content-Type:\s+multipart/.+boundary="?([^"]+)#mi)
	{
		my $boundary = quotemeta ($1);
		my @parts = split (m/^-*$boundary.*/m, $content);
		for (@parts) { s/^[\s\n]+//s; s/[\s\n]+$//s; }

		$text = "$header\n\n";
		
		for (grep { m/\n\n/s } @parts)
		{
			my ($header, $content) = split (m/\n\n/s, $_, 2);
			$text .= remove_non_text ($header, $content);
		}
	}
	elsif ($header =~ m#^Content-Type:\s+text/#mi)
	{
		$text = "$header\n$content\n";
	}
	elsif ($header =~ m#^Content-Type:\s+message/rfc822#mi)
	{
		my ($inline_header, $inline_content) =
			split (m/\n\n/s, $content, 2);
		$text = remove_non_text ($inline_header, $inline_content);
	}
	elsif ($header =~ m#^Content-Type:\s+(\S+)#mi)
	{
		print STDERR "Content-Type '$1' not supported", $/
			if ($DEBUG);
	}
	elsif ($header !~ m#^Content-Type:#mi)
	{
		$text = "$header\n$content\n";
	}

	return ($text);
}

=head1 AUTHOR

This program was written by Florian octo Forster (octo at verplant org)
after the lecture of "A Plan For Spam" by Paul Graham.

=cut
