#!/usr/bin/perl -w
# Version 1.108
use strict;
use usleep;
my $usage="usage: check4spam [-dhju] [-ADSpt <#>] [-acins <file>] [-bg <dir>] [-P <cmd>]\n";
my $safecat="/usr/local/bin/safecat";
my $threshold=3;
my $changesubject=0;
my $debug=-t STDERR;
my $dir="";
my $spamdir="";
my $maxspamsize=-1;
my $autospamsize=-1;
my $maxprobsize=-1;
my $maxdnsdemerits=3;
my $sender=""; # Will be set to From: email address
my @lines;
my @badmailer=();
my @probprog=();
my %headers=();
my %badfrom=();
my %badip=();
my %nocheck=();
my %skip=();
parseargs(@ARGV);
die "check4spam: no spamdir specified; aborting\n" unless($spamdir);
my $line;
@lines=keys %nocheck;
%nocheck=();
while(@lines) {
   open(NOCHECK,"<".shift @lines) or die "open: $!";
   while(defined($line=<NOCHECK>)) {
      chomp $line;
      $nocheck{$1}=1 while($line =~ /(\d+\.\d+\.\d+\.(\d+|\*))/g);
   }
   close(NOCHECK);
}
@lines=keys %skip;
%skip=();
while(@lines) {
   open(SKIP,"<".shift @lines) or die "open: $!";
   while(defined($line=<SKIP>)) {
      chomp $line;
      $skip{$1}=1 while($line =~ /(\d+\.\d+\.\d+\.(\d+|\*))/g);
   }
   close(SKIP);
}
@lines=keys %badfrom;
%badfrom=();
while(@lines) {
   open(BADFROM,"<".shift @lines) or die "open: $!";
   while(defined($line=<BADFROM>)) {
      chomp $line;
      $badfrom{lc $1}=(defined($2)?$2:$1) if($line =~ /^(\S+)(?:\s+(.*))?$/s);
   }
   close(BADFROM);
}
@lines=keys %badip;
%badip=();
while(@lines) {
   open(BADIP,"<".shift @lines) or die "open: $!";
   while(defined($line=<BADIP>)) {
      chomp $line;
      $badip{$1}=1 while($line =~ /(\d+\.\d+\.\d+\.(\d+|\*))/g);
   }
   close(BADIP);
}
@lines=@badmailer;
@badmailer=();
while(@lines) {
   open(BADMAILER,"<".shift @lines) or die "open: $!";
   while(defined($line=<BADMAILER>)) {
      chomp $line;
      push @badmailer,$1 if($line =~ /^\s*(.*)\s*$/s);
   }
   close(BADMAILER);
}
my ($ip,$classc,$revip,$curlinelen,@tmp);
my $subjnum=-1;
my $endhead=-1;
my $foundcleanone=0;
my $demerits=0;
my $dnsdemerits=0;
my $maxlinelen_notms=0;
my $maxlinelen=0;
my $emailsize=-1;
my $bodyread=0;
my $firsthost=1;
my $header=":";
my @heads=();
my %notes=();
my %ipschecked=();
@tmp=stat(STDIN);
$emailsize=$tmp[7] if(@tmp);
while(defined($line=<STDIN>)) {
   push @lines,$line;
   $line =~ s/\r?\n$//s;
   if(not length $line) {
      $endhead=$#lines;
      last;
   }
   if($line =~ /^(\S+):\s*(.*)$/s) {
		$header=lc($1);
      if(exists $headers{lc $1}) {
         $headers{$header}.=" $2";
      } else {
         $headers{$header}=$2;
      }
   } elsif($line =~ /^\S/s) {
      $endhead=$#lines;
      push @lines,"\n",pop @lines;
      last;
   }
   $curlinelen=0 unless($line =~ /^\s/s);
   $curlinelen+=length $line;
   unless($line =~ /^DKIM-Signature:/is) {
      $maxlinelen=$curlinelen if($maxlinelen<$curlinelen);
      $maxlinelen_notms=$curlinelen if($maxlinelen_notms<$curlinelen and "x-microsoft-exchange-diagnostics" ne $header and "x-ymail-osg" ne $header);
   }
   $subjnum=$#lines if($subjnum==-1 and "subject" eq $header);
# We used to do a generic IP-anywhere-in-the-line search:
#   if($line =~ /^Received: .*?\b(\d+)\.(\d+)\.(\d+)\.(\d+)\b/i) {
# But it was fooled so now we only look for some known formats:
   if($line =~ /^Received: .*?[@(](\d+)\.(\d+)\.(\d+)\.(\d+)\)$/i
   or $line =~ /^Received: .*?\(\S+ \[(\d+)\.(\d+)\.(\d+)\.(\d+)\]\)$/i
   or $line =~ /^Received: from \S* \(\[(\d+)\.(\d+)\.(\d+)\.(\d+)\]/i
   or $line =~ /^Received: from.*?\(\[(\d+)\.(\d+)\.(\d+)\.(\d+)\]/i
   or $line =~ /^Received: from \[(\d+)\.(\d+)\.(\d+)\.(\d+)\] \(helo=.*\)$/i
# And if all those fail we try a limited version of the original:
   or $line =~ /^Received: .*?\b(\d+)\.(\d+)\.(\d+)\.(\d+)(?:$|[^0-9.])/i) {
      next unless($1<256 and $2<256 and $3<256 and $4<256 and $1 != 10 and ($1 != 192 or $2 != 168) and ($1 != 172 or $2<16 or $2>31));
      # Some mail servers have a version number that looks like an IP address:
      next if(($1 and "0" eq substr($1,0,1)) or ($2 and "0" eq substr($2,0,1)) or ($3 and "0" eq substr($3,0,1)) or ($4 and "0" eq substr($4,0,1)));
      $ip="$1.$2.$3.$4";
      next if($ipschecked{$ip});
      $ipschecked{$ip}=1;
      $classc="$1.$2.$3.*";
      $revip="$4.$3.$2.$1";
      next if($skip{$ip} or $skip{$classc});
      if($nocheck{$ip} or $nocheck{$classc}) {
         $foundcleanone=1;
      } else {
         my $notes="X-SpamCheck-$ip:";
         $line=&checkrevdns($ip);
         unless($line) {
            $notes.=" DNS";
            if($foundcleanone) {
               $notes.="(ignored)";
               $dnsdemerits=1 if($dnsdemerits<1);
            } else {
               $notes{"{DNS} "}=1;
               $dnsdemerits+=2;
            }
         }
         $notes{"{badip} "}=1,$notes.=" badip",$dnsdemerits+=3 if($badip{$ip} or $badip{$classc});
#         $line=&checkrevip("$revip.dialups.mail-abuse.org."); # a.k.a. dul.maps.vix.com.
#         if($line) {
#            $notes.=" DUL";
#            if($foundcleanone) {
#               $notes.="(ignored)";
#               $dnsdemerits=1 if($dnsdemerits<1);
#            } else {
#               $notes{"{DUL} "}=1;
#               $dnsdemerits+=2;
#            }
#         }
#         $line=&checkrevip("$revip.relays.mail-abuse.org.");
#         $notes{"{RSS} "}=1,$notes.=" RSS",$dnsdemerits+=3 if($line);
#         $line=&checkrevip("$revip.blackholes.mail-abuse.org.");
#         $notes{"{BHMA} "}=1,$notes.=" BHMA",$dnsdemerits+=3 if($line);
         $line=&checkrevip("$revip.bl.spamcop.net.");
         $notes{"{SC} "}=1,$notes.=" SC",$dnsdemerits+=3 if($line);
         if($firsthost) {
            $line=&checkrevip("$revip.zen.spamhaus.org.");
            if($line) {
               $line =~ s/^127\.0\.0\.(\d+)$/$1/s;
               $notes{"{ZSH} "}=1,$notes.=" ZSH($line)";
               $demerits+=(10<=$line and $line<=11)?2:3;
            }
         }
         $line=&checkrevip("$revip.blackholes.easynet.nl.");
         $notes{"{BEN} "}=1,$notes.=" BEN",$dnsdemerits+=3 if($line);
         $line=&checkrevip("$revip.dynablock.easynet.nl.");
         if($line) {
            $notes.=" DEN";
            if($foundcleanone) {
               $notes.="(ignored)";
               $dnsdemerits=1 if($dnsdemerits<1);
            } else {
               $notes{"{DEN} "}=1;
               $dnsdemerits+=2;
            }
         }
         $line=&checkrevip("$revip.spam.pedantic.org.");
         $notes{"{SPO} "}=1,$notes.=" SPO",$dnsdemerits+=3 if($line);
#         $line=&checkrevip("$revip.netblock.pedantic.org.");
#			if($line) {
#            $notes.=" NPO";
#            if($foundcleanone) {
#               $notes.="(ignored)";
#               $dnsdemerits=1 if($dnsdemerits<1);
#            } else {
#               $notes{"{NPO} "}=1;
#               $dnsdemerits+=2;
#            }
#         }
         $foundcleanone=1 if(substr($notes,-1) eq ":");
         $notes.="\n";
         push @heads,$notes;
      }
      $firsthost=0;
   } elsif($line =~ /^(?:To|From):[ \t]+.*?<([^<>]+)>/i) {
      $sender=$1 if(-1<index($1,'@') and "F" eq uc substr($line,0,1));
      if($badfrom{lc $1}) {
         push @heads,"X-SpamCheck-$1: $badfrom{lc $1}\n";
         $notes{"{badfrom} "}=1;
         $demerits+=3;
      }
      if(20<($1 =~ /=/g)) {
         push @heads,"X-SpamCheck-From: Too many equals\n";
         $notes{"{tme} "}=1;
         $demerits+=3;
      }
   } elsif($line =~ /^(?:To|From):[ \t]+([^\s"<>()]+)$/i) {
      $sender=$1 if(-1<index($1,'@') and "F" eq uc substr($line,0,1));
      if($badfrom{lc $1}) {
         push @heads,"X-SpamCheck-$1: $badfrom{lc $1}\n";
         $notes{"{badfrom} "}=1;
         $demerits+=3;
      }
      if(20<($1 =~ /=/g)) {
         push @heads,"X-SpamCheck-From: Too many equals\n";
         $notes{"{tme} "}=1;
         $demerits+=3;
      }
   } elsif($line =~ /^From:[ \t]+.*?<>/i) {
      $sender="";
      push @heads,"X-SpamCheck-from: blank from\n";
      $notes{"{badfrom} "}=1;
      $demerits+=3;
   } elsif($line =~ /^Content-Type:[ \t]+text\/html/i) {
      push @heads,"X-SpamCheck-Content-Type: HTML only\n";
      $notes{"{HTML} "}=1;
      $demerits+=2;
   } elsif($line =~ /^Content-Type:.*charset ?="?(iso-2022-jp|Big5|euc-jp)/i) {
      push @heads,"X-SpamCheck-Content-Type: Japanese\n";
      $notes{"{CT=JP} "}=1;
      $demerits+=3;
   } elsif($line =~ /^Content-Type:.*charset ?="?(koi8-r)/i) {
      push @heads,"X-SpamCheck-Content-Type: Russian\n";
      $notes{"{CT=KR} "}=1;
      $demerits+=3;
   } elsif($line =~ /^Content-Type:.*charset ?="?(gb2312|ks_c_5601-1987)/i) {
      push @heads,"X-SpamCheck-Content-Type: charset\n";
      $notes{"{CT=CS} "}=1;
      $demerits+=3;
   } elsif($line =~ /^Content-Type:.*charset ?="?(windows-1251)/i) {
      push @heads,"X-SpamCheck-Content-Type: Windows\n";
      $notes{"{CT=W} "}=1;
      $demerits+=2;
   } elsif($line =~ /^Content-Transfer-Encoding:.*(base64)/i) {
      push @heads,"X-SpamCheck-Content-Transfer-Encoding: $1\n";
      $notes{"{CTE} "}=1;
      $demerits+=1;
   } elsif($line =~ /^X-Mailer:\s+(.*)$/i) {
      $line=$1;
      foreach my $tmp (@badmailer) {
         if($line =~ /$tmp/is) {
            push @heads,"X-SpamCheck-X-Mailer: $line\n";
            $notes{"{X-Mailer} "}=1;
            $demerits+=3;
         }
      }
   } elsif($line =~ /^X-Converted-To-Plain-Text:/i and not $notes{"{XCTPT} "}) {
      push @heads,"X-SpamCheck-XCTPT: XCTPT\n";
      $notes{"{XCTPT} "}=1;
      $demerits+=1;
   } elsif($line =~ /^X-Bulkmail:/i and not $notes{"{XBM} "}) {
      push @heads,"X-SpamCheck-XBM: XBM\n";
      $notes{"{XBM} "}=1;
      $demerits+=3;
   }
}
exit(99) if(-1==$endhead);
if($maxlinelen_notms>1536) {
   push @heads,"X-SpamCheck-Line-Too-Long: $maxlinelen_notms\n";
   $notes{"{X-LTL} "}=1;
   $demerits+=2;
} elsif($maxlinelen>1536) {
   push @heads,"X-SpamCheck-Line-Too-Long: $maxlinelen (really $maxlinelen_notms)\n";
   $notes{"{X-LTL} "}=1;
   $demerits+=0;
}
unless($headers{date}) {
   push @heads,"X-SpamCheck-nodate: nodate\n";
   $notes{"{nodate} "}=1;
   $demerits+=2;
}
if(-1==$emailsize) {
   push @lines,<STDIN>;
   $bodyread=1;
   $emailsize=0;
   map {$emailsize+=length} @lines;
}
$dnsdemerits=$maxdnsdemerits if($dnsdemerits>$maxdnsdemerits);
$demerits+=$dnsdemerits;
if(-1 != $autospamsize and -1 != $emailsize and $autospamsize<$emailsize) {
   $demerits+=$threshold;
}
if(@probprog and $demerits<$threshold and (-1==$maxprobsize or $emailsize<=$maxprobsize)) {
   use FileHandle;
   use IPC::Open2;
   my $pid=open2(\*READER,\*WRITER,@probprog);
   WRITER->autoflush();
   unless($bodyread) {
      push @lines,<STDIN>;
      $bodyread=1;
   }
   $SIG{PIPE}="IGNORE";
   print WRITER @lines;
   close(WRITER);
   my $prob=<READER>;
   $prob=int(100*$prob);
   push @heads,"X-SpamProbability: $prob\%\n";
   close(READER);
   if($prob==100) {
      $demerits+=1;
   }
   $demerits+=2 if($prob>97);
   $demerits-=1 if($prob<25);
   $demerits-=1 if($prob<3);
}
$line=join('',sort keys %notes);
if($demerits>=$threshold) {
   ($dir,$spamdir)=($spamdir,$dir);
   push @heads,"X-IsSpam: $demerits\n";
   exit 99 if(-1 != $maxspamsize and $maxspamsize<$emailsize);
} elsif(not $dir) {
   exit 0;
}
if($changesubject) {
   if($subjnum>-1) {
      $lines[$subjnum] =~ s/^(Subject: )/$1$line/i;
   } else {
      push @heads,"Subject: $line<no subject>\n";
   }
}
splice @lines,$endhead,0,@heads;
$endhead+=scalar(@heads);
unless($bodyread) {
   push @lines,<STDIN>;
   $bodyread=1;
}
if($debug or $dir eq "-") {
   print @lines;
} else {
   open(SAFECAT,"|$safecat '$dir/tmp' '$dir/new'") or exit 111;
   print SAFECAT @lines or exit 100;
   close(SAFECAT); # No exit here; we catch exit code later
   if($spamdir or $?>>8) {
      exit($?>>8);
   }
   exit(99);
}

sub checkrevip {
   my $revip=shift;
   print STDERR "Checking $revip..." if($debug);
   my $addr=gethostbyname($revip);
   if(defined $addr) {
      $addr=join('.',unpack("C".length($addr),$addr));
   } else {
      $addr="";
   }
   print STDERR "$addr\n" if($debug);
   return $addr;
}

sub checkrevdns {
   use Socket;
   my $addr=shift;
   my $ip=pack("C4",split /\./,$addr);
   print STDERR "Checking $addr..." if($debug);
   my $name=gethostbyaddr($ip,AF_INET);
   $name="" unless(defined $name);
	$name="" unless($name =~ /\./s); # 204.188.196.44 returns "No-RDNS-Record"
   print STDERR "$name\n" if($debug);
   return $name;
}

sub parseargs {
   my @args=@_;
   while(@args and ($args[0] =~ /^-(([ADPSabcgimnpst])|[dhj])$/s) and @args>($2?1:0)) {
      my $tmp=$1;
      shift @args;
      if($tmp eq "A") {
         $autospamsize=0+shift @args;
      } elsif($tmp eq "D") {
         $maxdnsdemerits=0+shift @args;
      } elsif($tmp eq "P") {
         @probprog=@args;
         @args=();
      } elsif($tmp eq "S") {
         $maxspamsize=0+shift @args;
      } elsif($tmp eq "a") {
         $badfrom{shift @args}=1;
      } elsif($tmp eq "b") {
         die $usage if($spamdir);
         $spamdir=shift @args;
      } elsif($tmp eq "c") {
         open(CONFIG,"<".shift(@args)) or die "open: $!";
         my @newargs=grep !/^#/,<CONFIG>;
         close(CONFIG);
         chomp(@newargs);
         parseargs(@newargs);
      } elsif($tmp eq "d") {
         $debug=not $debug;
      } elsif($tmp eq "g") {
         die $usage if($dir);
         $dir=shift @args;
      } elsif($tmp eq "h") {
         showusage();
         exit 0;
      } elsif($tmp eq "i") {
         $badip{shift @args}=1;
      } elsif($tmp eq "j") {
         $changesubject=not $changesubject;
      } elsif($tmp eq "m") {
         push @badmailer,shift @args;
      } elsif($tmp eq "n") {
         $nocheck{shift @args}=1;
      } elsif($tmp eq "p") {
         die $usage unless($args[0] =~ /^\d+$/s);
         $maxprobsize=0+shift @args;
      } elsif($tmp eq "s") {
         $skip{shift @args}=1;
      } elsif($tmp eq "t") {
         die $usage unless($args[0] =~ /^\d+$/s);
         $threshold=0+shift @args;
      } else {
         die $usage;
      }
   }
   shift @args if(@args and $args[0] eq "--");
   die $usage if(@args);
}

sub showusage {
   print $usage,<<"EOD";
	-A <#>     Automatically mark as spam any emails of at least <#> bytes
	-D <#>     Maximum number of demerits allowed due to DNS/RBL lookups
	-P <args>  Run <args> as spam-probability filter (must be last args)
	-S <#>     Throw away emails categorized as spam at least <#> bytes
	-a <file>  File containing list of bad "From:"s
	-b <dir>   Directory in which to store mail categorized as spam
	-c <file>  Break up file by lines and insert into current command line
	-d         Toggle debug mode
	-g <dir>   Directory in which to store mail not categorized as spam
	-h         Show this help message
	-i <file>  File containing list of bad IPs
	-j         Prefix subject with spam categorization(s)
	-m <file>  File containing list of bad X-Mailers
	-n <file>  File containing list of IPs to ignore (intermediate MXs)
	-p <#>     Don't run spam-probability filter on emails >= <#> bytes
	-s <file>  File containing list of IPs to not check (known good IPs)
	-t <#>     Set demerit threshold to <#>
EOD
}
