modules/exim/files/common/exim_surbl.pl

   1 #
   2 ## THIS FILE IS UNDER PUPPET CONTROL. DON'T EDIT IT HERE.
   3 ## USE: git clone git+ssh://$USER@puppet.debian.org/srv/puppet.debian.org/git/dsa-puppet.git
   4 ##
   5 #
   6 # Copyright (c) 2006-2007 Erik Mugele.  All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions
  10 # are met:
  11 # 1. Redistributions of source code must retain the above copyright
  12 #    notice, this list of conditions and the following disclaimer.
  13 # 2. Redistributions in binary form must reproduce the above copyright
  14 #    notice, this list of conditions and the following disclaimer in the
  15 #    documentation and/or other materials provided with the distribution.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27 #
  28 # NOTES
  29 # -----
  30 #
  31 # 1. This script makes use of the Country Code Top Level
  32 # Domains (ccTLD) provided by the SURBL group at
  33 # http://spamcheck.freeapp.net/two-level-tlds
  34 # THE VARIABLE $cctld_file MUST BE SET TO THE FULL PATH AND
  35 # NAME OF THE FILE CONTAINING THE CCTLD LIST!  (see below)
  36 #
  37 # 2. This script makes use of whitelisting of popular domains.  The
  38 # source of the list can be found here:
  39 # http://spamassassin.apache.org/full/3.1.x/dist/rules/25_uribl.cf
  40 # These are domains that are whitelisted by the SURBL group so it
  41 # doesn't make sense to waste resources doing lookups on them.
  42 # THE VARIABLE $whitelist_file MUST BE SET TO THE FULL PATH AND
  43 # NAME OF THE FILE CONTAINING THE WHITE LIST!  (see below)
  44 #
  45 # 3. Per the guidelines at http://www.surbl.org, if your site processes
  46 # more than 100,000 messages per day, you should NOT be using the
  47 # public SURBL name servers but should be rsync-ing from them and
  48 # running your own.  See http://www3.surbl.org/rsync-signup.html
  49 #
  50 sub surblspamcheck
  51 {
  52 # Designed and written by Erik Mugele, 2004-2006
  53 # http://www.teuton.org/~ejm
  54 # Version 2.0
  55
  56     # The following variable is the full path to the file containing the
  57     # list of Country Code Top Level Domains (ccTLD).
  58     # ---------------------------------------------------------------------
  59     # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
  60     # CONTAINING THE CCTLD LIST!
  61     # ---------------------------------------------------------------------
  62     my $cctld_file = "/etc/exim4/ccTLD.txt";
  63
  64     # The following variable is the full path to the file containing
  65     # whitelist entries.
  66     # ---------------------------------------------------------------------
  67     # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
  68     # CONTAINING THE WHITELIST DOMAINS!
  69     # ---------------------------------------------------------------------
  70     my $whitelist_file = "/etc/exim4/surbl_whitelist.txt";
  71
  72     # This variable defines the maximum MIME file size that will be checked
  73     # if this script is called by the MIME ACL.  This is primarily to
  74     # keep the load down on the server.  Size is in bytes.
  75     my $max_file_size = 50000;
  76
  77     # The following two variables enable or disable the SURBL and URIBL
  78     # lookups.  Set to 1 to enable and 0 to disable.
  79     my $surbl_enable = 1;
  80     my $uribl_enable = 1;
  81
  82     # Check to see if a decode MIME attachment is being checked or
  83     # just a plain old text message with no attachments
  84     my $exim_body = "";
  85     my $mime_filename = Exim::expand_string('$mime_decoded_filename');
  86     if ($mime_filename) {
  87         # DEBUG Statement
  88         #warn ("MIME FILENAME: $mime_filename\n");
  89         # If the MIME file is too large, skip it.
  90         if (-s $mime_filename <= $max_file_size) {
  91             open(fh,"<$mime_filename");
  92             binmode(fh);
  93             while (read(fh,$buff,1024)) {
  94                 $exim_body .= $buff;
  95             }
  96             close (fh);
  97         } else {
  98             $exim_body = "";
  99         }
 100     } else {
 101         $exim_body = Exim::expand_string('$message_body');
 102     }
 103
 104     sub surbllookup {
 105         # This subroutine does the actual DNS lookup and builds and returns
 106         # the return message for the SURBL lookup.
 107         my @params = @_;
 108         my $surbldomain = ".multi.surbl.org";
 109         @dnsbladdr=gethostbyname($params[0].$surbldomain);
 110         # If gethostbyname() returned anything, build a return message.
 111         $return_string = "";
 112         if (scalar(@dnsbladdr) != 0) {
 113             $return_string = "Blacklisted URL in message. (".$params[0].") in";
 114             @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
 115             if ($surblipaddr[3] & 64) {
 116                 $return_string .= " [jp]";
 117             }
 118             if ($surblipaddr[3] & 32) {
 119                 $return_string .= " [ab]";
 120             }
 121             if ($surblipaddr[3] & 16) {
 122                 $return_string .= " [ob]";
 123             }
 124             if ($surblipaddr[3] & 8) {
 125                 $return_string .= " [ph]";
 126             }
 127             if ($surblipaddr[3] & 4) {
 128                 $return_string .= " [ws]";
 129             }
 130             if ($surblipaddr[3] & 2) {
 131                 $return_string .= " [sc]";
 132             }
 133             $return_string .= ". See http://www.surbl.org/lists.html.";
 134         }
 135         return $return_string;
 136     }
 137
 138     sub uribllookup {
 139         # This subroutine does the actual DNS lookup and builds and returns
 140         # the return message for the URIBL check.
 141         my @params = @_;
 142         my $surbldomain = ".black.uribl.com";
 143         @dnsbladdr=gethostbyname($params[0].$surbldomain);
 144         # If gethostbyname() returned anything, build a return message.
 145         $return_string = "";
 146         if (scalar(@dnsbladdr) != 0) {
 147             $return_string = "Blacklisted URL in message. (".$params[0].") in";
 148             @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
 149             if ($surblipaddr[3] & 8) {
 150                 $return_string .= " [red]";
 151             }
 152             if ($surblipaddr[3] & 4) {
 153                 $return_string .= " [grey]";
 154             }
 155             if ($surblipaddr[3] & 2) {
 156                 $return_string .= " [black]";
 157             }
 158             $return_string .= ". See http://lookup.uribl.com.";
 159         }
 160         return $return_string;
 161     }
 162
 163     sub converthex {
 164         # This subroutin converts two hex characters to an ASCII character.
 165         # It is called when ASCII obfuscation or Printed-Quatable characters
 166         # are found (i.e. %AE or =AE).
 167         # It should return a converted/plain address after splitting off
 168         # everything that isn't part of the address portion of the URL.
 169         my @ob_parts = @_;
 170         my $address = $ob_parts[0];
 171         for (my $j=1; $j < scalar(@ob_parts); $j++) {
 172             $address .= chr(hex(substr($ob_parts[$j],0,2)));
 173             $address .= substr($ob_parts[$j],2,);
 174         }
 175         $address = (split(/[^A-Za-z0-9._\-]/,$address))[0];
 176         return $address
 177     }
 178
 179     ################
 180     # Main Program #
 181     ################
 182
 183     if ($exim_body) {
 184         # Find all the URLs in the message by finding the HTTP string
 185         @parts = split /[hH][tT][tT][pP]:\/\//,$exim_body;
 186         if (scalar(@parts) > 1) {
 187             # Read the entries from the ccTLD file.
 188             open (cctld_handle,$cctld_file) or die "Can't open $cctld_file.\n";
 189             while (<cctld_handle>) {
 190                 next if (/^#/ || /^$/ || /^\s$/);
 191                 push(@cctlds,$_);
 192             }
 193             close (cctld_handle) or die "Close: $!\n";
 194             # Read the entries from the whitelist file.
 195             open (whitelist_handle,$whitelist_file) or die "Can't open $whitelist_file.\n";
 196             while (<whitelist_handle>) {
 197                 next if (/^#/ || /^$/ || /^\s$/);
 198                 push(@whitelist,$_);
 199             }
 200             close (whitelist_handle) or die "Close: $!\n";
 201             # Go through each of the HTTP parts that were found in the message
 202             for ($i=1; $i < scalar(@parts); $i++) {
 203                 # Special case of Quoted Printable EOL marker
 204                 $parts[$i] =~ s/=\n//g;
 205                     # Split the parts and find the address portion of the URL.
 206                 # Address SHOULD be either a FQDN, IP address, or encoded address.
 207                 $address = (split(/[^A-Za-z0-9\._\-%=]/,$parts[$i]))[0];
 208                 # Check for an =.  If it exists, we assume the URL is doing
 209                 # Quoted-Printable.  Decode it and redine $address
 210                 if ($address =~ /=/) {
 211                     @ob_parts = split /=/,$address;
 212                     $address = converthex(@ob_parts);
 213                 }
 214                 # Check for a %.  If it exists the URL is using % ASCII
 215                 # obfuscation.  Decode it and redefine $address.
 216                 if ($address =~ /%/) {
 217                     @ob_parts = split /%/,$address;
 218                     $address = converthex(@ob_parts);
 219                 }
 220                 # Split the the address into the elements separated by periods.
 221                 @domain = split /\./,$address;
 222                 # Check the length of the domain name.  If less then two elements
 223                 # at this point it is probably bogus or there is a bug in one of
 224                 # the decoding/converting routines above.
 225                 if (scalar(@domain) >= 2) {
 226                     $return_result="";
 227                     # By default, assume that the domain check is on a
 228                     # "standard" two level domain
 229                     $spamcheckdomain=$domain[-2].".".$domain[-1];
 230                     # Check for a two level domain
 231                     if (((scalar(@domain) == 2) || (scalar(@domain) >= 5))  &&
 232                         (grep(/^$spamcheckdomain$/i,@cctlds))) {
 233                         $return_result="cctld";
 234                     }
 235                     # Check for a three level domain
 236                     if (scalar(@domain) == 3) {
 237                         if (grep(/^$spamcheckdomain$/i,@cctlds)) {
 238                             $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
 239                             if (grep(/^$spamcheckdomain$/,@cctlds)) {
 240                                 $return_result="cctld";
 241                             }
 242                         }
 243                     }
 244                     # Check for a four level domain
 245                     if (scalar(@domain) == 4) {
 246                         # Check to see if the domain is an IP address
 247                         if ($domain[-1] =~ /[a-zA-Z]/) {
 248                             if (grep(/^$spamcheckdomain$/i,@cctlds)) {
 249                                 $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
 250                                 if (grep(/^$spamcheckdomain$/i,@cctlds)) {
 251                                     $spamcheckdomain=$domain[-4].".".$spamcheckdomain;
 252                                 }
 253                             }
 254                         } else {
 255                             # Domain is an IP address
 256                             $spamcheckdomain=$domain[3].".".$domain[2].
 257                                 ".".$domain[1].".".$domain[0];
 258                         }
 259                     }
 260                     # DEBUG statement
 261                     #warn ("FOUND DOMAIN ($mime_filename): $spamcheckdomain\n");
 262                     # If whitelisting is enabled check domain against the
 263                     # whitelist.
 264                     if ($whitelist_file ne "") {
 265                         foreach $whitelist_entry (@whitelist) {
 266                             chomp($whitelist_entry);
 267                             if ($spamcheckdomain =~ m/^$whitelist_entry$/i) {
 268                                 $return_result="whitelisted";
 269                                 last;
 270                             }
 271                         }
 272                     }
 273                     # If the domain is whitelisted or in the cctld skip adding
 274                     # it to the lookup list.
 275                     if ($return_result eq "") {
 276                         if (scalar(@lookupdomains) > 0) {
 277                             # Check so see if the domain already is in the list.
 278                             if (not grep(/^$spamcheckdomain$/i,@lookupdomains)) {
 279                                     push(@lookupdomains,$spamcheckdomain);
 280                             }
 281                         } else {
 282                             push(@lookupdomains,$spamcheckdomain);
 283                         }
 284                     }
 285                 }
 286             }
 287             # If there are items in the lookupdomains list then
 288             # perform lookups on them.  If there are not, something is wrong
 289             # and just return false.  There should always be something in the list.
 290             if (scalar(@lookupdomains) > 0) {
 291                 foreach $i (@lookupdomains) {
 292                     # DEBUG statement.
 293                     #warn ("CHECKING DOMAIN ($mime_filename): $i\n");
 294                     # If SURBL lookups are enabled do an SURBL lookup
 295                     if ($surbl_enable == 1) {
 296                         $return_result = surbllookup($i);
 297                     }
 298                     # If URIBL lookups are enabled and the SURBL lookup failed
 299                     # do a URIBL lookup
 300                     if (($uribl_enable == 1) && ($return_result eq "")) {
 301                         $return_result = uribllookup($i);
 302                     }
 303                     # If we got a hit return the result to Exim
 304                     if ($return_result ne "") {
 305                         undef @cctlds;
 306                         undef @whitelist;
 307                         return $return_result;
 308                     }
 309                 }
 310             }
 311         }
 312     }
 313     # We didn't find any URLs or the URLs we did find were not
 314     # listed so return false.
 315     undef @cctlds;
 316     undef @whitelist;
 317     return false;
 318 }
 319