2 # Copyright (c) 2006-2007 Erik Mugele. All rights reserved.
4 # Redistribution and use in source and binary forms, with or without
5 # modification, are permitted provided that the following conditions
7 # 1. Redistributions of source code must retain the above copyright
8 # notice, this list of conditions and the following disclaimer.
9 # 2. Redistributions in binary form must reproduce the above copyright
10 # notice, this list of conditions and the following disclaimer in the
11 # documentation and/or other materials provided with the distribution.
13 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
14 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
15 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
16 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
17 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
18 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
19 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
20 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
22 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 # 1. This script makes use of the Country Code Top Level
28 # Domains (ccTLD) provided by the SURBL group at
29 # http://spamcheck.freeapp.net/two-level-tlds
30 # THE VARIABLE $cctld_file MUST BE SET TO THE FULL PATH AND
31 # NAME OF THE FILE CONTAINING THE CCTLD LIST! (see below)
33 # 2. This script makes use of whitelisting of popular domains. The
34 # source of the list can be found here:
35 # http://spamassassin.apache.org/full/3.1.x/dist/rules/25_uribl.cf
36 # These are domains that are whitelisted by the SURBL group so it
37 # doesn't make sense to waste resources doing lookups on them.
38 # THE VARIABLE $whitelist_file MUST BE SET TO THE FULL PATH AND
39 # NAME OF THE FILE CONTAINING THE WHITE LIST! (see below)
41 # 3. Per the guidelines at http://www.surbl.org, if your site processes
42 # more than 100,000 messages per day, you should NOT be using the
43 # public SURBL name servers but should be rsync-ing from them and
44 # running your own. See http://www3.surbl.org/rsync-signup.html
48 # Designed and written by Erik Mugele, 2004-2006
49 # http://www.teuton.org/~ejm
52 # The following variable is the full path to the file containing the
53 # list of Country Code Top Level Domains (ccTLD).
54 # ---------------------------------------------------------------------
55 # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
56 # CONTAINING THE CCTLD LIST!
57 # ---------------------------------------------------------------------
58 my $cctld_file = "/etc/exim4/ccTLD.txt";
60 # The following variable is the full path to the file containing
62 # ---------------------------------------------------------------------
63 # THIS VARIABLE MUST BE SET TO THE FULL PATH AND NAME OF THE FILE
64 # CONTAINING THE WHITELIST DOMAINS!
65 # ---------------------------------------------------------------------
66 my $whitelist_file = "/etc/exim4/surbl_whitelist.txt";
68 # This variable defines the maximum MIME file size that will be checked
69 # if this script is called by the MIME ACL. This is primarily to
70 # keep the load down on the server. Size is in bytes.
71 my $max_file_size = 50000;
73 # The following two variables enable or disable the SURBL and URIBL
74 # lookups. Set to 1 to enable and 0 to disable.
78 # Check to see if a decode MIME attachment is being checked or
79 # just a plain old text message with no attachments
81 my $mime_filename = Exim::expand_string('$mime_decoded_filename');
84 #warn ("MIME FILENAME: $mime_filename\n");
85 # If the MIME file is too large, skip it.
86 if (-s $mime_filename <= $max_file_size) {
87 open(fh,"<$mime_filename");
89 while (read(fh,$buff,1024)) {
97 $exim_body = Exim::expand_string('$message_body');
101 # This subroutine does the actual DNS lookup and builds and returns
102 # the return message for the SURBL lookup.
104 my $surbldomain = ".multi.surbl.org";
105 @dnsbladdr=gethostbyname($params[0].$surbldomain);
106 # If gethostbyname() returned anything, build a return message.
108 if (scalar(@dnsbladdr) != 0) {
109 $return_string = "Blacklisted URL in message. (".$params[0].") in";
110 @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
111 if ($surblipaddr[3] & 64) {
112 $return_string .= " [jp]";
114 if ($surblipaddr[3] & 32) {
115 $return_string .= " [ab]";
117 if ($surblipaddr[3] & 16) {
118 $return_string .= " [ob]";
120 if ($surblipaddr[3] & 8) {
121 $return_string .= " [ph]";
123 if ($surblipaddr[3] & 4) {
124 $return_string .= " [ws]";
126 if ($surblipaddr[3] & 2) {
127 $return_string .= " [sc]";
129 $return_string .= ". See http://www.surbl.org/lists.html.";
131 return $return_string;
135 # This subroutine does the actual DNS lookup and builds and returns
136 # the return message for the URIBL check.
138 my $surbldomain = ".black.uribl.com";
139 @dnsbladdr=gethostbyname($params[0].$surbldomain);
140 # If gethostbyname() returned anything, build a return message.
142 if (scalar(@dnsbladdr) != 0) {
143 $return_string = "Blacklisted URL in message. (".$params[0].") in";
144 @surblipaddr = unpack('C4',($dnsbladdr[4])[0]);
145 if ($surblipaddr[3] & 8) {
146 $return_string .= " [red]";
148 if ($surblipaddr[3] & 4) {
149 $return_string .= " [grey]";
151 if ($surblipaddr[3] & 2) {
152 $return_string .= " [black]";
154 $return_string .= ". See http://lookup.uribl.com.";
156 return $return_string;
160 # This subroutin converts two hex characters to an ASCII character.
161 # It is called when ASCII obfuscation or Printed-Quatable characters
162 # are found (i.e. %AE or =AE).
163 # It should return a converted/plain address after splitting off
164 # everything that isn't part of the address portion of the URL.
166 my $address = $ob_parts[0];
167 for (my $j=1; $j < scalar(@ob_parts); $j++) {
168 $address .= chr(hex(substr($ob_parts[$j],0,2)));
169 $address .= substr($ob_parts[$j],2,);
171 $address = (split(/[^A-Za-z0-9._\-]/,$address))[0];
180 # Find all the URLs in the message by finding the HTTP string
181 @parts = split /[hH][tT][tT][pP]:\/\//,$exim_body;
182 if (scalar(@parts) > 1) {
183 # Read the entries from the ccTLD file.
184 open (cctld_handle,$cctld_file) or die "Can't open $cctld_file.\n";
185 while (<cctld_handle>) {
186 next if (/^#/ || /^$/ || /^\s$/);
189 close (cctld_handle) or die "Close: $!\n";
190 # Read the entries from the whitelist file.
191 open (whitelist_handle,$whitelist_file) or die "Can't open $whitelist_file.\n";
192 while (<whitelist_handle>) {
193 next if (/^#/ || /^$/ || /^\s$/);
196 close (whitelist_handle) or die "Close: $!\n";
197 # Go through each of the HTTP parts that were found in the message
198 for ($i=1; $i < scalar(@parts); $i++) {
199 # Special case of Quoted Printable EOL marker
200 $parts[$i] =~ s/=\n//g;
201 # Split the parts and find the address portion of the URL.
202 # Address SHOULD be either a FQDN, IP address, or encoded address.
203 $address = (split(/[^A-Za-z0-9\._\-%=]/,$parts[$i]))[0];
204 # Check for an =. If it exists, we assume the URL is doing
205 # Quoted-Printable. Decode it and redine $address
206 if ($address =~ /=/) {
207 @ob_parts = split /=/,$address;
208 $address = converthex(@ob_parts);
210 # Check for a %. If it exists the URL is using % ASCII
211 # obfuscation. Decode it and redefine $address.
212 if ($address =~ /%/) {
213 @ob_parts = split /%/,$address;
214 $address = converthex(@ob_parts);
216 # Split the the address into the elements separated by periods.
217 @domain = split /\./,$address;
218 # Check the length of the domain name. If less then two elements
219 # at this point it is probably bogus or there is a bug in one of
220 # the decoding/converting routines above.
221 if (scalar(@domain) >= 2) {
223 # By default, assume that the domain check is on a
224 # "standard" two level domain
225 $spamcheckdomain=$domain[-2].".".$domain[-1];
226 # Check for a two level domain
227 if (((scalar(@domain) == 2) || (scalar(@domain) >= 5)) &&
228 (grep(/^$spamcheckdomain$/i,@cctlds))) {
229 $return_result="cctld";
231 # Check for a three level domain
232 if (scalar(@domain) == 3) {
233 if (grep(/^$spamcheckdomain$/i,@cctlds)) {
234 $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
235 if (grep(/^$spamcheckdomain$/,@cctlds)) {
236 $return_result="cctld";
240 # Check for a four level domain
241 if (scalar(@domain) == 4) {
242 # Check to see if the domain is an IP address
243 if ($domain[-1] =~ /[a-zA-Z]/) {
244 if (grep(/^$spamcheckdomain$/i,@cctlds)) {
245 $spamcheckdomain=$domain[-3].".".$spamcheckdomain;
246 if (grep(/^$spamcheckdomain$/i,@cctlds)) {
247 $spamcheckdomain=$domain[-4].".".$spamcheckdomain;
251 # Domain is an IP address
252 $spamcheckdomain=$domain[3].".".$domain[2].
253 ".".$domain[1].".".$domain[0];
257 #warn ("FOUND DOMAIN ($mime_filename): $spamcheckdomain\n");
258 # If whitelisting is enabled check domain against the
260 if ($whitelist_file ne "") {
261 foreach $whitelist_entry (@whitelist) {
262 chomp($whitelist_entry);
263 if ($spamcheckdomain =~ m/^$whitelist_entry$/i) {
264 $return_result="whitelisted";
269 # If the domain is whitelisted or in the cctld skip adding
270 # it to the lookup list.
271 if ($return_result eq "") {
272 if (scalar(@lookupdomains) > 0) {
273 # Check so see if the domain already is in the list.
274 if (not grep(/^$spamcheckdomain$/i,@lookupdomains)) {
275 push(@lookupdomains,$spamcheckdomain);
278 push(@lookupdomains,$spamcheckdomain);
283 # If there are items in the lookupdomains list then
284 # perform lookups on them. If there are not, something is wrong
285 # and just return false. There should always be something in the list.
286 if (scalar(@lookupdomains) > 0) {
287 foreach $i (@lookupdomains) {
289 #warn ("CHECKING DOMAIN ($mime_filename): $i\n");
290 # If SURBL lookups are enabled do an SURBL lookup
291 if ($surbl_enable == 1) {
292 $return_result = surbllookup($i);
294 # If URIBL lookups are enabled and the SURBL lookup failed
296 if (($uribl_enable == 1) && ($return_result eq "")) {
297 $return_result = uribllookup($i);
299 # If we got a hit return the result to Exim
300 if ($return_result ne "") {
303 return $return_result;
309 # We didn't find any URLs or the URLs we did find were not
310 # listed so return false.