#!/usr/bin/python
#
# Search logs for high-bandwidth users and create a list of suspicious IPs.
# There are three states: bulk, block, ban. The first are bulk requesters
# that need throttling, the second are bulk requesters that have overdone it
# and the last are manually banned IPs.
#
# The list can then be used in apache using rewrite rules to
# direct bulk users to smaller thread pools or block them. A
# typical apache config that uses php-fpm pools would look
# like this:
#
#    Alias /nominatim-www/ "/var/www/nominatim/"
#    Alias /nominatim-bulk/ "/var/www/nominatim/"
#    <Directory "/var/www/nominatim/">
#        Options MultiViews FollowSymLinks
#        AddType text/html   .php
#    </Directory>
#
#    <Location /nominatim-www>
#        AddHandler fcgi:/var/run/php5-fpm-www.sock .php
#    </Location>
#    <Location /nominatim-bulk>
#        AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php
#    </Location>
#
#    Redirect 509 /nominatim-block/
#    ErrorDocument 509 "Bandwidth limit exceeded."
#    Redirect 403 /nominatim-ban/
#    ErrorDocument 403 "Access blocked."
#
#    RewriteEngine On
#    RewriteMap bulklist txt:/home/wherever/ip-block.map
#    RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT]
#

import os
import psycopg2
import datetime

# Directory two levels above this script file; settings/ and log/ are
# looked up relative to it.
BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))

#
# DEFAULT SETTINGS
#
# Copy into settings/ip_blocks.conf and adapt as required.
#
# map file written by this script (and read back on the next run)
BLOCKEDFILE = BASEDIR + '/settings/ip_blocks.map'
# log of all changes made to the map file
LOGFILE = BASEDIR + '/log/restricted_ip.log'

# space-separated list of IPs that are never banned
WHITELIST = ''
# space-separated list of IPs manually blocked
BLACKLIST = ''
# user-agents that should be blocked from bulk mode
# (matched with startswith)
UA_BLOCKLIST = ()

# time before an automatically blocked IP is allowed back
BLOCKCOOLOFF_PERIOD = '1 hour'
# quiet time before an IP is released from the bulk pool
BULKCOOLOFF_PERIOD = '15 min'

# usage score over the last hour above which an IP becomes a bulk candidate
# (lowered under system load, see BULK_LOADFAC and BULK_LOWER)
BULKLONG_LIMIT = 8000
# usage score over the last 10 minutes above which an IP becomes a bulk
# candidate
BULKSHORT_LIMIT = 2000
# usage score above which an IP is blocked right away; also the starting
# point for the load-dependent block limit
BLOCK_UPPER = 19000
# the load-dependent block limit is never lowered below this value
BLOCK_LOWER = 4000
# amount the block limit is changed by per unit of database load
BLOCK_LOADFAC = 380
# amount BULKLONG_LIMIT is changed by per unit of system load
BULK_LOADFAC = 160
# BULKLONG_LIMIT is never lowered below this value
BULK_LOWER = 1500
# size of the bulk pool above which the block limit is tightened
MAX_BULK_IPS = 85

#
# END OF DEFAULT SETTINGS
#

# Load local overrides. The configuration file is executed as Python code
# in this module's namespace, so it must only ever be writable by trusted
# users. execfile() is not available in Python 3, therefore the file is
# read and executed explicitly, which works with Python 2 and 3 alike.
_conf_path = os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf")
try:
    with open(_conf_path) as _conf_fd:
        _conf_code = compile(_conf_fd.read(), _conf_path, 'exec')
except IOError:
    # no (readable) local configuration, run with the defaults
    _conf_code = None

if _conf_code is not None:
    exec(_conf_code)

# read the previous blocklist
#
# WHITELIST and BLACKLIST are configured as space-separated strings and
# are converted into sets here. prevblocks/prevbulks receive the IPs that
# were in state 'block'/'bulk' after the last run, newblocks/newbulks
# collect the state to be written out by this run.
WHITELIST = set(WHITELIST.split()) if WHITELIST else set()
BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set()
prevblocks = []
prevbulks = []
newblocks = set()
newbulks = set()

try:
    with open(BLOCKEDFILE) as fd:
        for line in fd:
            fields = line.split()
            if len(fields) != 2:
                # blank or malformed line, nothing usable in there
                continue
            ip, typ = fields
            # Blacklisted IPs are always written out as 'ban' and
            # whitelisted IPs must never be restricted, so neither of
            # them is carried over from the previous run.
            if ip in BLACKLIST or ip in WHITELIST:
                continue
            if typ == 'block':
                prevblocks.append(ip)
            elif typ == 'bulk':
                prevbulks.append(ip)
except IOError:
    pass #ignore non-existing file

# System load: third field of /proc/loadavg (the 15 minute average
# according to proc(5)), truncated to an integer.
with open("/proc/loadavg") as fd:
    loadfields = fd.readline().split()
avgload = int(float(loadfields[2]))

# Database load: number of requests logged during the last minute,
# divided by 60.
conn = psycopg2.connect('dbname=nominatim')
cur = conn.cursor()
cur.execute("select count(*)/60 from new_query_log where starttime > now() - interval '1min'")
dbload = int(cur.fetchone()[0])

# Adapt the limits to the current load. Both limits are lowered while the
# load is above a fixed reference value (75 for the database, 14 for the
# system) but never drop below their configured minimum.
bulkcount = len(prevbulks)
BLOCK_LIMIT = max(BLOCK_LOWER, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (avgload - 14))
# an overfull bulk pool tightens the block limit further
if bulkcount > MAX_BULK_IPS:
    BLOCK_LIMIT = max(3600, BLOCK_LOWER - 10 * (bulkcount - MAX_BULK_IPS))
# With a nearly empty bulk pool the clients get served faster. Raise the
# limit so that they do not get blocked because of that.
if bulkcount < 10:
    BLOCK_LIMIT = 2*BLOCK_UPPER


# Collect the candidates for bulk/block.
#
# Each IP gets a usage score: every request counts 1 plus 1.5 times its
# duration in seconds (requests without an end time count 1). An IP is a
# candidate when its score of the last hour exceeds BULKLONG_LIMIT or its
# score of the last 10 minutes exceeds BULKSHORT_LIMIT. The 10 minute score
# is multiplied by 3 and the higher of the two values is reported, together
# with the first 30 characters of a user agent the IP has sent.
bulkips = {}
emergencyblocks = []
useragentblocks = []

cur.execute("""
  SELECT ipaddress, max(count), max(ua) FROM
   ((SELECT * FROM
     (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log
      WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i
   WHERE count > %s)
   UNION
   (SELECT ipaddress, count * 3, ua FROM
     (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log 
      WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i
   WHERE count > %s)) as o
  GROUP BY ipaddress
""", (BULKLONG_LIMIT, BULKSHORT_LIMIT))

for ipaddr, usage, agent in cur:
    # white- and blacklisted IPs are never classified automatically
    if ipaddr in WHITELIST or ipaddr in BLACKLIST:
        continue

    # an empty user agent or one from UA_BLOCKLIST leads to an immediate block
    badagent = not agent or any(agent.startswith(ua) for ua in UA_BLOCKLIST)

    # Everything that is not blocked right away is a bulk candidate for
    # now. That includes the IPs that are blocked already, they are
    # sorted out when the previous block list is processed.
    if ipaddr in prevblocks or not (badagent or usage > BLOCK_UPPER):
        bulkips[ipaddr] = usage
        continue

    newblocks.add(ipaddr)
    if badagent:
        useragentblocks.append(ipaddr)
    else:
        emergencyblocks.append(ipaddr)

# previously blocked IPs that are not among the current candidates
deblockcandidates = set()
# previous bulk IPs that are not among the current candidates
debulkcandidates = set()
# bulk IPs that are moved into the block list by this run (for the log)
newlyblocked = []

# Blocked IPs that are still candidates remain blocked.
for ip in prevblocks:
    if ip not in bulkips:
        deblockcandidates.add(ip)
        continue
    newblocks.add(ip)
    del bulkips[ip]

# Bulk IPs that are still candidates remain in the bulk pool unless their
# usage exceeds the current block limit.
for ip in prevbulks:
    if ip in newblocks:
        continue
    if ip not in bulkips:
        debulkcandidates.add(ip)
        continue
    usage = bulkips.pop(ip)
    if usage > BLOCK_LIMIT:
        newblocks.add(ip)
        newlyblocked.append(ip)
    else:
        newbulks.add(ip)

# cross-check deblock candidates
# Candidates that have sent a request within BLOCKCOOLOFF_PERIOD remain
# blocked.
if deblockcandidates:
    # IPs and interval are handed over as query parameters instead of
    # being pasted into the SQL string. psycopg2 renders a tuple as a
    # list suitable for the IN operator.
    cur.execute("""
        SELECT DISTINCT ipaddress FROM new_query_log
        WHERE ipaddress IN %s AND starttime > now() - %s::interval
        """, (tuple(deblockcandidates), BLOCKCOOLOFF_PERIOD))

    for c in cur:
        newblocks.add(c[0])
        deblockcandidates.discard(c[0])
# deblocked IPs go back to the bulk pool to catch the ones that simply
# ignored the HTTP error and just continue to hammer the API.
# Those that behave and stopped are released from the bulk pool by a
# later run, once they pass the debulk cross-check.
newbulks.update(deblockcandidates)

# cross-check debulk candidates
# Candidates that have sent a request within BULKCOOLOFF_PERIOD remain in
# the bulk pool. Only requests of the current day are taken into account.
if debulkcandidates:
    # IPs and interval are handed over as query parameters instead of
    # being pasted into the SQL string. psycopg2 renders a tuple as a
    # list suitable for the IN operator.
    cur.execute("""
        SELECT DISTINCT ipaddress FROM new_query_log
        WHERE ipaddress IN %s AND starttime > now() - %s::interval
        AND starttime > date_trunc('day', now())
        """, (tuple(debulkcandidates), BULKCOOLOFF_PERIOD))

    for c in cur:
        newbulks.add(c[0])
        debulkcandidates.discard(c[0])

# Whatever is left in bulkips are IPs that were neither in the bulk nor in
# the block list before: add them to the bulk pool. (Updating the set from
# the dictionary adds its keys and works with Python 2 and 3 alike.)
newbulks.update(bulkips)

# Write out the new list, one "<ip> <state>" pair per line: first the
# blocked IPs, then the bulk IPs and finally the manually banned ones.
with open(BLOCKEDFILE, 'w') as fd:
    for iplist, state in ((newblocks, "block"),
                          (newbulks, "bulk"),
                          (BLACKLIST, "ban")):
        for ip in iplist:
            fd.write(ip + " " + state + "\n")

# Append the changes of this run to the log. Every non-empty category
# gets one line consisting of the time stamp, the category label and the
# comma-separated IPs.
logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
logentries = (
    ('unblocked:', deblockcandidates),
    (' debulked:', debulkcandidates),
    ('new bulks:', bulkips),
    ('dir.block:', emergencyblocks),
    (' ua block:', useragentblocks),
    ('new block:', newlyblocked),
)
with open(LOGFILE, 'a') as fd:
    for label, ips in logentries:
        if ips:
            fd.write(logstr % (label, ', '.join(ips)))
