3 # Search logs for high-bandwidth users and create a list of suspicious IPs.
4 # There are three states: bulk, block, ban. The first are bulk requesters
5 # that need throttling, the second are bulk requesters that have overdone it
6 # and the last are manually banned IPs.
8 # The list can then be used in apache using rewrite rules to
9 # direct bulk users to smaller thread pools or block them. A
10 # typical apache config that uses php-fpm pools would look
13 # Alias /nominatim-www/ "/var/www/nominatim/"
14 # Alias /nominatim-bulk/ "/var/www/nominatim/"
15 # <Directory "/var/www/nominatim/">
16 # Options MultiViews FollowSymLinks
17 # AddType text/html .php
20 # <Location /nominatim-www>
21 # AddHandler fcgi:/var/run/php5-fpm-www.sock .php
23 # <Location /nominatim-bulk>
24 # AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php
27 # Redirect 509 /nominatim-block/
28 # ErrorDocument 509 "Bandwidth limit exceeded."
29 # Redirect 403 /nominatim-ban/
30 # ErrorDocument 403 "Access blocked."
33 # RewriteMap bulklist txt:/home/wherever/ip-block.map
34 # RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT]
# Installation root: the parent of the directory that contains this script
# (realpath resolves symlinks before walking two levels up).
41 BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
46 # Copy into settings/ip_blocks.conf and adapt as required.
# Map file with the state per IP; it is read back and rewritten by this script.
48 BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
# Log file that the state changes of each run are appended to.
49 LOGFILE= BASEDIR + '/log/restricted_ip.log'
51 # space-separated list of IPs that are never banned
53 # space-separated list of IPs manually blocked
55 # user-agents that should be blocked from bulk mode
56 # (matched with startswith)
59 # time before an automatically blocked IP is allowed back
# (pasted verbatim into an SQL "interval '...'" literal further down, so the
# value has to be a valid PostgreSQL interval string)
60 BLOCKCOOLOFF_PERIOD='1 hour'
61 # quiet time before an IP is released from the bulk pool
# (also pasted verbatim into an SQL interval literal)
62 BULKCOOLOFF_PERIOD='15 min'
74 # END OF DEFAULT SETTINGS
# Load the site-specific settings. execfile() runs the file as Python code,
# so the defaults defined before this point can simply be reassigned there.
# The file must come from a trusted source because it is executed, not parsed.
# NOTE(review): execfile() exists only in Python 2.
78 execfile(os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf"))
82 # read the previous blocklist
# The two settings are whitespace-separated strings; turn them into sets so
# that the membership tests below are cheap. An empty setting gives an empty set.
83 WHITELIST = set(WHITELIST.split()) if WHITELIST else set()
86 BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set()
# Parse the map file left behind by the previous run.
91 fd = open(BLOCKEDFILE)
# Every line has to be "<ip> <state>" separated by exactly one space; any
# other shape makes the two-name unpacking raise ValueError.
93 ip, typ = line.strip().split(' ')
# Entries whose IP is in BLACKLIST are skipped here.
94 if ip not in BLACKLIST:
101 pass #ignore non-existing file
103 # determine current load
104 fd = open("/proc/loadavg")
# Third field of /proc/loadavg (the 15-minute load average on Linux),
# truncated to an integer.
105 avgload = int(float(fd.readline().split()[2]))
108 conn = psycopg2.connect('dbname=nominatim')
# Rows logged in new_query_log during the last minute, divided by 60
# (presumably one row per request, i.e. requests per second -- TODO confirm).
110 cur.execute("select count(*)/60 from new_query_log where starttime > now() - interval '1min'")
111 dbload = int(cur.fetchone()[0])
# Load-dependent block limit: BLOCK_UPPER minus BLOCK_LOADFAC for every unit
# that dbload exceeds 75, but never lower than BLOCK_LOWER.
113 BLOCK_LIMIT = max(BLOCK_LOWER, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
# BULKLONG_LIMIT is rebound from its configured value: reduced by
# BULK_LOADFAC per unit that avgload exceeds 14, but never lower than BULK_LOWER.
114 BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (avgload - 14))
# With more than MAX_BULK_IPS entries in prevbulks (presumably the bulk IPs
# of the previous run) BLOCK_LIMIT is recomputed from BLOCK_LOWER instead:
# 10 less per excess IP, with a floor of 3600.
115 if len(prevbulks) > MAX_BULK_IPS:
116 BLOCK_LIMIT = max(3600, BLOCK_LOWER - (len(prevbulks) - MAX_BULK_IPS)*10)
118 # get the new block candidates
# Per-IP score: every logged request counts 1, plus 1.5 for each second
# between starttime and endtime when endtime is set. The score is computed
# for the last hour and for the last 10 minutes (the latter multiplied by 3)
# and the outer select takes the maximum of the scores and of the user agent
# prefix (first 30 characters). The bound parameters are BULKLONG_LIMIT and
# BULKSHORT_LIMIT, in that order.
120 SELECT ipaddress, max(count), max(ua) FROM
122 (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log
123 WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i
126 (SELECT ipaddress, count * 3, ua FROM
127 (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log
128 WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i
129 WHERE count > %s)) as o
131 """, (BULKLONG_LIMIT, BULKSHORT_LIMIT))
# c is one row of the candidate query; judging by its select list,
# c[0] is the IP address, c[1] the score and c[2] the user agent prefix
# -- TODO confirm against the fetch loop.
# Whitelisted and manually banned IPs are not classified at all.
138 if c[0] not in WHITELIST and c[0] not in BLACKLIST:
139 # check for user agents that receive an immediate block
# An empty or NULL user agent counts as missing.
140 missing_agent = not c[2]
141 if not missing_agent:
142 for ua in UA_BLOCKLIST:
# prefix match against each configured agent string
143 if c[2].startswith(ua):
# Immediate block when the agent flag is set or the score exceeds the static
# BLOCK_UPPER (not the load-adjusted BLOCK_LIMIT), but only for IPs that are
# not in prevblocks (presumably the blocked IPs of the previous run).
146 if (missing_agent or c[1] > BLOCK_UPPER) and c[0] not in prevblocks:
# The two lists are only used for the log output at the end of the script.
149 useragentblocks.append(c[0])
151 emergencyblocks.append(c[0])
155 # IPs from the block list that are no longer in the bulk list
156 deblockcandidates = set()
157 # IPs from the previous bulk list that are no longer in the new bulk list
158 debulkcandidates = set()
159 # new IPs to go into the block list
# Walk over the previously blocked IPs and collect those that may be released.
163 for ip in prevblocks:
168 deblockcandidates.add(ip)
171 if ip not in newblocks:
# bulkips maps an IP to its current score (presumably filled from the
# candidate query); a score above the load-adjusted limit leads to a block.
173 if bulkips[ip] > BLOCK_LIMIT:
175 newlyblocked.append(ip)
180 debulkcandidates.add(ip)
182 # cross-check deblock candidates
# Candidates that still have log entries within BLOCKCOOLOFF_PERIOD are
# removed from the candidate set again.
# NOTE(review): the IN list and the interval are pasted into the SQL text
# with Python % formatting instead of being passed as query parameters. That
# is only safe as long as the IPs and the period come from trusted sources.
183 if deblockcandidates:
185 SELECT DISTINCT ipaddress FROM new_query_log
186 WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
187 """ % ("','".join(deblockcandidates), BLOCKCOOLOFF_PERIOD))
191 deblockcandidates.remove(c[0])
192 # deblocked IPs go back to the bulk pool to catch the ones that simply
193 # ignored the HTTP error and just continue to hammer the API.
194 # Those that behave and stopped will be debulked a minute later.
195 for ip in deblockcandidates:
198 # cross-check debulk candidates
# Same check as for the deblock candidates, using BULKCOOLOFF_PERIOD. The
# additional date_trunc condition only lets requests logged since the start
# of the current day count. The query text is again built with % formatting
# rather than with query parameters.
201 SELECT DISTINCT ipaddress FROM new_query_log
202 WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
203 AND starttime > date_trunc('day', now())
204 """ % ("','".join(debulkcandidates), BULKCOOLOFF_PERIOD))
208 debulkcandidates.remove(c[0])
# NOTE(review): dict.iterkeys() exists only in Python 2.
210 for ip in bulkips.iterkeys():
213 # write out the new list
# The map file is truncated and rewritten. Every line is "<ip> <state>" with
# a single space, which is the format parsed when the previous blocklist is
# read, and the state is one of block, bulk or ban.
214 fd = open(BLOCKEDFILE, 'w')
216 fd.write(ip + " block\n")
218 fd.write(ip + " bulk\n")
220 fd.write(ip + " ban\n")
# Template for one log line: local timestamp with minute resolution, then a
# label and a comma-separated list of IPs filled in by the writes below.
224 logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
# Opened in append mode, so the entries of earlier runs are kept.
225 fd = open(LOGFILE, 'a')
226 if deblockcandidates:
227 fd.write(logstr % ('unblocked:', ', '.join(deblockcandidates)))
229 fd.write(logstr % (' debulked:', ', '.join(debulkcandidates)))
231 fd.write(logstr % ('new bulks:', ', '.join(bulkips.keys())))
233 fd.write(logstr % ('dir.block:', ', '.join(emergencyblocks)))
235 fd.write(logstr % (' ua block:', ', '.join(useragentblocks)))
237 fd.write(logstr % ('new block:', ', '.join(newlyblocked)))