.*?)"')
+# Matches the leading part of a log line: a run of hex digits, ':' and '.'
+# (presumably the client address), the literal " - - " and the bracketed
+# timestamp described by time_regex. Both literal parts are raw strings so
+# that '\[' and '\]' reach the regex engine without relying on Python
+# passing unknown string escapes through unchanged.
+time_pat = re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + r'\] ')
+
+# strptime() format for log timestamps; in the code visible here it is only
+# referenced from a commented-out line in LogEntry.get_log_time().
+logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
+
+# Month abbreviation (value of the t_month regex group) -> month number.
+MONTHS = {
+    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
+    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
+}
+
+class LogEntry:
+ """ One access-log line, split into the fields used by the analysis.
+
+ The constructor sets: ip, date, request, query, retcode, referer, ua.
+ """
+ def __init__(self, logline):
+ """ Parse `logline` with the module-level format_pat.
+
+ Raises ValueError when the line does not match the pattern.
+ """
+ e = format_pat.match(logline)
+ if e is None:
+ raise ValueError("Invalid log line:", logline)
+ e = e.groupdict()
+ self.ip = e['ip']
+ # Naive datetime built from the t_* groups; no timezone information
+ # from the log line is applied here.
+ self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+ int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+ # At most three parts; qp[0] is compared against the HTTP method and
+ # qp[1] is searched for the API endpoint (presumably the request URL).
+ qp = e['query'].split(' ', 2)
+ if len(qp) < 2:
+ self.request = None
+ self.query = None
+ else:
+ self.query = qp[1]
+ if qp[0] == 'OPTIONS':
+ self.request = None
+ else:
+ # request is a one-letter code for the API endpoint:
+ # 'S' search, 'R' reverse, 'D' details, 'L' lookup,
+ # None for anything else.
+ if '/?' in qp[1]:
+ self.request = 'S'
+ elif '/search' in qp[1]:
+ self.request = 'S'
+ elif '/reverse' in qp[1]:
+ self.request = 'R'
+ elif '/details' in qp[1]:
+ self.request = 'D'
+ elif '/lookup' in qp[1]:
+ self.request = 'L'
+ else:
+ self.request = None
+ # NOTE(review): stores the complete, unsplit query group. Depending on
+ # the indentation of this line it may overwrite the values assigned to
+ # self.query above - confirm against the original file.
+ self.query = e['query']
+ self.retcode = int(e['return'])
+ # A '-' in the log means the field was not sent; store None instead.
+ self.referer = e['referer'] if e['referer'] != '-' else None
+ self.ua = e['ua'] if e['ua'] != '-' else None
+
+ # NOTE(review): defined without `self` and without @staticmethod, so it
+ # has to be called through the class (LogEntry.get_log_time(line)), not
+ # through an instance.
+ def get_log_time(logline):
+ """ Return the timestamp of `logline` as a naive datetime, or None
+ when the line does not match format_pat.
+ """
+ e = format_pat.match(logline)
+ if e is None:
+ return None
+ e = e.groupdict()
+ #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
+ return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+ int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+
+class LogFile:
+ """ An apache log file, unpacked. """
+
+ def __init__(self, filename):
+ """ Open `filename` for reading and remember its size in bytes. """
+ self.fd = open(filename)
+ self.len = os.path.getsize(filename)
+
+ def __del__(self):
+ self.fd.close()
+
+ def seek_next(self, abstime):
+ """ Return the timestamp of the first complete line after an offset.
+
+ Despite its name, `abstime` is passed to fd.seek(), i.e. it is
+ used as a file offset. The first line read after seeking is thrown
+ away (the offset may point into the middle of a line); the time of
+ the following line is returned, or None if it cannot be parsed.
+ """
+ self.fd.seek(abstime)
+ self.fd.readline()
+ l = self.fd.readline()
+ # readline() returns '' (not None) at end of file, so the check below
+ # always passes; an empty line is then handled by get_log_time().
+ return LogEntry.get_log_time(l) if l is not None else None
+
+ def seek_to_date(self, target):
+ """ Move the file position near the entries logged around `target`.
+
+ Narrows a [fromseek, toseek] byte window around `target` by
+ interpolating between the timestamps found at both ends.
+ Returns True when the search stopped successfully (the first probed
+ entry is already later than target, a probe is within one second of
+ target, or the window is smaller than 500 bytes). Returns False
+ when the last readable entry is older than target or an
+ interpolated probe hit a line without a timestamp.
+ The file position is left wherever the last seek_next() call put it.
+ """
+ # start position for binary search
+ fromseek = 0
+ fromdate = self.seek_next(0)
+ # NOTE(review): if seek_next() returned None, this comparison (and the
+ # similar ones on fromdate/todate in the inner loops below) raises
+ # TypeError on Python 3.
+ if fromdate > target:
+ return True
+ # end position for binary search
+ # Step back from the end of the file in 100-byte increments until a
+ # line with a parseable timestamp is found.
+ # NOTE(review): todate is only assigned inside this loop; for a file of
+ # 100 bytes or less the loop body never runs.
+ toseek = -100
+ while -toseek < self.len:
+ todate = self.seek_next(self.len + toseek)
+ if todate is not None:
+ break
+ toseek -= 100
+ if todate is None or todate < target:
+ return False
+ # convert the offset relative to the end into an absolute one
+ toseek = self.len + toseek
+
+
+ while True:
+ # bps: estimated bytes of log per second inside the current window
+ bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+ # linear interpolation of the byte offset where target should be
+ newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+ newdate = self.seek_next(newseek)
+ if newdate is None:
+ return False;
+ # distance in seconds between the probe and the target
+ error = abs((target - newdate).total_seconds())
+ if error < 1:
+ return True
+ if newdate > target:
+ # Probe is past the target: it becomes the new upper bound. Then
+ # try a lower bound `error` seconds worth of bytes before it,
+ # stepping further back (with doubled bps) until the entry there
+ # is before target or the previous lower bound is reached.
+ toseek = newseek
+ todate = newdate
+ oldfromseek = fromseek
+ fromseek = toseek - error * bps
+ while True:
+ if fromseek <= oldfromseek:
+ fromseek = oldfromseek
+ fromdate = self.seek_next(fromseek)
+ break
+ fromdate = self.seek_next(fromseek)
+ if fromdate < target:
+ break;
+ bps *=2
+ fromseek -= error * bps
+ else:
+ # Probe is before the target: mirror image of the branch above,
+ # moving the upper bound forward instead.
+ fromseek = newseek
+ fromdate = newdate
+ oldtoseek = toseek
+ toseek = fromseek + error * bps
+ while True:
+ if toseek > oldtoseek:
+ toseek = oldtoseek
+ todate = self.seek_next(toseek)
+ break
+ todate = self.seek_next(toseek)
+ if todate > target:
+ break
+ bps *=2
+ toseek += error * bps
+ # window is small enough, stop searching
+ if toseek - fromseek < 500:
+ return True
+
+
+ def loglines(self):
+ """ Generator over the remaining lines of the file, starting at the
+ current file position, yielding a LogEntry per parseable line.
+ """
+ for l in self.fd:
+ try:
+ yield LogEntry(l)
+ except ValueError:
+ pass # ignore invalid lines
+
+class BlockList:
+    """ Configured white-/blacklist plus the block state of the previous run.
+
+        whitelist, blacklist: sets built from the whitespace-separated
+        WHITELIST and BLACKLIST settings.
+        prevblocks, prevbulks: IPs that BLOCKEDFILE lists as 'block' or
+        'bulk'; blacklisted IPs are left out of both sets.
+    """
+
+    def __init__(self):
+        self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
+        self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
+        self.prevblocks = set()
+        self.prevbulks = set()
+
+        try:
+            # 'with' closes the state file even if reading it fails half-way.
+            with open(BLOCKEDFILE) as fd:
+                for line in fd:
+                    fields = line.split()
+                    if len(fields) != 2:
+                        # blank or malformed line: skip it instead of
+                        # aborting the whole run with a ValueError
+                        continue
+                    ip, typ = fields
+                    if ip in self.blacklist:
+                        continue
+                    if typ == 'block':
+                        self.prevblocks.add(ip)
+                    elif typ == 'bulk':
+                        self.prevbulks.add(ip)
+        except IOError:
+            pass #ignore non-existing file
+
+
+class IPstats:
+ """ Request counters for a single client IP.
+
+ The short_* counters are only increased by add_short(); the long_*
+ counters are increased by add_long() and, through it, by add_short().
+ *_total counts every entry, *_api only entries whose request is set.
+ """
+
+ def __init__(self):
+ self.short_total = 0
+ self.short_api = 0
+ self.long_total = 0
+ self.long_api = 0
+ self.bad_ua = False
+
+ def add_long(self, logentry):
+ """ Count `logentry` in the long_* counters. """
+ self.long_total += 1
+ if logentry.request is not None:
+ self.long_api += 1
+ # bad_ua is sticky: it only ever changes from False to True, and only
+ # for an entry without user agent (ua is None).
+ if not self.bad_ua:
+ if logentry.ua is None:
+ self.bad_ua = True
+
+ def add_short(self, logentry):
+ """ Count `logentry` in the short_* and in the long_* counters. """
+ self.short_total += 1
+ if logentry.request is not None:
+ self.short_api += 1
+ self.add_long(logentry)
+
+ def new_state(self, was_blocked, was_bulked):
+ """ Decide the new state of this IP from its counters.
+
+ `was_blocked` / `was_bulked` give the previous state of the IP.
+ Returns 'block', 'emblock', 'bulk' or None (no restriction). The
+ thresholds are the module-level BLOCK_UPPER, BLOCK_LIMIT,
+ BULKLONG_LIMIT and BULKSHORT_LIMIT values at the time of the call.
+ """
+ if was_blocked:
+ # deblock only if the IP has been really quiet
+ # (properly catches the ones that simply ignore the HTTP error)
+ return None if self.long_total < 20 else 'block'
+ if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
+ # client totally overdoing it
+ return 'block'
+ if was_bulked:
+ if self.short_total < 20:
+ # client has stopped, debulk
+ return None
+ if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
+ # client is still hammering us, block
+ return 'emblock'
+ return 'bulk'
+
+ if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
+ #if self.bad_ua:
+ # return 'uablock' # bad useragent
+ return 'bulk'
+
+ return None
+
+
+
+# Script entry point: read the log from a start date onwards, count requests
+# per IP, rewrite BLOCKEDFILE with the new states and append the state
+# changes to LOGFILE.
+if __name__ == '__main__':
+ if len(sys.argv) < 2:
+ print("Usage: %s logfile startdate" % sys.argv[0])
+ sys.exit(-1)
+
+ # startdate is optional; without it the analysis starts
+ # BLOCKCOOLOFF_DELTA before now.
+ if len(sys.argv) == 2:
+ dt = datetime.now() - BLOCKCOOLOFF_DELTA
+ else:
+ dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
+
+ # NOTE(review): 1030 looks like a typo for 1024 - confirm.
+ if os.path.getsize(sys.argv[1]) < 2*1030*1024:
+ sys.exit(0) # not enough data
+
+ lf = LogFile(sys.argv[1])
+ if not lf.seek_to_date(dt):
+ sys.exit(0)
+
+ bl = BlockList()
+
+ # Entries later than shortstart are counted in the short_* counters too.
+ # With the default start date this is BULKCOOLOFF_DELTA before the time
+ # dt was computed.
+ shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
+ # IPs on either list are not counted at all.
+ notlogged = bl.whitelist | bl.blacklist
+
+ stats = defaultdict(IPstats)
+
+ # First pass: entries up to shortstart only go into the long counters.
+ # The entry that ends this loop is itself still counted as 'long' only.
+ for l in lf.loglines():
+ if l.ip not in notlogged:
+ stats[l.ip].add_long(l)
+ if l.date > shortstart:
+ break
+
+ # Second pass: a new generator over the same file object, so reading
+ # goes on where the first loop stopped.
+ # total200 counts API requests answered with status 200; it is not used
+ # further in the code visible here.
+ total200 = 0
+ for l in lf.loglines():
+ if l.ip not in notlogged:
+ stats[l.ip].add_short(l)
+ if l.request is not None and l.retcode == 200:
+ total200 += 1
+
+ # adapt limits according to CPU and DB load
+ # third field of /proc/loadavg (the 15-minute load average), truncated
+ fd = open("/proc/loadavg")
+ cpuload = int(float(fd.readline().split()[2]))
+ fd.close()
+ # check the number of excess connections to apache
+ dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
+ # NOTE(review): counts the lines of the ps listing containing 'php-fpm';
+ # the grep process itself may be among them.
+ fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
+ dbload = max(0, dbcons - fpms)
+
+ numbulks = len(bl.prevbulks)
+ # The limits below are module-level names; IPstats.new_state() reads the
+ # adjusted values.
+ # BLOCK_LIMIT: BLOCK_UPPER lowered by BLOCK_LOADFAC per excess connection,
+ # but never below its configured value.
+ BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
+ # BULKLONG_LIMIT: lowered by BULK_LOADFAC per unit of CPU load, but never
+ # below BULK_LOWER.
+ BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
+ # too many IPs in the bulk pool: lower the block limit by 10 for every
+ # IP above MAX_BULK_IPS, down to a minimum of 3600
+ if numbulks > MAX_BULK_IPS:
+ BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
+ # if the bulk pool is still empty, clients will be faster, avoid having
+ # them blocked in this case
+ if numbulks < 10:
+ BLOCK_UPPER *= 2
+ BLOCK_LIMIT = BLOCK_UPPER
+
+
+ # collecting statistics
+ unblocked = []
+ debulked = []
+ bulked = []
+ blocked = []
+ uablocked = []
+ emblocked = []
+ # write out new state file
+ # NOTE(review): only IPs that appear in `stats` are visited. An IP from
+ # the previous state file without any log entry in the analysed range is
+ # neither written back nor reported as unblocked/debulked.
+ fd = open(BLOCKEDFILE, 'w')
+ for k,v in stats.items():
+ wasblocked = k in bl.prevblocks
+ wasbulked = k in bl.prevbulks
+ state = v.new_state(wasblocked, wasbulked)
+ if state is not None:
+ # 'uablock' and 'emblock' are only distinguished for the log; both
+ # are written to the state file as 'block'.
+ if state == 'uablock':
+ uablocked.append(k)
+ state = 'block'
+ elif state == 'emblock':
+ emblocked.append(k)
+ state = 'block'
+ elif state == 'block':
+ if not wasblocked:
+ blocked.append(k)
+ elif state == 'bulk':
+ if not wasbulked:
+ bulked.append(k)
+ fd.write("%s %s\n" % (k, state))
+ else:
+ if wasblocked:
+ unblocked.append(k)
+ elif wasbulked:
+ debulked.append(k)
+ # blacklisted IPs are always written, with the state 'ban'
+ for i in bl.blacklist:
+ fd.write("%s ban\n" % i)
+ fd.close()
+
+ # TODO write logs (need to collect some statistics)
+ # One line per non-empty category, prefixed with the current time.
+ logstr = datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+ fd = open(LOGFILE, 'a')
+ if unblocked:
+ fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
+ if debulked:
+ fd.write(logstr % (' debulked:', ', '.join(debulked)))
+ if bulked:
+ fd.write(logstr % ('new bulks:', ', '.join(bulked)))
+ if emblocked:
+ fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
+ if uablocked:
+ fd.write(logstr % (' ua block:', ', '.join(uablocked)))
+ if blocked:
+ fd.write(logstr % ('new block:', ', '.join(blocked)))
+ fd.close()
diff --git a/utils/cron_logrotate.sh b/utils/cron_logrotate.sh
new file mode 100755
index 00000000..b9291d95
--- /dev/null
+++ b/utils/cron_logrotate.sh
@@ -0,0 +1,29 @@
+#!/bin/bash -e
+#
+# Rotate query logs.
+#
+# Dumps backup_query_log into a dated, gzipped file below ../log (relative to
+# this script), drops the table, then renames new_query_log to
+# backup_query_log and creates a fresh, empty new_query_log.
+
+# Let the dump pipeline fail when pg_dump fails. Without pipefail only the
+# exit status of gzip counts, so -e would not stop the script and the
+# table would be dropped without a usable dump.
+set -o pipefail
+
+dbname=nominatim
+
+basedir=$(dirname "$0")
+logfile="$basedir/../log/query-$(date +%F).log.gz"
+
+# dump the old logfile
+pg_dump -a -F p -t backup_query_log "$dbname" | gzip -9 > "$logfile"
+
+# remove the old logs
+psql -q -d "$dbname" -c 'DROP TABLE backup_query_log'
+
+# rotate
+psql -q -1 -d "$dbname" -c 'ALTER TABLE new_query_log RENAME TO backup_query_log;CREATE TABLE new_query_log as (select * from backup_query_log limit 0);GRANT SELECT, INSERT, UPDATE ON new_query_log TO "www-data"'
+psql -q -d "$dbname" -c 'ALTER INDEX idx_new_query_log_starttime RENAME TO idx_backup_query_log_starttime'
+psql -q -d "$dbname" -c 'CREATE INDEX idx_new_query_log_starttime ON new_query_log USING BTREE (starttime)'
+
diff --git a/utils/cron_vacuum.sh b/utils/cron_vacuum.sh
new file mode 100755
index 00000000..4c16fc65
--- /dev/null
+++ b/utils/cron_vacuum.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Vacuum all tables with indices on integer arrays.
+# Aggressive vacuuming seems to help against index bloat.
+#
+
+# Run VACUUM ANALYSE on one table of the nominatim database.
+vacuum_table() {
+    psql -q -d nominatim -c "VACUUM ANALYSE $1"
+}
+
+vacuum_table search_name
+vacuum_table search_name_country
+#vacuum_table planet_osm_ways
+
+for ((i = 0; i <= 246; i++)); do
+    vacuum_table "search_name_${i}"
+done
+
diff --git a/utils/update.php b/utils/update.php
index 3cc9cdd6..d4dc60c5 100755
--- a/utils/update.php
+++ b/utils/update.php
@@ -38,6 +38,7 @@ $aCMDOptions
getCmdOpt($_SERVER['argv'], $aCMDOptions, $aResult, true, true);
if (!isset($aResult['index-instances'])) $aResult['index-instances'] = 1;
+
if (!isset($aResult['index-rank'])) $aResult['index-rank'] = 0;
date_default_timezone_set('Etc/UTC');
diff --git a/website/403.html b/website/403.html
new file mode 100644
index 00000000..8d414aa7
--- /dev/null
+++ b/website/403.html
@@ -0,0 +1,23 @@
+
+
+Access blocked
+
+
+Access blocked
+
+You have been blocked because you have violated the
+usage policy
+of OSM's Nominatim geocoding service. Please be aware that OSM's resources are
+limited and shared between many users. The usage policy is there to ensure that
+the service remains usable for everybody.
+
+Please review the terms and make sure that your
+software adheres to the terms. You should in particular verify that you have set a
+valid referrer or a user agent that identifies your application, and
+that you are not overusing the service with massive bulk requests.
+
+If you feel that this block is unjustified or remains after you have adapted
+your usage, you may contact the Nominatim system administrator at
+nominatim@openstreetmap.org to have this block lifted.
+
+
diff --git a/website/509.html b/website/509.html
new file mode 100644
index 00000000..08622e32
--- /dev/null
+++ b/website/509.html
@@ -0,0 +1,12 @@
+
+
+Bandwidth limit exceeded
+
+
+Bandwidth limit exceeded
+
+You have been temporarily blocked because you have been overusing OSM's geocoding service or because you have not provided sufficient identification of your application. This block will be automatically lifted after a while. Please take the time and adapt your scripts to reduce the number of requests and make sure that you send a valid UserAgent or Referer.
+
+For more information, consult the usage policy for the OSM Nominatim server.
+
+
diff --git a/website/crossdomain.xml b/website/crossdomain.xml
new file mode 100644
index 00000000..963a682b
--- /dev/null
+++ b/website/crossdomain.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/website/favicon.ico b/website/favicon.ico
new file mode 100644
index 00000000..0157ea00
Binary files /dev/null and b/website/favicon.ico differ
diff --git a/website/last_update.php b/website/last_update.php
new file mode 100644
index 00000000..a8435861
--- /dev/null
+++ b/website/last_update.php
@@ -0,0 +1,26 @@
+getOne("select * from import_status");
+ if (PEAR::isError($sLastUpdate))
+ {
+ statusError("Update status unknown.");
+ }
+ echo $sLastUpdate;
+ exit;
+
diff --git a/website/nominatim.xml b/website/nominatim.xml
new file mode 100644
index 00000000..28684b16
--- /dev/null
+++ b/website/nominatim.xml
@@ -0,0 +1,15 @@
+
+
+ Nominatim
+ Nominatim OSM Search
+ Search for a place in OpenStreetMap Nominatim
+ UTF-8
+ UTF-8
+
+
+ Brian Quinion
+ false
+ Data © OpenStreetMap contributors, Some Rights Reserved. ODbL, http://www.osm.org/copyright.
+
+
diff --git a/website/reverse.php b/website/reverse.php
index 026fa85b..00bcf635 100755
--- a/website/reverse.php
+++ b/website/reverse.php
@@ -91,6 +91,7 @@ if (isset($aPlace)) {
$aPlace = [];
}
+logEnd($oDB, $hLog, sizeof($aPlace)?1:0);
if (CONST_Debug) {
var_dump($aPlace);
diff --git a/website/robots.txt b/website/robots.txt
new file mode 100644
index 00000000..9624d97c
--- /dev/null
+++ b/website/robots.txt
@@ -0,0 +1,14 @@
+User-agent: ia_archiver
+Allow: /
+
+User-agent: *
+Disallow: /search.php
+Disallow: /search
+Disallow: /details.php
+Disallow: /details
+Disallow: /reverse.php
+Disallow: /reverse
+Disallow: /hierarchy
+Disallow: /hierarchy.php
+Disallow: /lookup
+Disallow: /lookup.php
diff --git a/website/taginfo.json b/website/taginfo.json
new file mode 100644
index 00000000..36180267
--- /dev/null
+++ b/website/taginfo.json
@@ -0,0 +1,86 @@
+{
+ "data_format": 1,
+ "data_url": "http://nominatim.openstreetmap.org/taginfo.json",
+ "project": {
+ "name": "Nominatim",
+ "description": "OSM search engine.",
+ "project_url": "http://nominatim.openstreetmap.org",
+ "doc_url": "http://wiki.osm.org/wiki/Nominatim",
+ "contact_name": "Sarah Hoffmann",
+ "contact_email": "lonvia@denofr.de"
+ },
+ "tags": [
+ { "key" : "ref", "description": "Searchable name of the place."},
+ { "key" : "int_ref", "description": "Searchable name of the place."},
+ { "key" : "nat_ref", "description": "Searchable name of the place."},
+ { "key" : "reg_ref", "description": "Searchable name of the place."},
+ { "key" : "loc_ref", "description": "Searchable name of the place."},
+ { "key" : "old_ref", "description": "Searchable name of the place."},
+ { "key" : "iata", "description": "Searchable name of the place."},
+ { "key" : "icao", "description": "Searchable name of the place."},
+ { "key" : "pcode:1", "description": "Searchable name of the place."},
+ { "key" : "pcode:2", "description": "Searchable name of the place."},
+ { "key" : "pcode:3", "description": "Searchable name of the place."},
+ { "key" : "name", "description": "Searchable name of the place."},
+ { "key" : "int_name", "description": "Searchable name of the place."},
+ { "key" : "nat_name", "description": "Searchable name of the place."},
+ { "key" : "reg_name", "description": "Searchable name of the place."},
+ { "key" : "loc_name", "description": "Searchable name of the place."},
+ { "key" : "old_name", "description": "Searchable name of the place."},
+ { "key" : "alt_name", "description": "Searchable name of the place."},
+ { "key" : "official_name", "description": "Searchable name of the place."},
+ { "key" : "place_name", "description": "Searchable name of the place."},
+ { "key" : "short_name", "description": "Searchable name of the place."},
+ { "key" : "addr:housename", "description": "Searchable name of the place."},
+ { "key" : "operator", "description": "Searchable name for amenities and shops." },
+ { "key" : "brand", "description": "Searchable name of POI places."},
+ { "key" : "bridge:name", "description" : "Searchable name for bridges."},
+ { "key" : "tunnel:name", "description" : "Searchable name for tunnels."},
+ { "key" : "emergency", "description": "POI in the search database." },
+ { "key" : "tourism", "description": "POI in the search database." },
+ { "key" : "historic", "description": "POI in the search database." },
+ { "key" : "military", "description": "POI in the search database." },
+ { "key" : "natural", "description": "POI in the search database." },
+ { "key" : "man_made", "description": "POI in the search database." },
+ { "key" : "mountain_pass", "description": "POI in the search database." },
+ { "key" : "highway", "description": "POI or street in the search database (not added are: 'no', 'turning_circle', 'traffic_signals', 'mini_roundabout', 'noexit', 'crossing')." },
+ { "key" : "aerialway", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "aeroway", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "amenity", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "boundary", "description": "Area in the search database (used to compute addresses of other places)." },
+ { "key" : "bridge", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "craft", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "leisure", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "office", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "railway", "description": "Geographic feature in the search database (unless value is 'no')." },
+ { "key" : "landuse", "description": "Geographic feature in the search database (unless value is 'no')." },
+ { "key" : "shop", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "tunnel", "description": "POI in the search database (unless value is 'no')." },
+ { "key" : "waterway", "description": "Geographic feature in the search database (unless value is 'riverbank')."},
+ { "key" : "place", "description": "Settlement in the search database (used to compute addresses of other places)." },
+ { "key" : "postal_code", "description": "Postcode in search database (used to compute postcodes of places around)." },
+ { "key" : "postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+ { "key" : "addr:postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+ { "key" : "tiger:zip_left", "description": "Postcode in search database (used to compute postcodes of places around)." },
+ { "key" : "tiger:zip_right", "description": "Postcode in search database (used to compute postcodes of places around)." },
+ { "key" : "addr:street", "description": "Used to determine the street of a house or POI. Note that a street with the same name must exist for the tag to be effective."},
+ { "key" : "addr:place", "description": "Used to determine the settlement of a house or POI with a street-less address. Note that a place with the same name must exist for the tag to be effective."},
+ { "key" : "country_code", "description": "Used to determine the country a place is in."},
+ { "key" : "ISO3166-1", "description": "Used to determine the country a place is in."},
+ { "key" : "is_in:country_code", "description": "Used to determine the country a place is in."},
+ { "key" : "addr:country", "description": "Used to determine the country a place is in."},
+ { "key" : "addr:country_code", "description": "Used to determine the country a place is in."},
+ { "key" : "addr:housenumber", "description": "House number of the place (no ranges)."},
+ { "key" : "addr:conscriptionnumber", "description": "House number of the place (Eastern European system)."},
+ { "key" : "addr:streetnumber", "description": "House number of the place (Eastern European system)."},
+ { "key" : "addr:interpolation", "description": "Way along which house numbers are interpolated."} ,
+ { "key" : "tiger:county", "description": "Used to determine the address in the US (needs a place with the same name and a county suffix)."},
+ { "key" : "is_in", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+ { "key" : "addr:suburb", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+ { "key" : "addr:city", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+ { "key" : "addr:state_code", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+ { "key" : "addr:state", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+ { "key" : "admin_level", "description": "Determines the hierarchy for administrative boundaries."},
+ { "key" : "wikipedia", "description": "Linking to the right wikipedia article helps to guess the importance of a place, which determines how far up in the search results it appears."}
+ ]
+}