]> git.openstreetmap.org Git - nominatim.git/commitdiff
Merge remote-tracking branch 'upstream/master'
authorSarah Hoffmann <lonvia@denofr.de>
Wed, 9 Mar 2016 22:28:33 +0000 (23:28 +0100)
committerSarah Hoffmann <lonvia@denofr.de>
Wed, 9 Mar 2016 22:28:33 +0000 (23:28 +0100)
29 files changed:
lib/Geocode.php
lib/init-website.php
lib/lib.php
lib/log.php
munin/nominatim_query_speed_querylog [new file with mode: 0755]
munin/nominatim_requests_querylog [new file with mode: 0755]
munin/nominatim_throttled_ips [new file with mode: 0755]
settings/settings.php
sql/functions.sql
sql/tables.sql
sql/tiger_import_finish.sql
tests/features/api/regression.feature
tests/features/api/search_params.feature
tests/features/api/search_simple.feature
utils/cron_banip.py [new file with mode: 0755]
utils/cron_ipanalyse.py [new file with mode: 0755]
utils/cron_logrotate.sh [new file with mode: 0755]
utils/cron_vacuum.sh [new file with mode: 0755]
utils/setup.php
utils/update.php
website/403.html [new file with mode: 0644]
website/509.html [new file with mode: 0644]
website/crossdomain.xml [new file with mode: 0644]
website/favicon.ico [new file with mode: 0644]
website/last_update.php [new file with mode: 0644]
website/nominatim.xml [new file with mode: 0644]
website/reverse.php
website/robots.txt [new file with mode: 0644]
website/taginfo.json [new file with mode: 0644]

index 1dbde919d4fda8975763af3e79d300206c6737a0..48055ec6780cbfe708a843e33070fc7ed869c22d 100644 (file)
@@ -20,7 +20,7 @@
 
                protected $aExcludePlaceIDs = array();
                protected $bDeDupe = true;
-               protected $bReverseInPlan = false;
+               protected $bReverseInPlan = true;
 
                protected $iLimit = 20;
                protected $iFinalLimit = 10;
                                $sSQL .= "and 30 between $this->iMinAddressRank and $this->iMaxAddressRank ";
                                $sSQL .= "group by place_id";
                                if (!$this->bDeDupe) $sSQL .= ",place_id ";
+                               /*
                                $sSQL .= " union ";
                                $sSQL .= "select 'L' as osm_type,place_id as osm_id,'place' as class,'house' as type,null as admin_level,30 as rank_search,30 as rank_address,min(place_id) as place_id, min(parent_place_id) as parent_place_id,'us' as country_code,";
                                $sSQL .= "get_address_by_language(place_id, $sLanguagePrefArraySQL) as langaddress,";
                                $sSQL .= "group by place_id";
                                if (!$this->bDeDupe) $sSQL .= ",place_id";
                                $sSQL .= ",get_address_by_language(place_id, $sLanguagePrefArraySQL) ";
+                               */
                        }
 
                        $sSQL .= " order by importance desc";
                                                        // TODO: filter out the pointless search terms (2 letter name tokens and less)
                                                        // they might be right - but they are just too darned expensive to run
                                                        if (sizeof($aSearch['aName'])) $aTerms[] = "name_vector @> ARRAY[".join($aSearch['aName'],",")."]";
-                                                       if (sizeof($aSearch['aNameNonSearch'])) $aTerms[] = "array_cat(name_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aNameNonSearch'],",")."]";
+                                                       //if (sizeof($aSearch['aNameNonSearch'])) $aTerms[] = "array_cat(name_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aNameNonSearch'],",")."]";
                                                        if (sizeof($aSearch['aAddress']) && $aSearch['aName'] != $aSearch['aAddress'])
                                                        {
                                                                // For infrequent name terms disable index usage for address
                                                                                sizeof($aSearch['aName']) == 1 &&
                                                                                $aWordFrequencyScores[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold)
                                                                {
-                                                                       $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join(array_merge($aSearch['aAddress'],$aSearch['aAddressNonSearch']),",")."]";
+                                                                       //$aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join(array_merge($aSearch['aAddress'],$aSearch['aAddressNonSearch']),",")."]";
+                                                                       $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddress'],",")."]";
                                                                }
                                                                else
                                                                {
                                                                        $aTerms[] = "nameaddress_vector @> ARRAY[".join($aSearch['aAddress'],",")."]";
-                                                                       if (sizeof($aSearch['aAddressNonSearch'])) $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddressNonSearch'],",")."]";
+                                                                       //if (sizeof($aSearch['aAddressNonSearch'])) $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddressNonSearch'],",")."]";
                                                                }
                                                        }
                                                        if ($aSearch['sCountryCode']) $aTerms[] = "country_code = '".pg_escape_string($aSearch['sCountryCode'])."'";
                                                                $aPlaceIDs = $this->oDB->getCol($sSQL);
 
                                                                // If not try the aux fallback table
+                                                               /*
                                                                if (!sizeof($aPlaceIDs))
                                                                {
                                                                        $sSQL = "select place_id from location_property_aux where parent_place_id in (".$sPlaceIDs.") and housenumber = '".pg_escape_string($aSearch['sHouseNumber'])."'";
                                                                        if (CONST_Debug) var_dump($sSQL);
                                                                        $aPlaceIDs = $this->oDB->getCol($sSQL);
                                                                }
+                                                               */
 
                                                                if (!sizeof($aPlaceIDs))
                                                                {
index a34e4c82787ae1dee4f77d5431a66376aeb077dc..03e269656a0516fcad57db2bd687413c4b25b113 100644 (file)
@@ -5,6 +5,7 @@
        {
                header("Access-Control-Allow-Origin: *");
                header("Access-Control-Allow-Methods: OPTIONS,GET");
+               header("Access-Control-Max-Age: 8640000");
                if (!empty($_SERVER['HTTP_ACCESS_CONTROL_REQUEST_HEADERS']))
                {
                        header("Access-Control-Allow-Headers: ".$_SERVER['HTTP_ACCESS_CONTROL_REQUEST_HEADERS']);
        }
        if ($_SERVER['REQUEST_METHOD'] == 'OPTIONS') exit;
 
-       if (CONST_ClosedForIndexing && strpos(CONST_ClosedForIndexingExceptionIPs, ','.$_SERVER["REMOTE_ADDR"].',') === false)
-       {
-               echo "Closed for re-indexing...";
-               exit;
-       }
-
-       $aBucketKeys = array();
-
-       if (isset($_SERVER["HTTP_REFERER"])) $aBucketKeys[] = str_replace('www.','',strtolower(parse_url($_SERVER["HTTP_REFERER"], PHP_URL_HOST)));
-       if (isset($_SERVER["REMOTE_ADDR"])) $aBucketKeys[] = $_SERVER["REMOTE_ADDR"];
-       if (isset($_GET["email"])) $aBucketKeys[] = $_GET["email"];
-
-       $fBucketVal = doBucket($aBucketKeys, 
-                       (defined('CONST_ConnectionBucket_PageType')?constant('CONST_ConnectionBucket_Cost_'.CONST_ConnectionBucket_PageType):1) + user_busy_cost(),
-                       CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-
-       if ($fBucketVal > CONST_ConnectionBucket_WaitLimit && $fBucketVal < CONST_ConnectionBucket_BlockLimit)
-       {
-               $m = getBucketMemcache();
-               $iCurrentSleeping = $m->increment('sleepCounter');
-               if (false === $iCurrentSleeping)
-               {
-                       $m->add('sleepCounter', 0);
-                       $iCurrentSleeping = $m->increment('sleepCounter');
-               }
-               if ($iCurrentSleeping >= CONST_ConnectionBucket_MaxSleeping || isBucketSleeping($aBucketKeys))
-               {
-                       // Too many threads sleeping already.  This becomes a hard block.
-                       $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_BlockLimit, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-               }
-               else
-               {
-                       setBucketSleeping($aBucketKeys, true);
-                       sleep(($fBucketVal - CONST_ConnectionBucket_WaitLimit)/CONST_ConnectionBucket_LeakRate);
-                       $fBucketVal = doBucket($aBucketKeys, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_LeakRate, CONST_ConnectionBucket_BlockLimit);
-                       setBucketSleeping($aBucketKeys, false);
-               }
-               $m->decrement('sleepCounter');
-       }
-
-       if (strpos(CONST_BlockedIPs, ','.$_SERVER["REMOTE_ADDR"].',') !== false || $fBucketVal >= CONST_ConnectionBucket_BlockLimit)
-       {
-               header("HTTP/1.0 429 Too Many Requests");
-               echo "Your IP has been blocked. \n";
-               echo CONST_BlockMessage;
-               exit;
-       }
-
        header('Content-type: text/html; charset=utf-8');
-
index 51bf7ebb13381e32507535ca394f7aad46545685..e657daf4207c7bd433e85425990b0ccf5cc20ffe 100644 (file)
        {
                $aResult = array(array(join(' ',$aWords)));
                $sFirstToken = '';
-               if ($iDepth < 8) {
+               if ($iDepth < 7) {
                        while(sizeof($aWords) > 1)
                        {
                                $sWord = array_shift($aWords);
                {
                        preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER);
                }
-               elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch))
+/*             elseif (preg_match('#MULTIPOLYGON\\(\\(\\(([- 0-9.,]+)#', $geometry_as_text, $aMatch))
                {
                        preg_match_all('/(-?[0-9.]+) (-?[0-9.]+)/', $aMatch[1], $aPolyPoints, PREG_SET_ORDER);
-               }
+        }*/
                elseif (preg_match('#POINT\\((-?[0-9.]+) (-?[0-9.]+)\\)#', $geometry_as_text, $aMatch))
                {
                        $aPolyPoints = createPointsAroundCenter($aMatch[1], $aMatch[2], $fRadius);
index 37d83c4771fdb410a46290de8cb60cc8eedb1346..de19167bbb864294e07d8c0418ab51abe9fd7bf3 100644 (file)
@@ -2,68 +2,59 @@
 
        function logStart(&$oDB, $sType = '', $sQuery = '', $aLanguageList = array())
        {
-               $aStartTime = explode('.',microtime(true));
+               $fStartTime = microtime(true);
+               $aStartTime = explode('.', $fStartTime);
                if (!isset($aStartTime[1])) $aStartTime[1] = '0';
 
                $sOutputFormat = '';
                if (isset($_GET['format'])) $sOutputFormat = $_GET['format'];
 
+               if ($sType == 'reverse')
+               {
+                       $sOutQuery = (isset($_GET['lat'])?$_GET['lat']:'').'/';
+                       if (isset($_GET['lon'])) $sOutQuery .= $_GET['lon'];
+                       if (isset($_GET['zoom'])) $sOutQuery .= '/'.$_GET['zoom'];
+               }
+               else
+                       $sOutQuery = $sQuery;
+
                $hLog = array(
                                date('Y-m-d H:i:s',$aStartTime[0]).'.'.$aStartTime[1],
                                $_SERVER["REMOTE_ADDR"],
                                $_SERVER['QUERY_STRING'],
-                               $sQuery
+                               $sOutQuery,
+                               $sType,
+                               $fStartTime
                                );
 
                if (CONST_Log_DB)
                {
-                       // Log
-                       if ($sType == 'search')
-                       {
-                               $oDB->query('insert into query_log values ('.getDBQuoted($hLog[0]).','.getDBQuoted($hLog[3]).','.getDBQuoted($hLog[1]).')');
-                       }
-
-                       $sSQL = 'insert into new_query_log (type,starttime,query,ipaddress,useragent,language,format)';
+                       if (isset($_GET['email']))
+                               $sUserAgent = $_GET['email'];
+                       elseif (isset($_SERVER['HTTP_REFERER']))
+                               $sUserAgent = $_SERVER['HTTP_REFERER'];
+                       elseif (isset($_SERVER['HTTP_USER_AGENT']))
+                               $sUserAgent = $_SERVER['HTTP_USER_AGENT'];
+                       else
+                               $sUserAgent = '';
+                       $sSQL = 'insert into new_query_log (type,starttime,query,ipaddress,useragent,language,format,searchterm)';
                        $sSQL .= ' values ('.getDBQuoted($sType).','.getDBQuoted($hLog[0]).','.getDBQuoted($hLog[2]);
-                       $sSQL .= ','.getDBQuoted($hLog[1]).','.getDBQuoted($_SERVER['HTTP_USER_AGENT']).','.getDBQuoted(join(',',$aLanguageList)).','.getDBQuoted($sOutputFormat).')';
+                       $sSQL .= ','.getDBQuoted($hLog[1]).','.getDBQuoted($sUserAgent).','.getDBQuoted(join(',',$aLanguageList)).','.getDBQuoted($sOutputFormat).','.getDBQuoted($hLog[3]).')';
                        $oDB->query($sSQL);
                }
 
-               if (CONST_Log_File && CONST_Log_File_ReverseLog != '')
-               {
-                       if ($sType == 'reverse')
-                       {
-                               $aStartTime = explode('.',$hLog[0]);
-                               file_put_contents(CONST_Log_File_ReverseLog,
-                                                               $aStartTime[0].','.$aStartTime[1].','.
-                                                               php_uname('n').','.
-                                                               '"'.addslashes(isset($_SERVER['HTTP_REFERER'])?$_SERVER['HTTP_REFERER']:'').'",'.
-                                                               '"'.addslashes($hLog[1]).'",'.
-                                                               $_GET['lat'].','.
-                                                               $_GET['lon'].','.
-                                                               $_GET['zoom'].','.
-                                                               '"'.addslashes($_SERVER['HTTP_USER_AGENT']).'",'.
-                                                               '"'.addslashes($sOutputFormat).'"'."\n",
-                                                               FILE_APPEND);
-                       }
-               }
-
                return $hLog;
        }
 
        function logEnd(&$oDB, $hLog, $iNumResults)
        {
-               $aEndTime = explode('.',microtime(true));
-               if (!$aEndTime[1]) $aEndTime[1] = '0';
-               $sEndTime = date('Y-m-d H:i:s',$aEndTime[0]).'.'.$aEndTime[1];
+               $fEndTime = microtime(true);
 
                if (CONST_Log_DB)
                {
-                       $sSQL = 'update query_log set endtime = '.getDBQuoted($sEndTime).', results = '.$iNumResults;
-                       $sSQL .= ' where starttime = '.getDBQuoted($hLog[0]);
-                       $sSQL .= ' and ipaddress = '.getDBQuoted($hLog[1]);
-                       $sSQL .= ' and query = '.getDBQuoted($hLog[3]);
-                       $oDB->query($sSQL);
+                       $aEndTime = explode('.', $fEndTime);
+                       if (!$aEndTime[1]) $aEndTime[1] = '0';
+                       $sEndTime = date('Y-m-d H:i:s',$aEndTime[0]).'.'.$aEndTime[1];
 
                        $sSQL = 'update new_query_log set endtime = '.getDBQuoted($sEndTime).', results = '.$iNumResults;
                        $sSQL .= ' where starttime = '.getDBQuoted($hLog[0]);
                        $oDB->query($sSQL);
                }
 
-               if (CONST_Log_File && CONST_Log_File_SearchLog != '')
+               if (CONST_Log_File)
                {
-                       $aStartTime = explode('.',$hLog[0]);
-                       file_put_contents(CONST_Log_File_SearchLog,
-                                       $aStartTime[0].','.$aStartTime[1].','.
-                                       php_uname('n').','.
-                                       '"'.addslashes(isset($_SERVER['HTTP_REFERER'])?$_SERVER['HTTP_REFERER']:'').'",'.
-                                       '"'.addslashes($hLog[1]).'",'.
-                                       '"'.addslashes($hLog[3]).'",'.
-                                       '"'.addslashes($_SERVER['HTTP_USER_AGENT']).'",'.
-                                       '"'.addslashes((isset($_GET['format']))?$_GET['format']:'').'",'.
-                                       $iNumResults."\n",
-                                       FILE_APPEND);
+                       $aOutdata = sprintf("[%s] %.4f %d %s \"%s\"\n",
+                                           $hLog[0], $fEndTime-$hLog[5], $iNumResults,
+                                           $hLog[4], $hLog[2]);
+                       file_put_contents(CONST_Log_File, $aOutdata, FILE_APPEND | LOCK_EX);
                }
 
        }
diff --git a/munin/nominatim_query_speed_querylog b/munin/nominatim_query_speed_querylog
new file mode 100755 (executable)
index 0000000..f35793f
--- /dev/null
@@ -0,0 +1,163 @@
+#!/usr/bin/python3
+#
+# Plugin to monitor the types of requests made to the API
+#
+# Uses the query log.
+#
+# Parameters:
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+
+CONFIG="""graph_title Total Nominatim response time
+graph_vlabel Time to response
+graph_category Nominatim 
+graph_period minute
+graph_args --base 1000
+
+avgs.label Average search time
+avgs.draw LINE
+avgs.type GAUGE
+avgs.min 0
+avgs.info Moving 5 minute average time to perform search
+
+avgr.label Average reverse time
+avgr.draw LINE
+avgr.type GAUGE
+avgr.min 0
+avgr.info Moving 5 minute average time to perform reverse
+
+max.label Slowest time to response (1/100)
+max.draw LINE
+max.type GAUGE
+max.min 0
+max.info Slowest query in last 5 minutes (unit: 100s)"""
+
+ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P<dur>[0-9.]+) (?P<numres>\d+) (?P<type>[a-z]+) ')
+TIME_REGEX = re.compile(r'\[(?P<t_year>\d\d\d\d)-(?P<t_month>\d\d)-(?P<t_day>\d\d) (?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d)[0-9.]*\] ')
+
+
+class LogFile:
+    """ A query log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename, encoding='utf-8', errors='replace')
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        e = TIME_REGEX.match(l)
+        if e is None:
+            return None
+        e = e.groupdict()
+        return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+    def seek_to_date(self, target):
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+
+        while True:
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False;
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break;
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        for l in self.fd:
+            e = ENTRY_REGEX.match(l)
+            if e is not None:
+                yield e.groupdict()
+
+
+if __name__ == '__main__':
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'config':
+        print(CONFIG)
+        sys.exit(0)
+
+    sumrev = 0
+    numrev = 0
+    sumsearch = 0
+    numsearch = 0
+    maxres = 0
+    if 'NOMINATIM_QUERYLOG' in os.environ:
+        lf = LogFile(os.environ['NOMINATIM_QUERYLOG'])
+        if lf.seek_to_date(datetime.now() - timedelta(minutes=5)):
+            for l in lf.loglines():
+                dur = float(l['dur'])
+                if l['type'] == 'reverse':
+                    numrev += 1
+                    sumrev += dur
+                elif  l['type'] == 'search':
+                    numsearch += 1
+                    sumsearch += dur
+                if dur > maxres:
+                    maxres = dur
+
+
+    print('avgs.value', 0 if numsearch == 0 else sumsearch/numsearch)
+    print('avgr.value', 0 if numrev == 0 else sumrev/numrev)
+    print('max.value', maxres/100.0)
diff --git a/munin/nominatim_requests_querylog b/munin/nominatim_requests_querylog
new file mode 100755 (executable)
index 0000000..aa41a4d
--- /dev/null
@@ -0,0 +1,156 @@
+#!/usr/bin/python3
+#
+# Plugin to monitor the types of requests made to the API
+#
+# Uses the query log.
+#
+# Parameters: 
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+
+import re
+import os
+import sys
+from datetime import datetime, timedelta
+
+CONFIG="""graph_title Requests by API call
+graph_args --base 1000 -l 0
+graph_vlabel requests per minute
+graph_category nominatim
+z1.label reverse
+z1.draw AREA
+z1.type GAUGE
+z2.label search (successful)
+z2.draw STACK
+z2.type GAUGE
+z3.label search (no result)
+z3.draw STACK
+z3.type GAUGE
+z4.label details
+z4.draw STACK
+z4.type GAUGE"""
+
+ENTRY_REGEX = re.compile(r'\[[^]]+\] (?P<dur>[0-9.]+) (?P<numres>\d+) (?P<type>[a-z]+) ')
+TIME_REGEX = re.compile(r'\[(?P<t_year>\d\d\d\d)-(?P<t_month>\d\d)-(?P<t_day>\d\d) (?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d)[0-9.]*\] ')
+
+
+class LogFile:
+    """ A query log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename, encoding='utf-8', errors='replace')
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        e = TIME_REGEX.match(l)
+        if e is None:
+            return None
+        e = e.groupdict()
+        return datetime(int(e['t_year']), int(e['t_month']), int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+    def seek_to_date(self, target):
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+
+        while True:
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False;
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break;
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        for l in self.fd:
+            e = ENTRY_REGEX.match(l)
+            if e is not None:
+                yield e.groupdict()
+
+
+if __name__ == '__main__':
+
+    if len(sys.argv) > 1 and sys.argv[1] == 'config':
+        print(CONFIG)
+        sys.exit(0)
+
+    reverse = 0
+    searchy = 0
+    searchn = 0
+    details = 0
+    if 'NOMINATIM_QUERYLOG' in os.environ:
+        lf = LogFile(os.environ['NOMINATIM_QUERYLOG'])
+        if lf.seek_to_date(datetime.now() - timedelta(minutes=5)):
+            for l in lf.loglines():
+                if l['type'] == 'reverse':
+                    reverse += 1
+                elif  l['type'] == 'search':
+                    if l['numres'] == '0':
+                        searchn += 1
+                    else:
+                        searchy += 1
+                else:
+                    details += 1
+
+
+    print('z1.value', reverse/5)
+    print('z2.value', searchy/5)
+    print('z3.value', searchn/5)
+    print('z4.value', details/5)
diff --git a/munin/nominatim_throttled_ips b/munin/nominatim_throttled_ips
new file mode 100755 (executable)
index 0000000..cdfc88d
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/sh
+#
+# Plugin to monitor the number of IPs in special pools
+#
+# Parameters: 
+#
+#       config   (required)
+#       autoconf (optional - used by munin-config)
+#
+if [ "$1" = "config" ]; then
+        echo 'graph_title Restricted IPs' 
+        echo 'graph_args -l 0'
+        echo 'graph_vlabel number of IPs'
+        echo 'graph_category nominatim'
+        echo 'bulk.label bulk'
+        echo 'bulk.draw AREA'
+        echo 'bulk.type GAUGE'
+        echo 'block.label blocked'
+        echo 'block.draw STACK'
+        echo 'block.type GAUGE'
+        exit 0
+fi
+BASEDIR="$(dirname "$(readlink -f "$0")")"
+
+cut -f 2 -d ' ' $BASEDIR/../settings/ip_blocks.map | sort | uniq -c | sed 's:[[:space:]]*\([0-9]\+\) \(.*\):\2.value \1:'
index 04ffe1621caa785362c4a6d55aa69875029b9b43..765849e74a7052ad7fa6e22d41b067df6d0d3e6e 100644 (file)
 
        // Website settings
        @define('CONST_NoAccessControl', true);
-       @define('CONST_ClosedForIndexing', false);
-       @define('CONST_ClosedForIndexingExceptionIPs', '');
        @define('CONST_BlockedIPs', '');
+       @define('CONST_IPBanFile', CONST_BasePath.'/settings/ip_blocks');
+       @define('CONST_WhitelistedIPs', '');
+       @define('CONST_BlockedUserAgents', '');
+       @define('CONST_BlockReverseMaxLoad', 15);
        @define('CONST_BulkUserIPs', '');
        @define('CONST_BlockMessage', ''); // additional info to show for blocked IPs
 
-       @define('CONST_Website_BaseURL', 'http://'.php_uname('n').'/');
+       @define('CONST_Website_BaseURL', 'http://nominatim.openstreetmap.org/');
        @define('CONST_Tile_Default', 'Mapnik');
 
        @define('CONST_Default_Language', false);
index bd64697ad0801d53d26a641e72ef4040e4f84ac5..18e8bdbc5278d10226cc9c97a209035f2c3a5a37 100644 (file)
@@ -2055,6 +2055,12 @@ BEGIN
   END IF;
 
 
+  -- refuse to update multipolygons with too many objects, too much of a performance hit
+  IF ST_NumGeometries(NEW.geometry) > 2000 THEN
+    RAISE WARNING 'Dropping update of % % because of geometry complexity.', NEW.osm_type, NEW.osm_id;
+    RETURN NULL;
+  END IF;
+
   IF coalesce(existing.name::text, '') != coalesce(NEW.name::text, '')
      OR coalesce(existing.extratags::text, '') != coalesce(NEW.extratags::text, '')
      OR coalesce(existing.housenumber, '') != coalesce(NEW.housenumber, '')
index d13eee6ec0aa26c7009ec585146b7af8dc3e6cce..ccca8f138f044485314bdeec244da80f695d3d29 100644 (file)
@@ -23,19 +23,6 @@ CREATE TABLE import_npi_log (
   event text
   );
 
---drop table IF EXISTS query_log;
-CREATE TABLE query_log (
-  starttime timestamp,
-  query text,
-  ipaddress text,
-  endtime timestamp,
-  results integer
-  );
-CREATE INDEX idx_query_log ON query_log USING BTREE (starttime);
-GRANT SELECT ON query_log TO "{www-user}" ;
-GRANT INSERT ON query_log TO "{www-user}" ;
-GRANT UPDATE ON query_log TO "{www-user}" ;
-
 CREATE TABLE new_query_log (
   type text,
   starttime timestamp,
@@ -43,6 +30,7 @@ CREATE TABLE new_query_log (
   useragent text,
   language text,
   query text,
+  searchterm text,
   endtime timestamp,
   results integer,
   format text,
@@ -56,9 +44,6 @@ GRANT SELECT ON new_query_log TO "{www-user}" ;
 GRANT SELECT ON TABLE country_name TO "{www-user}";
 GRANT SELECT ON TABLE gb_postcode TO "{www-user}";
 
-create view vw_search_query_log as SELECT substr(query, 1, 50) AS query, starttime, endtime - starttime AS duration, substr(useragent, 1, 20) as 
-useragent, language, results, ipaddress FROM new_query_log WHERE type = 'search' ORDER BY starttime DESC;
-
 drop table IF EXISTS word;
 CREATE TABLE word (
   word_id INTEGER,
index 4718d50263df84c55d646da70129f3bbce448dc1..a7d837f4c02a30927b93c77e63f3592c1b3e24c5 100644 (file)
@@ -3,10 +3,10 @@ CREATE UNIQUE INDEX idx_location_property_tiger_place_id_imp ON location_propert
 
 GRANT SELECT ON location_property_tiger_import TO "{www-user}";
 
-DROP TABLE IF EXISTS location_property_tiger;
-ALTER TABLE location_property_tiger_import RENAME TO location_property_tiger;
+--DROP TABLE IF EXISTS location_property_tiger;
+--ALTER TABLE location_property_tiger_import RENAME TO location_property_tiger;
 
-ALTER INDEX idx_location_property_tiger_housenumber_parent_place_id_imp RENAME TO idx_location_property_tiger_housenumber_parent_place_id;
-ALTER INDEX idx_location_property_tiger_place_id_imp RENAME TO idx_location_property_tiger_place_id;
+--ALTER INDEX idx_location_property_tiger_housenumber_parent_place_id_imp RENAME TO idx_location_property_tiger_housenumber_parent_place_id;
+--ALTER INDEX idx_location_property_tiger_place_id_imp RENAME TO idx_location_property_tiger_place_id;
 
 DROP FUNCTION tigger_create_interpolation (linegeo geometry, in_startnumber integer, in_endnumber integer, interpolationtype text, in_street text, in_isin text, in_postcode text);
index 8a436fc3d41dac571fc4626557e737170e6b0d57..bbff268a566f9476c6442545007164c21cb1b384 100644 (file)
@@ -187,7 +187,7 @@ Feature: API regression tests
      Scenario: trac #5238
         Given the request parameters
          | bounded | viewbox
-         | 1       | 0,0,-1,-1
+         | 1       | 0,0,1,-1
         When sending json search query "sy"
         Then exactly 0 results are returned
 
index 150d6bdcf45479d1255eeca63afc262347de31e3..34e43328d50d1c1a68a8a6ad0ecd36cf7e238497 100644 (file)
@@ -85,7 +85,7 @@ Feature: Search queries
     Scenario: bounded search remains within viewbox, even with no results
         Given the request parameters
          | bounded | viewbox
-         | 1       | 43.54285,-5.662003,43.5403125,-5.6563282
+         | 1       | 43.5403125,-5.6563282,43.54285,-5.662003
          When sending json search query "restaurant"
         Then less than 1 result is returned
 
index 2cb27b7cf61be0c3237f23220d804bf876374d18..43f460980da073be5f479963a47b2c9cc829b9a1 100644 (file)
@@ -108,35 +108,35 @@ Feature: Simple Tests
     Scenario: Empty XML search with viewbox
         Given the request parameters
           | viewbox
-          | 12,45.13,77,33
+          | 12,45.13,13,44
         When sending xml search query "xnznxvcx"
         Then result header contains
           | attr        | value
           | querystring | xnznxvcx
           | polygon     | false
-          | viewbox     | 12,45.13,77,33
+          | viewbox     | 12,45.13,13,44
 
     Scenario: Empty XML search with viewboxlbrt
         Given the request parameters
           | viewboxlbrt
-          | 12,34.13,77,45
+          | 12,34.13,13,35
         When sending xml search query "xnznxvcx"
         Then result header contains
           | attr        | value
           | querystring | xnznxvcx
           | polygon     | false
-          | viewbox     | 12,45.13,77,33
+          | viewbox     | 12,34.13,13,35
 
     Scenario: Empty XML search with viewboxlbrt and viewbox
         Given the request parameters
-          | viewbox        | viewboxblrt
-          | 12,45.13,77,33 | 1,2,3,4
+          | viewbox          | viewboxblrt
+          | 12,45.13,13.5,44 | 1,0,2,1
         When sending xml search query "pub"
         Then result header contains
           | attr        | value
           | querystring | pub
           | polygon     | false
-          | viewbox     | 12,45.13,77,33
+          | viewbox     | 12,45.13,13.5,44
 
 
     Scenario Outline: Empty XML search with polygon values
diff --git a/utils/cron_banip.py b/utils/cron_banip.py
new file mode 100755 (executable)
index 0000000..53f5e5f
--- /dev/null
@@ -0,0 +1,243 @@
+#!/usr/bin/python
+#
+# Search logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+# The list can then be used in apache using rewrite rules to
+# direct bulk users to smaller thread pools or block them. A
+# typical apache config that uses php-fpm pools would look
+# like this:
+#
+#    Alias /nominatim-www/ "/var/www/nominatim/"
+#    Alias /nominatim-bulk/ "/var/www/nominatim/"
+#    <Directory "/var/www/nominatim/">
+#        Options MultiViews FollowSymLinks
+#        AddType text/html   .php
+#    </Directory>
+#
+#    <Location /nominatim-www>
+#        AddHandler fcgi:/var/run/php5-fpm-www.sock .php
+#    </Location>
+#    <Location /nominatim-bulk>
+#        AddHandler fcgi:/var/run/php5-fpm-bulk.sock .php
+#    </Location>
+#
+#    Redirect 509 /nominatim-block/
+#    ErrorDocument 509 "Bandwidth limit exceeded."
+#    Redirect 403 /nominatim-ban/
+#    ErrorDocument 403 "Access blocked."
+#
+#    RewriteEngine On
+#    RewriteMap bulklist txt:/home/wherever/ip-block.map
+#    RewriteRule ^/(.*) /nominatim-${bulklist:%{REMOTE_ADDR}|www}/$1 [PT]
+#
+
+import os
+import psycopg2
+import datetime
+
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+#
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+# user-agents that should be blocked from bulk mode
+# (matched with startswith)
+UA_BLOCKLIST = ()
+
+# time before an automatically blocked IP is allowed back
+BLOCKCOOLOFF_PERIOD='1 hour'
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_PERIOD='15 min'
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOWER=4000
+BLOCK_LOADFAC=380
+BULK_LOADFAC=160
+BULK_LOWER=1500
+MAX_BULK_IPS=85
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+    execfile(os.path.expanduser(BASEDIR + "/settings/ip_blocks.conf"))
+except IOError:
+    pass
+
+# read the previous blocklist
+WHITELIST = set(WHITELIST.split()) if WHITELIST else set()
+prevblocks = []
+prevbulks = []
+BLACKLIST = set(BLACKLIST.split()) if BLACKLIST else set()
+newblocks = set()
+newbulks = set()
+
+try:
+    fd = open(BLOCKEDFILE)
+    for line in fd:
+        ip, typ = line.strip().split(' ')
+        if ip not in BLACKLIST:
+            if typ == 'block':
+                prevblocks.append(ip)
+            elif typ == 'bulk':
+                prevbulks.append(ip)
+    fd.close()
+except IOError:
+    pass #ignore non-existing file
+
+# determine current load
+fd = open("/proc/loadavg")
+avgload = int(float(fd.readline().split()[2]))
+fd.close()
+# DB load
+conn = psycopg2.connect('dbname=nominatim')
+cur = conn.cursor()
+cur.execute("select count(*)/60 from new_query_log where starttime > now() - interval '1min'")
+dbload = int(cur.fetchone()[0])
+
+BLOCK_LIMIT = max(BLOCK_LOWER, BLOCK_UPPER - BLOCK_LOADFAC * (dbload - 75))
+BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * (avgload - 14))
+if len(prevbulks) > MAX_BULK_IPS:
+    BLOCK_LIMIT = max(3600, BLOCK_LOWER - (len(prevbulks) - MAX_BULK_IPS)*10)
+# if the bulk pool is still empty, clients will be faster, avoid having
+# them blocked in this case
+if len(prevbulks) < 10:
+    BLOCK_LIMIT = 2*BLOCK_UPPER
+
+
+# get the new block candidates
+cur.execute("""
+  SELECT ipaddress, max(count), max(ua) FROM
+   ((SELECT * FROM
+     (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log
+      WHERE starttime > now() - interval '1 hour' GROUP BY ipaddress) as i
+   WHERE count > %s)
+   UNION
+   (SELECT ipaddress, count * 3, ua FROM
+     (SELECT ipaddress, sum(case when endtime is null then 1 else 1+1.5*date_part('epoch',endtime-starttime) end) as count, substring(max(useragent) from 1 for 30) as ua FROM new_query_log 
+      WHERE starttime > now() - interval '10 min' GROUP BY ipaddress) as i
+   WHERE count > %s)) as o
+  GROUP BY ipaddress
+""", (BULKLONG_LIMIT, BULKSHORT_LIMIT))
+
+bulkips = {}
+emergencyblocks = []
+useragentblocks = []
+
+for c in cur:
+    if c[0] not in WHITELIST and c[0] not in BLACKLIST:
+        # check for user agents that receive an immediate block
+        missing_agent = not c[2]
+        if not missing_agent:
+            for ua in UA_BLOCKLIST:
+                if c[2].startswith(ua):
+                    missing_agent = True
+                    break
+        if (missing_agent or c[1] > BLOCK_UPPER) and c[0] not in prevblocks:
+            newblocks.add(c[0])
+            if missing_agent:
+                useragentblocks.append(c[0])
+            else:
+                emergencyblocks.append(c[0])
+        else:
+            bulkips[c[0]] = c[1]
+
+# IPs from the block list that are no longer in the bulk list
+deblockcandidates = set()
+# IPs from the bulk list that are no longer bulk candidates
+debulkcandidates = set()
+# new IPs to go into the block list
+newlyblocked = []
+
+
+for ip in prevblocks:
+    if ip in bulkips:
+        newblocks.add(ip)
+        del bulkips[ip]
+    else:
+        deblockcandidates.add(ip)    
+        
+for ip in prevbulks:
+    if ip not in newblocks:
+        if ip in bulkips:
+            if bulkips[ip] > BLOCK_LIMIT:
+                newblocks.add(ip)
+                newlyblocked.append(ip)
+            else:
+                newbulks.add(ip)
+            del bulkips[ip]
+        else:
+            debulkcandidates.add(ip)
+
+# cross-check deblock candidates
+if deblockcandidates:
+    cur.execute("""
+        SELECT DISTINCT ipaddress FROM new_query_log
+        WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+        """ % ("','".join(deblockcandidates), BLOCKCOOLOFF_PERIOD))
+
+    for c in cur:
+        newblocks.add(c[0])
+        deblockcandidates.remove(c[0])
+# deblocked IPs go back to the bulk pool to catch the ones that simply
+# ignored the HTTP error and just continue to hammer the API.
+# Those that behave and stopped will be debulked a minute later.
+for ip in deblockcandidates:
+    newbulks.add(ip)
+
+# cross-check debulk candidates
+if debulkcandidates:
+    cur.execute("""
+        SELECT DISTINCT ipaddress FROM new_query_log
+        WHERE ipaddress IN ('%s') AND starttime > now() - interval '%s'
+        AND starttime > date_trunc('day', now())
+        """ % ("','".join(debulkcandidates), BULKCOOLOFF_PERIOD))
+
+    for c in cur:
+        newbulks.add(c[0])
+        debulkcandidates.remove(c[0])
+
+for ip in bulkips.iterkeys():
+    newbulks.add(ip)
+
+# write out the new list
+fd = open(BLOCKEDFILE, 'w')
+for ip in newblocks:
+    fd.write(ip + " block\n")
+for ip in newbulks:
+    fd.write(ip + " bulk\n")
+for ip in BLACKLIST:
+    fd.write(ip + " ban\n")
+fd.close()
+
+# write out the log
+logstr = datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+fd = open(LOGFILE, 'a')
+if deblockcandidates:
+    fd.write(logstr % ('unblocked:', ', '.join(deblockcandidates)))
+if debulkcandidates:
+    fd.write(logstr % (' debulked:', ', '.join(debulkcandidates)))
+if bulkips:
+    fd.write(logstr % ('new bulks:', ', '.join(bulkips.keys())))
+if emergencyblocks:
+    fd.write(logstr % ('dir.block:', ', '.join(emergencyblocks)))
+if useragentblocks:
+    fd.write(logstr % (' ua block:', ', '.join(useragentblocks)))
+if newlyblocked:
+    fd.write(logstr % ('new block:', ', '.join(newlyblocked)))
+fd.close()
diff --git a/utils/cron_ipanalyse.py b/utils/cron_ipanalyse.py
new file mode 100755 (executable)
index 0000000..05b0b7f
--- /dev/null
@@ -0,0 +1,375 @@
+#!/usr/bin/python3
+#
+# Search apache logs for high-bandwidth users and create a list of suspicious IPs.
+# There are three states: bulk, block, ban. The first are bulk requesters
+# that need throttling, the second bulk requesters that have overdone it
+# and the last manually banned IPs.
+#
+
+import re
+import os
+import sys
+import subprocess
+from datetime import datetime, timedelta
+from collections import defaultdict
+
+#
+# DEFAULT SETTINGS
+#
+# Copy into settings/ip_blocks.conf and adapt as required.
+#
+BASEDIR = os.path.normpath(os.path.join(os.path.realpath(__file__), '../..'))
+BLOCKEDFILE= BASEDIR + '/settings/ip_blocks.map'
+LOGFILE= BASEDIR + '/log/restricted_ip.log'
+
+# space-separated list of IPs that are never banned
+WHITELIST = ''
+# space-separated list of IPs manually blocked
+BLACKLIST = ''
+# user-agents that should be blocked from bulk mode
+# (matched with startswith)
+UA_BLOCKLIST = ()
+
+# time before an automatically blocked IP is allowed back
+BLOCKCOOLOFF_DELTA=timedelta(hours=1)
+# quiet time before an IP is released from the bulk pool
+BULKCOOLOFF_DELTA=timedelta(minutes=15)
+
+BULKLONG_LIMIT=8000
+BULKSHORT_LIMIT=2000
+BLOCK_UPPER=19000
+BLOCK_LOWER=4000
+BLOCK_LOADFAC=380
+BULK_LOADFAC=160
+BULK_LOWER=1500
+MAX_BULK_IPS=85
+
+#
+# END OF DEFAULT SETTINGS
+#
+
+try:
+    with open(BASEDIR + "/settings/ip_blocks.conf") as f:
+        code = compile(f.read(), BASEDIR + "/settings/ip_blocks.conf", 'exec')
+        exec(code)
+except IOError:
+    pass
+
+BLOCK_LIMIT = BLOCK_LOWER
+
+time_regex = r'(?P<t_day>\d\d)/(?P<t_month>[A-Za-z]+)/(?P<t_year>\d\d\d\d):(?P<t_hour>\d\d):(?P<t_min>\d\d):(?P<t_sec>\d\d) [+-]\d\d\d\d'
+
+format_pat= re.compile(r'(?P<ip>[a-f\d\.:]+) - - \['+ time_regex + r'] "(?P<query>.*?)" (?P<return>\d+) (?P<bytes>\d+) "(?P<referer>.*?)" "(?P<ua>.*?)"')
+time_pat= re.compile(r'[a-f\d:\.]+ - - \[' + time_regex + '\] ')
+
+logtime_pat = "%d/%b/%Y:%H:%M:%S %z"
+
+MONTHS = { 'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6,
+           'Jul' : 7, 'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12 }
+
+class LogEntry:
+    def __init__(self, logline):
+        e = format_pat.match(logline)
+        if e is None:
+            raise ValueError("Invalid log line:", logline)
+        e = e.groupdict()
+        self.ip = e['ip']
+        self.date = datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+        qp = e['query'].split(' ', 2) 
+        if len(qp) < 2:
+            self.request = None
+            self.query = None
+        else:
+            self.query = qp[1]
+            if qp[0] == 'OPTIONS':
+                self.request = None
+            else:
+                if '/search' in qp[1]:
+                    self.request = 'S'
+                elif '/reverse' in qp[1]:
+                    self.request = 'R'
+                elif '/details' in qp[1]:
+                    self.request = 'D'
+                else:
+                    self.request = None
+        self.query = e['query']
+        self.retcode = int(e['return'])
+        self.referer = e['referer'] if e['referer'] != '-' else None
+        self.ua = e['ua'] if e['ua'] != '-' else None
+
+    def get_log_time(logline):
+        e = format_pat.match(logline)
+        if e is None:
+            return None
+        e = e.groupdict()
+        #return datetime.strptime(e['time'], logtime_pat).replace(tzinfo=None)
+        return datetime(int(e['t_year']), MONTHS[e['t_month']], int(e['t_day']),
+                             int(e['t_hour']), int(e['t_min']), int(e['t_sec']))
+
+
+class LogFile:
+    """ An apache log file, unpacked. """
+
+    def __init__(self, filename):
+        self.fd = open(filename)
+        self.len = os.path.getsize(filename)
+
+    def __del__(self):
+        self.fd.close()
+
+    def seek_next(self, abstime):
+        self.fd.seek(abstime)
+        self.fd.readline()
+        l = self.fd.readline()
+        return LogEntry.get_log_time(l) if l is not None else None
+
+    def seek_to_date(self, target):
+        # start position for binary search
+        fromseek = 0
+        fromdate = self.seek_next(0)
+        if fromdate > target:
+            return True
+        # end position for binary search
+        toseek = -100
+        while -toseek < self.len:
+            todate = self.seek_next(self.len + toseek)
+            if todate is not None:
+                break
+            toseek -= 100
+        if todate is None or todate < target:
+            return False
+        toseek = self.len + toseek
+
+
+        while True:
+            bps = (toseek - fromseek) / (todate - fromdate).total_seconds()
+            newseek = fromseek + int((target - fromdate).total_seconds() * bps)
+            newdate = self.seek_next(newseek)
+            if newdate is None:
+                return False;
+            error = abs((target - newdate).total_seconds())
+            if error < 1:
+                return True
+            if newdate > target:
+                toseek = newseek
+                todate = newdate
+                oldfromseek = fromseek
+                fromseek = toseek - error * bps
+                while True:
+                    if fromseek <= oldfromseek:
+                        fromseek = oldfromseek
+                        fromdate = self.seek_next(fromseek)
+                        break
+                    fromdate = self.seek_next(fromseek)
+                    if fromdate < target:
+                        break;
+                    bps *=2
+                    fromseek -= error * bps
+            else:
+                fromseek = newseek
+                fromdate = newdate
+                oldtoseek = toseek
+                toseek = fromseek + error * bps
+                while True:
+                    if toseek > oldtoseek:
+                        toseek = oldtoseek
+                        todate = self.seek_next(toseek)
+                        break
+                    todate = self.seek_next(toseek)
+                    if todate > target:
+                        break
+                    bps *=2
+                    toseek += error * bps
+            if toseek - fromseek < 500:
+                return True
+
+
+    def loglines(self):
+        for l in self.fd:
+            try:
+                yield LogEntry(l)
+            except ValueError:
+                pass # ignore invalid lines
+
+class BlockList:
+
+    def __init__(self):
+        self.whitelist = set(WHITELIST.split()) if WHITELIST else set()
+        self.blacklist = set(BLACKLIST.split()) if BLACKLIST else set()
+        self.prevblocks = set()
+        self.prevbulks = set()
+
+        try:
+            fd = open(BLOCKEDFILE)
+            for line in fd:
+                ip, typ = line.strip().split(' ')
+                if ip not in self.blacklist:
+                    if typ == 'block':
+                        self.prevblocks.add(ip)
+                    elif typ == 'bulk':
+                        self.prevbulks.add(ip)
+            fd.close()
+        except IOError:
+            pass #ignore non-existing file
+
+
+class IPstats:
+
+    def __init__(self):
+        self.short_total = 0
+        self.short_api = 0
+        self.long_total = 0
+        self.long_api = 0
+        self.bad_ua = False
+
+    def add_long(self, logentry):
+        self.long_total += 1
+        if logentry.request is not None:
+            self.long_api += 1
+        if not self.bad_ua:
+            if logentry.ua is None:
+                self.bad_ua = True
+
+    def add_short(self, logentry):
+        self.short_total += 1
+        if logentry.request is not None:
+            self.short_api += 1
+        self.add_long(logentry)
+
+    def new_state(self, was_blocked, was_bulked):
+        if was_blocked:
+            # deblock only if the IP has been really quiet
+            # (properly catches the ones that simply ignore the HTTP error)
+            return None if self.long_total < 20 else 'block'
+        if self.long_api > BLOCK_UPPER or self.short_api > BLOCK_UPPER / 3:
+                # client totally overdoing it
+                return 'block'
+        if was_bulked:
+            if self.short_total < 20:
+                # client has stopped, debulk
+                return None
+            if self.long_api > BLOCK_LIMIT or self.short_api > BLOCK_LIMIT / 3:
+                # client is still hammering us, block
+                return 'emblock'
+            return 'bulk'
+
+        if self.long_api > BULKLONG_LIMIT or self.short_api > BULKSHORT_LIMIT:
+            #if self.bad_ua:
+            #    return 'uablock' # bad useragent
+            return 'bulk'
+
+        return None
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print("Usage: %s logfile startdate" % sys.argv[0])
+        sys.exit(-1)
+
+    if len(sys.argv) == 2:
+        dt = datetime.now() - BLOCKCOOLOFF_DELTA
+    else:
+        dt = datetime.strptime(sys.argv[2], "%Y-%m-%d %H:%M:%S")
+
+    if os.path.getsize(sys.argv[1]) < 2*1030*1024:
+        sys.exit(0) # not enough data
+
+    lf = LogFile(sys.argv[1])
+    if not lf.seek_to_date(dt):
+        sys.exit(0)
+
+    bl = BlockList()
+
+    shortstart = dt + BLOCKCOOLOFF_DELTA - BULKCOOLOFF_DELTA
+    notlogged = bl.whitelist | bl.blacklist
+
+    stats = defaultdict(IPstats)
+
+    for l in lf.loglines():
+        if l.ip not in notlogged:
+            stats[l.ip].add_long(l)
+        if l.date > shortstart:
+            break
+
+    total200 = 0
+    for l in lf.loglines():
+        if l.ip not in notlogged:
+            stats[l.ip].add_short(l)
+        if l.request is not None and l.retcode == 200:
+            total200 += 1
+
+    # adapt limits according to CPU and DB load
+    fd = open("/proc/loadavg")
+    cpuload = int(float(fd.readline().split()[2]))
+    fd.close()
+    # check the number of excess connections to apache
+    dbcons = int(subprocess.check_output("netstat -s | grep 'connections established' | sed 's:^\s*::;s: .*::'", shell=True))
+    fpms = int(subprocess.check_output('ps -Af | grep php-fpm | wc -l', shell=True))
+    dbload = max(0, dbcons - fpms)
+
+    numbulks = len(bl.prevbulks)
+    BLOCK_LIMIT = max(BLOCK_LIMIT, BLOCK_UPPER - BLOCK_LOADFAC * dbload)
+    BULKLONG_LIMIT = max(BULK_LOWER, BULKLONG_LIMIT - BULK_LOADFAC * cpuload)
+    if numbulks > MAX_BULK_IPS:
+        BLOCK_LIMIT = max(3600, BLOCK_LOWER - (numbulks - MAX_BULK_IPS)*10)
+    # if the bulk pool is still empty, clients will be faster, avoid having
+    # them blocked in this case
+    if numbulks < 10:
+        BLOCK_UPPER *= 2
+        BLOCK_LIMIT = BLOCK_UPPER
+
+
+    # collecting statistics
+    unblocked = []
+    debulked = []
+    bulked = []
+    blocked = []
+    uablocked = []
+    emblocked = []
+    # write out new state file
+    fd = open(BLOCKEDFILE, 'w')
+    for k,v in stats.items():
+        wasblocked = k in bl.prevblocks
+        wasbulked = k in bl.prevbulks
+        state = v.new_state(wasblocked, wasbulked)
+        if state is not None:
+            if state == 'uablock':
+                uablocked.append(k)
+                state = 'block'
+            elif state == 'emblock':
+                emblocked.append(k)
+                state = 'block'
+            elif state == 'block':
+                if not wasblocked:
+                    blocked.append(k)
+            elif state == 'bulk':
+                if not wasbulked:
+                    bulked.append(k)
+            fd.write("%s %s\n" % (k, state))
+        else:
+            if wasblocked:
+                unblocked.append(k)
+            elif wasbulked:
+                debulked.append(k)
+    for i in bl.blacklist:
+        fd.write("%s ban\n" % i)
+    fd.close()
+
+    # TODO write logs (need to collect some statistics)
+    logstr = datetime.now().strftime('%Y-%m-%d %H:%M') + ' %s %s\n'
+    fd = open(LOGFILE, 'a')
+    if unblocked:
+        fd.write(logstr % ('unblocked:', ', '.join(unblocked)))
+    if debulked:
+        fd.write(logstr % (' debulked:', ', '.join(debulked)))
+    if bulked:
+        fd.write(logstr % ('new bulks:', ', '.join(bulked)))
+    if emblocked:
+        fd.write(logstr % ('dir.block:', ', '.join(emblocked)))
+    if uablocked:
+        fd.write(logstr % (' ua block:', ', '.join(uablocked)))
+    if blocked:
+        fd.write(logstr % ('new block:', ', '.join(blocked)))
+    fd.close()
diff --git a/utils/cron_logrotate.sh b/utils/cron_logrotate.sh
new file mode 100755 (executable)
index 0000000..b9291d9
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/bash -e
+#
+# Rotate query logs.
+
+dbname=nominatim
+
+basedir=`dirname $0`
+logfile=`date "+$basedir/../log/query-%F.log.gz"`
+
+# dump the old logfile
+pg_dump -a -F p -t backup_query_log $dbname | gzip -9 > $logfile
+
+# remove the old logs
+psql -q -d $dbname -c 'DROP TABLE backup_query_log'
+
+# rotate
+psql -q -1 -d $dbname -c 'ALTER TABLE new_query_log RENAME TO backup_query_log;CREATE TABLE new_query_log as (select * from backup_query_log limit 0);GRANT SELECT, INSERT, UPDATE ON new_query_log TO "www-data"'
+psql -q -d $dbname -c 'ALTER INDEX idx_new_query_log_starttime RENAME TO idx_backup_query_log_starttime'
+psql -q -d $dbname -c 'CREATE INDEX idx_new_query_log_starttime ON new_query_log USING BTREE (starttime)'
+
diff --git a/utils/cron_vacuum.sh b/utils/cron_vacuum.sh
new file mode 100755 (executable)
index 0000000..4c16fc6
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/bash
+#
+# Vacuum all tables with indices on integer arrays.
+# Aggressive vacuuming seems to help against index bloat.
+#
+
+psql -q -d nominatim -c 'VACUUM ANALYSE search_name'
+psql -q -d nominatim -c 'VACUUM ANALYSE search_name_country'
+#psql -q -d nominatim -c 'VACUUM ANALYSE planet_osm_ways'
+
+for i in `seq 0 246`; do
+  psql -q -d nominatim -c "VACUUM ANALYSE search_name_${i}"
+done
+
index eff7b71b447fef7fd36bab87e3db8569fef84231..01bf11349ce750d60d130c5ca084b9689be62cee 100755 (executable)
                if (CONST_Tablespace_Place_Index)
                        $osm2pgsql .= ' --tablespace-main-index '.CONST_Tablespace_Place_Index;
                $osm2pgsql .= ' -lsc -O gazetteer --hstore --number-processes 1';
-               $osm2pgsql .= ' -C '.$iCacheMemory;
+               $osm2pgsql .= ' -C 25000';
                $osm2pgsql .= ' -P '.$aDSNInfo['port'];
                $osm2pgsql .= ' -d '.$aDSNInfo['database'].' '.$aCMDResult['osm-file'];
                passthruCheckReturn($osm2pgsql);
                $sSQL .= "select 'P',nextval('seq_postcodes'),'place','postcode',postcode,calculated_country_code,";
                $sSQL .= "ST_SetSRID(ST_Point(x,y),4326) as geometry from (select calculated_country_code,postcode,";
                $sSQL .= "avg(st_x(st_centroid(geometry))) as x,avg(st_y(st_centroid(geometry))) as y ";
-               $sSQL .= "from placex where postcode is not null group by calculated_country_code,postcode) as x";
+               $sSQL .= "from placex where postcode is not null and calculated_country_code not in ('ie') group by calculated_country_code,postcode) as x";
                if (!pg_query($oDB->connection, $sSQL)) fail(pg_last_error($oDB->connection));
 
                $sSQL = "insert into placex (osm_type,osm_id,class,type,postcode,calculated_country_code,geometry) ";
index cc2754633e262f34155afca5819c22074c73d45c..c6de7af6f9d2f4ba94e770af211ba6d8c61fbdfb 100755 (executable)
@@ -47,7 +47,6 @@
                showUsage($aCMDOptions, true, 'Select either import of hourly or daily');
        }
 
-       if (!isset($aResult['index-instances'])) $aResult['index-instances'] = 1;
        if (!isset($aResult['index-rank'])) $aResult['index-rank'] = 0;
 
 /*
 
        if ($aResult['index'])
        {
+               if (!isset($aResult['index-instances'])) $aResult['index-instances'] = 1;
                passthru(CONST_BasePath.'/nominatim/nominatim -i -d '.$aDSNInfo['database'].' -P '.$aDSNInfo['port'].' -t '.$aResult['index-instances'].' -r '.$aResult['index-rank']);
        }
 
                $sCMDDownload = $sOsmosisCMD.' --read-replication-interval workingDirectory='.$sOsmosisConfigDirectory.' --simplify-change --write-xml-change '.$sImportFile;
                $sCMDCheckReplicationLag = $sOsmosisCMD.' -q --read-replication-lag workingDirectory='.$sOsmosisConfigDirectory;
                $sCMDImport = $sOsm2pgsqlCmd.' '.$sImportFile;
-               $sCMDIndex = $sBasePath.'/nominatim/nominatim -i -d '.$aDSNInfo['database'].' -P '.$aDSNInfo['port'].' -t '.$aResult['index-instances'];
+               $sCMDIndex = $sBasePath.'/nominatim/nominatim -i -d '.$aDSNInfo['database'].' -P '.$aDSNInfo['port'];
                if (!$aResult['no-npi']) {
                        $sCMDIndex .= '-F ';
                }
                        $sBatchEnd = getosmosistimestamp($sOsmosisConfigDirectory);
 
                        // Index file
-                       $sThisIndexCmd = $sCMDIndex;
+                       if (!isset($aResult['index-instances']))
+                       {
+                               if (getLoadAverage() < 24)
+                                       $iIndexInstances = 2;
+                               else
+                                       $iIndexInstances = 1;
+                       } else
+                               $iIndexInstances = $aResult['index-instances'];
+
+                       $sThisIndexCmd = $sCMDIndex.' -t '.$iIndexInstances;
                        $fCMDStartTime = time();
 
                        if (!$aResult['no-npi'])
diff --git a/website/403.html b/website/403.html
new file mode 100644 (file)
index 0000000..e8a2386
--- /dev/null
@@ -0,0 +1,23 @@
+<html>
+<head>
+<title>Access blocked</title>
+</head>
+<body>
+<h1>Access blocked</h1>
+
+<p>You have been blocked because you have violated the
+<a href="http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy">usage policy</a>
+of OSM's Nominatim geocoding service. Please be aware that OSM's resources are
+limited and shared between many users. The usage policy is there to ensure that
+the service remains usable for everybody.</p>
+
+<p>Please review the terms and make sure that your
+software adheres to the terms. You should in particular verify that you have set a
+<b>valid</b> referrer or a user agent that identifies your application, and
+that you are not overusing the service with massive bulk requests.</p>
+
+<p>If you feel that this block is unjustified or remains after you have adopted
+your usage, you may contact the Nominatim system administrator at
+nominatim@openstreetmap.org to have this block lifted.</p>
+</body>
+</html>
diff --git a/website/509.html b/website/509.html
new file mode 100644 (file)
index 0000000..1e67a5a
--- /dev/null
@@ -0,0 +1,12 @@
+<html>
+<head>
+<title>Bandwidth limit exceeded</title>
+</head>
+<body>
+<h1>Bandwidth limit exceeded</h1>
+
+<p>You have been temporarily blocked because you have been overusing OSM's geocoding service or because you have not provided sufficient identification of your application. This block will be automatically lifted after a while. Please take the time and adapt your scripts to reduce the number of requests and make sure that you send a valid UserAgent or Referer.</p>
+
+<p>For more information, consult the <a href="http://wiki.openstreetmap.org/wiki/Nominatim_usage_policy">usage policy</a> for the OSM Nominatim server.</p>
+</body>
+</html>
diff --git a/website/crossdomain.xml b/website/crossdomain.xml
new file mode 100644 (file)
index 0000000..963a682
--- /dev/null
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+           <!DOCTYPE cross-domain-policy SYSTEM "http://www.macromedia.com/xml/dtds/cross-domain-policy.dtd">
+           <cross-domain-policy>
+           <allow-access-from domain="*" />
+           </cross-domain-policy> 
diff --git a/website/favicon.ico b/website/favicon.ico
new file mode 100644 (file)
index 0000000..0157ea0
Binary files /dev/null and b/website/favicon.ico differ
diff --git a/website/last_update.php b/website/last_update.php
new file mode 100644 (file)
index 0000000..a843586
--- /dev/null
@@ -0,0 +1,26 @@
+<?php
+       @define('CONST_ConnectionBucket_PageType', 'Status');
+
+       require_once(dirname(dirname(__FILE__)).'/lib/init-website.php');
+
+       function statusError($sMsg)
+       {
+               header("HTTP/1.0 500 Internal Server Error");
+               echo "ERROR: ".$sMsg;
+               exit;
+       }
+
+       $oDB =& DB::connect(CONST_Database_DSN, false);
+       if (!$oDB || PEAR::isError($oDB))
+       {
+               statusError("No database");
+       }
+
+       $sLastUpdate = $oDB->getOne("select * from import_status");
+       if (PEAR::isError($sLastUpdate))
+       {
+               statusError("Update status unknown.");
+       }
+       echo $sLastUpdate;
+       exit;
+
diff --git a/website/nominatim.xml b/website/nominatim.xml
new file mode 100644 (file)
index 0000000..28684b1
--- /dev/null
@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
+                       xmlns:moz="http://www.mozilla.org/2006/browser/search/">
+       <ShortName>Nominatim</ShortName>
+       <LongName>Nominatim OSM Search</LongName>
+       <Description>Search for a place in OpenStreetMap Nominatim</Description>
+       <InputEncoding>UTF-8</InputEncoding>
+       <OutputEncoding>UTF-8</OutputEncoding>
+       <Url type="text/html" method="get" template="http://nominatim.openstreetmap.org/search/?q={searchTerms}" />
+       <Query role="example" searchTerms="Reigate" />
+       <Developer>Brian Quinion</Developer>
+       <AdultContent>false</AdultContent>
+       <Attribution>Data &amp;copy; OpenStreetMap contributors, Some Rights Reserved. ODbL, http://www.osm.org/copyright.</Attribution>
+</OpenSearchDescription>
+
index d1c554a454352a79914f2f75a4096c182f8fa490..1f6e0aad1114f9acbd14e5baf210ef8cc4e792a9 100755 (executable)
@@ -6,19 +6,6 @@
        require_once(CONST_BasePath.'/lib/PlaceLookup.php');
        require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
 
-       if (strpos(CONST_BulkUserIPs, ','.$_SERVER["REMOTE_ADDR"].',') !== false)
-       {
-               $fLoadAvg = getLoadAverage();
-               if ($fLoadAvg > 2) sleep(60);
-               if ($fLoadAvg > 4) sleep(120);
-               if ($fLoadAvg > 6)
-               {
-                       echo "Bulk User: Temporary block due to high server load\n";
-                       exit;
-               }
-       }
-
-
        $bAsPoints = false;
        $bAsGeoJSON = (boolean)isset($_GET['polygon_geojson']) && $_GET['polygon_geojson'];
        $bAsKML = (boolean)isset($_GET['polygon_kml']) && $_GET['polygon_kml'];
                $aPlace = null;
        }
 
+       logEnd($oDB, $hLog, sizeof($aPlace)?1:0);
 
        if (CONST_Debug)
        {
diff --git a/website/robots.txt b/website/robots.txt
new file mode 100644 (file)
index 0000000..9624d97
--- /dev/null
@@ -0,0 +1,14 @@
+User-agent: ia_archiver
+Allow: /
+
+User-agent: *
+Disallow: /search.php
+Disallow: /search
+Disallow: /details.php
+Disallow: /details
+Disallow: /reverse.php
+Disallow: /reverse
+Disallow: /hierarchy
+Disallow: /hierarchy.php
+Disallow: /lookup
+Disallow: /lookup.php
diff --git a/website/taginfo.json b/website/taginfo.json
new file mode 100644 (file)
index 0000000..3618026
--- /dev/null
@@ -0,0 +1,86 @@
+{
+    "data_format": 1,
+    "data_url": "http://nominatim.openstreetmap.org/taginfo.json",
+    "project": {
+        "name": "Nominatim",
+        "description": "OSM search engine.",
+        "project_url": "http://nominatim.openstreetmap.org",
+        "doc_url": "http://wiki.osm.org/wiki/Nominatim",
+        "contact_name": "Sarah Hoffmann",
+        "contact_email": "lonvia@denofr.de"
+    },
+    "tags": [
+      { "key" : "ref", "description": "Searchable name of the place."},
+      { "key" : "int_ref", "description": "Searchable name of the place."},
+      { "key" : "nat_ref", "description": "Searchable name of the place."},
+      { "key" : "reg_ref", "description": "Searchable name of the place."},
+      { "key" : "loc_ref", "description": "Searchable name of the place."},
+      { "key" : "old_ref", "description": "Searchable name of the place."},
+      { "key" : "iata", "description": "Searchable name of the place."},
+      { "key" : "icao", "description": "Searchable name of the place."},
+      { "key" : "pcode:1", "description": "Searchable name of the place."},
+      { "key" : "pcode:2", "description": "Searchable name of the place."},
+      { "key" : "pcode:3", "description": "Searchable name of the place."},
+      { "key" : "name", "description": "Searchable name of the place."},
+      { "key" : "int_name", "description": "Searchable name of the place."},
+      { "key" : "nat_name", "description": "Searchable name of the place."},
+      { "key" : "reg_name", "description": "Searchable name of the place."},
+      { "key" : "loc_name", "description": "Searchable name of the place."},
+      { "key" : "old_name", "description": "Searchable name of the place."},
+      { "key" : "alt_name", "description": "Searchable name of the place."},
+      { "key" : "official_name", "description": "Searchable name of the place."},
+      { "key" : "place_name", "description": "Searchable name of the place."},
+      { "key" : "short_name", "description": "Searchable name of the place."},
+      { "key" : "addr:housename", "description": "Searchable name of the place."},
+      { "key" : "operator", "description": "Searchable name for amenities and shops." },
+      { "key" : "brand", "description": "Searchable name of POI places."},
+      { "key" : "bridge:name", "description" : "Searchable name for bridges."},
+      { "key" : "tunnel:name", "description" : "Searchable name for tunnels."},
+      { "key" : "emergency", "description": "POI in the search database." },
+      { "key" : "tourism", "description": "POI in the search database." },
+      { "key" : "historic", "description": "POI in the search database." },
+      { "key" : "military", "description": "POI in the search database." },
+      { "key" : "natural", "description": "POI in the search database." },
+      { "key" : "man_made", "description": "POI in the search database." },
+      { "key" : "mountain_pass", "description": "POI in the search database." },
+      { "key" : "highway", "description": "POI or street in the search database (not added are: 'no', 'turning_circle', 'traffic_signals', 'mini_roundabout', 'noexit', 'crossing')." },
+      { "key" : "aerialway", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "aeroway", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "amenity", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "boundary", "description": "Area in the search database (used to compute addresses of other places)." },
+      { "key" : "bridge", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "craft", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "leisure", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "office", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "railway", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "landuse", "description": "Geographic feature in the search database (unless value is 'no')." },
+      { "key" : "shop", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "tunnel", "description": "POI in the search database (unless value is 'no')." },
+      { "key" : "waterway", "description": "Geographic feature in the search database (unless value is 'riverbank')."},
+      { "key" : "place", "description": "Settlement on the search database (used to compute addresses of other places)." },
+      { "key" : "postal_code", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "addr:postcode", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "tiger:zip_left", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "tiger:zip_right", "description": "Postcode in search database (used to compute postcodes of places around)." },
+      { "key" : "addr:street", "description": "Used to determine the street of a house or POI. Note that a street with the same name must exist for the tag to be effective."},
+      { "key" : "addr:place", "description": "Used to determine the settlement of a house or POI with a street-less address. Note that a place with the same name must exist for the tag to be effective."},
+      { "key" : "country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "ISO3166-1", "description": "Used to determine the country a place is in."},
+      { "key" : "is_in:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:country_code", "description": "Used to determine the country a place is in."},
+      { "key" : "addr:housenumber", "description": "House number of the place (no ranges)."},
+      { "key" : "addr:conscriptionnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:streetnumber", "description": "House number of the place (Eastern European system)."},
+      { "key" : "addr:interpolation", "description": "Way along which house numbers are interpolated."} ,
+      { "key" : "tiger:county", "description": "Used to determine the address in the US (needs a place with the same name and a county suffix)."},
+      { "key" : "is_in", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+      { "key" : "addr:suburb", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+      { "key" : "addr:city", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+      { "key" : "addr:state_code", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+      { "key" : "addr:state", "description": "Used to determine the address of a place. Note that a place with the same name must exist for this to work."},
+      { "key" : "admin_level", "description": "Determines the hierarchy for administrative boundaries."},
+      { "key" : "wikipedia", "description": "Linking to the right wikipedia article helps to guess the importance of a place, which determines how far up in the search results it appears."}
+   ]
+}