git.openstreetmap.org Git - nominatim.git/blobdiff - lib/Geocode.php
Merge remote-tracking branch 'upstream/master'
[nominatim.git] / lib / Geocode.php
index 27c33ad68fae7896bb2c852fadd60fc3dad38636..306255b59079968104e5665d5f4b255f2527a609 100644 (file)
@@ -3,6 +3,7 @@
 namespace Nominatim;
 
 require_once(CONST_BasePath.'/lib/PlaceLookup.php');
+require_once(CONST_BasePath.'/lib/Phrase.php');
 require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
 require_once(CONST_BasePath.'/lib/SearchDescription.php');
 require_once(CONST_BasePath.'/lib/SearchContext.php');
@@ -26,7 +27,7 @@ class Geocode
 
     protected $aExcludePlaceIDs = array();
     protected $bDeDupe = true;
-    protected $bReverseInPlan = false;
+    protected $bReverseInPlan = true;
 
     protected $iLimit = 20;
     protected $iFinalLimit = 10;
@@ -398,16 +399,20 @@ class Geocode
         $sSQL .= "    avg(ST_X(centroid)) AS lon, ";
         $sSQL .= "    avg(ST_Y(centroid)) AS lat, ";
         $sSQL .= "    COALESCE(importance,0.75-(rank_search::float/40)) $sImportanceSQL AS importance, ";
-        $sSQL .= "    ( ";
-        $sSQL .= "       SELECT max(p.importance*(p.rank_address+2))";
-        $sSQL .= "       FROM ";
-        $sSQL .= "         place_addressline s, ";
-        $sSQL .= "         placex p";
-        $sSQL .= "       WHERE s.place_id = min(CASE WHEN placex.rank_search < 28 THEN placex.place_id ELSE placex.parent_place_id END)";
-        $sSQL .= "         AND p.place_id = s.address_place_id ";
-        $sSQL .= "         AND s.isaddress ";
-        $sSQL .= "         AND p.importance is not null ";
-        $sSQL .= "    ) AS addressimportance, ";
+        if ($oCtx->hasNearPoint()) {
+            $sSQL .= $oCtx->distanceSQL('ST_Collect(centroid)')." AS addressimportance,";
+        } else {
+            $sSQL .= "    ( ";
+            $sSQL .= "       SELECT max(p.importance*(p.rank_address+2))";
+            $sSQL .= "       FROM ";
+            $sSQL .= "         place_addressline s, ";
+            $sSQL .= "         placex p";
+            $sSQL .= "       WHERE s.place_id = min(CASE WHEN placex.rank_search < 28 THEN placex.place_id ELSE placex.parent_place_id END)";
+            $sSQL .= "         AND p.place_id = s.address_place_id ";
+            $sSQL .= "         AND s.isaddress ";
+            $sSQL .= "         AND p.importance is not null ";
+            $sSQL .= "    ) AS addressimportance, ";
+        }
         $sSQL .= "    (extratags->'place') AS extra_place ";
         $sSQL .= " FROM placex";
         $sSQL .= " WHERE place_id in ($sPlaceIDs) ";
@@ -457,16 +462,20 @@ class Geocode
         if ($this->bIncludeNameDetails) $sSQL .= "null AS names,";
         $sSQL .= "  ST_x(st_centroid(geometry)) AS lon, ST_y(st_centroid(geometry)) AS lat,";
         $sSQL .= "  (0.75-(rank_search::float/40)) $sImportanceSQLGeom AS importance, ";
-        $sSQL .= "  (";
-        $sSQL .= "     SELECT max(p.importance*(p.rank_address+2))";
-        $sSQL .= "     FROM ";
-        $sSQL .= "       place_addressline s, ";
-        $sSQL .= "       placex p";
-        $sSQL .= "     WHERE s.place_id = lp.parent_place_id";
-        $sSQL .= "       AND p.place_id = s.address_place_id ";
-        $sSQL .= "       AND s.isaddress";
-        $sSQL .= "       AND p.importance is not null";
-        $sSQL .= "  ) AS addressimportance, ";
+        if ($oCtx->hasNearPoint()) {
+            $sSQL .= $oCtx->distanceSQL('geometry')." AS addressimportance,";
+        } else {
+            $sSQL .= "  (";
+            $sSQL .= "     SELECT max(p.importance*(p.rank_address+2))";
+            $sSQL .= "     FROM ";
+            $sSQL .= "       place_addressline s, ";
+            $sSQL .= "       placex p";
+            $sSQL .= "     WHERE s.place_id = lp.parent_place_id";
+            $sSQL .= "       AND p.place_id = s.address_place_id ";
+            $sSQL .= "       AND s.isaddress";
+            $sSQL .= "       AND p.importance is not null";
+            $sSQL .= "  ) AS addressimportance, ";
+        }
         $sSQL .= "  null AS extra_place ";
         $sSQL .= "FROM location_postcode lp";
         $sSQL .= " WHERE place_id in ($sPlaceIDs) ";
@@ -505,16 +514,20 @@ class Geocode
                 $sSQL .= "     avg(st_x(centroid)) AS lon, ";
                 $sSQL .= "     avg(st_y(centroid)) AS lat,";
                 $sSQL .= "     -1.15".$sImportanceSQL." AS importance, ";
-                $sSQL .= "     (";
-                $sSQL .= "        SELECT max(p.importance*(p.rank_address+2))";
-                $sSQL .= "        FROM ";
-                $sSQL .= "          place_addressline s, ";
-                $sSQL .= "          placex p";
-                $sSQL .= "        WHERE s.place_id = min(blub.parent_place_id)";
-                $sSQL .= "          AND p.place_id = s.address_place_id ";
-                $sSQL .= "          AND s.isaddress";
-                $sSQL .= "          AND p.importance is not null";
-                $sSQL .= "     ) AS addressimportance, ";
+                if ($oCtx->hasNearPoint()) {
+                    $sSQL .= $oCtx->distanceSQL('ST_Collect(centroid)')." AS addressimportance,";
+                } else {
+                    $sSQL .= "     (";
+                    $sSQL .= "        SELECT max(p.importance*(p.rank_address+2))";
+                    $sSQL .= "        FROM ";
+                    $sSQL .= "          place_addressline s, ";
+                    $sSQL .= "          placex p";
+                    $sSQL .= "        WHERE s.place_id = min(blub.parent_place_id)";
+                    $sSQL .= "          AND p.place_id = s.address_place_id ";
+                    $sSQL .= "          AND s.isaddress";
+                    $sSQL .= "          AND p.importance is not null";
+                    $sSQL .= "     ) AS addressimportance, ";
+                }
                 $sSQL .= "     null AS extra_place ";
                 $sSQL .= " FROM (";
                 $sSQL .= "     SELECT place_id, ";    // interpolate the Tiger housenumbers here
@@ -555,17 +568,21 @@ class Geocode
             $sSQL .= "  AVG(st_x(centroid)) AS lon, ";
             $sSQL .= "  AVG(st_y(centroid)) AS lat, ";
             $sSQL .= "  -0.1".$sImportanceSQL." AS importance, ";  // slightly smaller than the importance for normal houses with rank 30, which is 0
-            $sSQL .= "  (";
-            $sSQL .= "     SELECT ";
-            $sSQL .= "       MAX(p.importance*(p.rank_address+2)) ";
-            $sSQL .= "     FROM";
-            $sSQL .= "       place_addressline s, ";
-            $sSQL .= "       placex p";
-            $sSQL .= "     WHERE s.place_id = min(blub.parent_place_id) ";
-            $sSQL .= "       AND p.place_id = s.address_place_id ";
-            $sSQL .= "       AND s.isaddress ";
-            $sSQL .= "       AND p.importance is not null";
-            $sSQL .= "  ) AS addressimportance,";
+            if ($oCtx->hasNearPoint()) {
+                $sSQL .= $oCtx->distanceSQL('ST_Collect(centroid)')." AS addressimportance,";
+            } else {
+                $sSQL .= "  (";
+                $sSQL .= "     SELECT ";
+                $sSQL .= "       MAX(p.importance*(p.rank_address+2)) ";
+                $sSQL .= "     FROM";
+                $sSQL .= "       place_addressline s, ";
+                $sSQL .= "       placex p";
+                $sSQL .= "     WHERE s.place_id = min(blub.parent_place_id) ";
+                $sSQL .= "       AND p.place_id = s.address_place_id ";
+                $sSQL .= "       AND s.isaddress ";
+                $sSQL .= "       AND p.importance is not null";
+                $sSQL .= "  ) AS addressimportance,";
+            }
             $sSQL .= "  null AS extra_place ";
             $sSQL .= "  FROM (";
             $sSQL .= "     SELECT ";
@@ -614,16 +631,20 @@ class Geocode
                 $sSQL .= "     avg(ST_X(centroid)) AS lon, ";
                 $sSQL .= "     avg(ST_Y(centroid)) AS lat, ";
                 $sSQL .= "     -1.10".$sImportanceSQL." AS importance, ";
-                $sSQL .= "     ( ";
-                $sSQL .= "       SELECT max(p.importance*(p.rank_address+2))";
-                $sSQL .= "       FROM ";
-                $sSQL .= "          place_addressline s, ";
-                $sSQL .= "          placex p";
-                $sSQL .= "       WHERE s.place_id = min(location_property_aux.parent_place_id)";
-                $sSQL .= "         AND p.place_id = s.address_place_id ";
-                $sSQL .= "         AND s.isaddress";
-                $sSQL .= "         AND p.importance is not null";
-                $sSQL .= "     ) AS addressimportance, ";
+                if ($oCtx->hasNearPoint()) {
+                    $sSQL .= $oCtx->distanceSQL('ST_Collect(centroid)')." AS addressimportance,";
+                } else {
+                    $sSQL .= "     ( ";
+                    $sSQL .= "       SELECT max(p.importance*(p.rank_address+2))";
+                    $sSQL .= "       FROM ";
+                    $sSQL .= "          place_addressline s, ";
+                    $sSQL .= "          placex p";
+                    $sSQL .= "       WHERE s.place_id = min(location_property_aux.parent_place_id)";
+                    $sSQL .= "         AND p.place_id = s.address_place_id ";
+                    $sSQL .= "         AND s.isaddress";
+                    $sSQL .= "         AND p.importance is not null";
+                    $sSQL .= "     ) AS addressimportance, ";
+                }
                 $sSQL .= "     null AS extra_place ";
                 $sSQL .= "  FROM location_property_aux ";
                 $sSQL .= "  WHERE place_id in ($sPlaceIDs) ";
@@ -648,7 +669,7 @@ class Geocode
         return $aSearchResults;
     }
 
-    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
+    public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured)
     {
         /*
              Calculate all searches using aValidTokens i.e.
@@ -663,15 +684,11 @@ class Geocode
          */
         $iGlobalRank = 0;
 
-        foreach ($aPhrases as $iPhrase => $aPhrase) {
+        foreach ($aPhrases as $iPhrase => $oPhrase) {
             $aNewPhraseSearches = array();
-            if ($bStructuredPhrases) {
-                $sPhraseType = $aPhraseTypes[$iPhrase];
-            } else {
-                $sPhraseType = '';
-            }
+            $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
 
-            foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) {
+            foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
                 // Too many permutations - too expensive
                 if ($iWordSet > 120) break;
 
@@ -690,17 +707,8 @@ class Geocode
                         // If the token is valid
                         if (isset($aValidTokens[' '.$sToken])) {
                             foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
-                                // Recheck if the original word shows up in the query.
-                                $bWordInQuery = false;
-                                if (isset($aSearchTerm['word']) && $aSearchTerm['word']) {
-                                    $bWordInQuery = strpos(
-                                        $sNormQuery,
-                                        $this->normTerm($aSearchTerm['word'])
-                                    ) !== false;
-                                }
                                 $aNewSearches = $oCurrentSearch->extendWithFullTerm(
                                     $aSearchTerm,
-                                    $bWordInQuery,
                                     isset($aValidTokens[$sToken])
                                       && strpos($sToken, ' ') === false,
                                     $sPhraseType,
@@ -726,9 +734,8 @@ class Geocode
                             foreach ($aValidTokens[$sToken] as $aSearchTerm) {
                                 $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
                                     $aSearchTerm,
-                                    $bStructuredPhrases,
+                                    $bIsStructured,
                                     $iPhrase,
-                                    $aWordFrequencyScores,
                                     isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
                                 );
 
@@ -786,7 +793,7 @@ class Geocode
         // Revisit searches, drop bad searches and give penalty to unlikely combinations.
         $aGroupedSearches = array();
         foreach ($aSearches as $oSearch) {
-            if (!$oSearch->isValidSearch($this->aCountryCodes)) {
+            if (!$oSearch->isValidSearch()) {
                 continue;
             }
 
@@ -846,7 +853,7 @@ class Geocode
                 $this->aRouteWidth,
                 $this->bBoundedSearch
             );
-        } else if ($this->aViewBox) {
+        } elseif ($this->aViewBox) {
             $oCtx->setViewboxFromBox($this->aViewBox, $this->bBoundedSearch);
         }
         if ($this->aExcludePlaceIDs) {
@@ -935,10 +942,10 @@ class Geocode
             // Split query into phrases
             // Commas are used to reduce the search space by indicating where phrases split
             if ($this->aStructuredQuery) {
-                $aPhrases = $this->aStructuredQuery;
+                $aInPhrases = $this->aStructuredQuery;
                 $bStructuredPhrases = true;
             } else {
-                $aPhrases = explode(',', $sQuery);
+                $aInPhrases = explode(',', $sQuery);
                 $bStructuredPhrases = false;
             }
 
@@ -947,25 +954,19 @@ class Geocode
             // Get all 'sets' of words
             // Generate a complete list of all
             $aTokens = array();
-            foreach ($aPhrases as $iPhrase => $sPhrase) {
-                $aPhrase = chksql(
-                    $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"),
+            $aPhrases = array();
+            foreach ($aInPhrases as $iPhrase => $sPhrase) {
+                $sPhrase = chksql(
+                    $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
                     "Cannot normalize query string (is it a UTF-8 string?)"
                 );
-                if (trim($aPhrase['string'])) {
-                    $aPhrases[$iPhrase] = $aPhrase;
-                    $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']);
-                    $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0);
-                    $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
-                } else {
-                    unset($aPhrases[$iPhrase]);
+                if (trim($sPhrase)) {
+                    $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
+                    $oPhrase->addTokens($aTokens);
+                    $aPhrases[] = $oPhrase;
                 }
             }
 
-            // Reindex phrases - we make assumptions later on that they are numerically keyed in order
-            $aPhraseTypes = array_keys($aPhrases);
-            $aPhrases = array_values($aPhrases);
-
             if (sizeof($aTokens)) {
                 // Check which tokens we have, get the ID numbers
                 $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
@@ -979,22 +980,29 @@ class Geocode
                     $this->oDB->getAll($sSQL),
                     "Could not get word tokens."
                 );
-                $aPossibleMainWordIDs = array();
                 $aWordFrequencyScores = array();
                 foreach ($aDatabaseWords as $aToken) {
-                    // Very special case - require 2 letter country param to match the country code found
-                    if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country'])
-                        && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code']
+                    // Filter country tokens that do not match restricted countries.
+                    if ($this->aCountryCodes
+                        && $aToken['country_code']
+                        && !in_array($aToken['country_code'], $this->aCountryCodes)
                     ) {
                         continue;
                     }
 
+                    // Special terms need to appear in their normalized form.
+                    if ($aToken['word'] && $aToken['class']) {
+                        $sNormWord = $this->normTerm($aToken['word']);
+                        if (strpos($sNormQuery, $sNormWord) === false) {
+                            continue;
+                        }
+                    }
+
                     if (isset($aValidTokens[$aToken['word_token']])) {
                         $aValidTokens[$aToken['word_token']][] = $aToken;
                     } else {
                         $aValidTokens[$aToken['word_token']] = array($aToken);
                     }
-                    if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1;
                     $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
                 }
                 if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
@@ -1026,19 +1034,18 @@ class Geocode
                 // Any words that have failed completely?
                 // TODO: suggestions
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases);
 
                 if ($this->bReverseInPlan) {
                     // Reverse phrase array and also reverse the order of the wordsets in
                     // the first and final phrase. Don't bother about phrases in the middle
                     // because order in the address doesn't matter.
                     $aPhrases = array_reverse($aPhrases);
-                    $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0);
+                    $aPhrases[0]->invertWordSets();
                     if (sizeof($aPhrases) > 1) {
-                        $aFinalPhrase = end($aPhrases);
-                        $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
+                        $aPhrases[sizeof($aPhrases)-1]->invertWordSets();
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
@@ -1266,36 +1273,42 @@ class Geocode
                 }
             }
 
-            // Adjust importance for the number of exact string matches in the result
-            $aResult['importance'] = max(0.001, $aResult['importance']);
-            $iCountWords = 0;
-            $sAddress = $aResult['langaddress'];
-            foreach ($aRecheckWords as $i => $sWord) {
-                if (stripos($sAddress, $sWord)!==false) {
-                    $iCountWords++;
-                    if (preg_match("/(^|,)\s*".preg_quote($sWord, '/')."\s*(,|$)/", $sAddress)) $iCountWords += 0.1;
+            $aResult['name'] = $aResult['langaddress'];
+
+            if ($oCtx->hasNearPoint()) {
+                $aResult['importance'] = 0.001;
+                $aResult['foundorder'] = $aResult['addressimportance'];
+            } else {
+                // Adjust importance for the number of exact string matches in the result
+                $aResult['importance'] = max(0.001, $aResult['importance']);
+                $iCountWords = 0;
+                $sAddress = $aResult['langaddress'];
+                foreach ($aRecheckWords as $i => $sWord) {
+                    if (stripos($sAddress, $sWord)!==false) {
+                        $iCountWords++;
+                        if (preg_match("/(^|,)\s*".preg_quote($sWord, '/')."\s*(,|$)/", $sAddress)) $iCountWords += 0.1;
+                    }
                 }
-            }
 
-            $aResult['importance'] = $aResult['importance'] + ($iCountWords*0.1); // 0.1 is a completely arbitrary number but something in the range 0.1 to 0.5 would seem right
+                $aResult['importance'] = $aResult['importance'] + ($iCountWords*0.1); // 0.1 is a completely arbitrary number but something in the range 0.1 to 0.5 would seem right
 
-            $aResult['name'] = $aResult['langaddress'];
-            // secondary ordering (for results with same importance (the smaller the better):
-            // - approximate importance of address parts
-            $aResult['foundorder'] = -$aResult['addressimportance']/10;
-            // - number of exact matches from the query
-            if (isset($this->exactMatchCache[$aResult['place_id']])) {
-                $aResult['foundorder'] -= $this->exactMatchCache[$aResult['place_id']];
-            } elseif (isset($this->exactMatchCache[$aResult['parent_place_id']])) {
-                $aResult['foundorder'] -= $this->exactMatchCache[$aResult['parent_place_id']];
-            }
-            // - importance of the class/type
-            if (isset($aClassType[$aResult['class'].':'.$aResult['type']]['importance'])
-                && $aClassType[$aResult['class'].':'.$aResult['type']]['importance']
-            ) {
-                $aResult['foundorder'] += 0.0001 * $aClassType[$aResult['class'].':'.$aResult['type']]['importance'];
-            } else {
-                $aResult['foundorder'] += 0.01;
+                // secondary ordering (for results with same importance (the smaller the better):
+                // - approximate importance of address parts
+                $aResult['foundorder'] = -$aResult['addressimportance']/10;
+                // - number of exact matches from the query
+                if (isset($this->exactMatchCache[$aResult['place_id']])) {
+                    $aResult['foundorder'] -= $this->exactMatchCache[$aResult['place_id']];
+                } elseif (isset($this->exactMatchCache[$aResult['parent_place_id']])) {
+                    $aResult['foundorder'] -= $this->exactMatchCache[$aResult['parent_place_id']];
+                }
+                // - importance of the class/type
+                if (isset($aClassType[$aResult['class'].':'.$aResult['type']]['importance'])
+                    && $aClassType[$aResult['class'].':'.$aResult['type']]['importance']
+                ) {
+                    $aResult['foundorder'] += 0.0001 * $aClassType[$aResult['class'].':'.$aResult['type']]['importance'];
+                } else {
+                    $aResult['foundorder'] += 0.01;
+                }
             }
             if (CONST_Debug) var_dump($aResult);
             $aSearchResults[$iResNum] = $aResult;