]> git.openstreetmap.org Git - nominatim.git/commitdiff
disregard special phrases that do not match fully
authorSarah Hoffmann <lonvia@denofr.de>
Thu, 1 Jun 2017 19:40:23 +0000 (21:40 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Sun, 4 Jun 2017 21:12:09 +0000 (23:12 +0200)
Compare the normalized terms imported with the special
terms script with the normalized version of the query string.
Disregard them if they cannot be found. This avoids a significant
number of mismatches due to transliteration issues.

The match will only be done when a normalized word has been set
making this change backwards compatible with older databases.

lib/Geocode.php
settings/defaults.php
utils/specialphrases.php

index ec8eb3489e6daaa15f11ba3d6aa0029284abb114..17aaf826e2963e2f9405561bafd04604a2fad651 100644 (file)
@@ -653,7 +653,7 @@ class Geocode
         return $aSearchResults;
     }
 
-    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases)
+    public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
     {
         /*
              Calculate all searches using aValidTokens i.e.
@@ -752,13 +752,19 @@ class Geocode
                                          */
                                     }
                                 } elseif ($sPhraseType == '' && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null) {
-                                    if ($aSearch['sClass'] === '') {
-                                        $aSearch['sOperator'] = $aSearchTerm['operator'];
+                                    // require a normalized exact match of the term
+                                    // if we have the normalizer version of the query
+                                    // available
+                                    if ($aSearch['sClass'] === ''
+                                        && ($sNormQuery === null || !($aSearchTerm['word'] && strpos($sNormQuery, $aSearchTerm['word']) === false))) {
                                         $aSearch['sClass'] = $aSearchTerm['class'];
                                         $aSearch['sType'] = $aSearchTerm['type'];
-                                        if (sizeof($aSearch['aName'])) $aSearch['sOperator'] = 'name';
-                                        else $aSearch['sOperator'] = 'near'; // near = in for the moment
-                                        if (strlen($aSearchTerm['operator']) == 0) $aSearch['iSearchRank'] += 1;
+                                        if ($aSearchTerm['operator'] == '') {
+                                            $aSearch['sOperator'] = sizeof($aSearch['aName']) ? 'name' :  'near';
+                                            $aSearch['iSearchRank'] += 2;
+                                        } else {
+                                            $aSearch['sOperator'] = 'near'; // near = in for the moment
+                                        }
 
                                         if ($aSearch['iSearchRank'] < $this->iMaxRank) $aNewWordsetSearches[] = $aSearch;
                                     }
@@ -913,6 +919,13 @@ class Geocode
     {
         if (!$this->sQuery && !$this->aStructuredQuery) return array();
 
+        $oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules);
+        if ($oNormalizer !== null) {
+            $sNormQuery = $oNormalizer->transliterate($this->sQuery);
+        } else {
+            $sNormQuery = null;
+        }
+
         $sLanguagePrefArraySQL = "ARRAY[".join(',', array_map("getDBQuoted", $this->aLangPrefOrder))."]";
         $sCountryCodesSQL = false;
         if ($this->aCountryCodes) {
@@ -1139,7 +1152,7 @@ class Geocode
                 // array with: placeid => -1 | tiger-housenumber
                 $aResultPlaceIDs = array();
 
-                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases);
+                $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
 
                 if ($this->bReverseInPlan) {
                     // Reverse phrase array and also reverse the order of the wordsets in
@@ -1151,7 +1164,7 @@ class Geocode
                         $aFinalPhrase = end($aPhrases);
                         $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
                     }
-                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false);
+                    $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
 
                     foreach ($aGroupedSearches as $aSearches) {
                         foreach ($aSearches as $aSearch) {
index 16711542faee15872a176e6eb0f10f20185f293c..9f694c89fb24699a62ee1d05c09b20b57c98f59a 100644 (file)
@@ -17,6 +17,10 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
 // codes, to restrict import to a subset of languages.
 // Currently only affects the import of country names and special phrases.
 @define('CONST_Languages', false);
+// Rules for normalizing terms for comparison before doing comparisons.
+// The default is to remove accents and punctuation and to lower-case the
+// term. Spaces are kept but collapsed to one standard space.
+@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
 
 // Set to false to avoid importing extra postcodes for the US.
 @define('CONST_Use_Extra_US_Postcodes', true);
index 156169761fef71102ff21f47a802375f5796064b..1a4a51d758e9b9ce7c1ece8280caeb11743f2c7a 100755 (executable)
@@ -19,7 +19,7 @@ getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
 include(CONST_InstallPath.'/settings/phrase_settings.php');
 
 if ($aCMDResult['wiki-import']) {
-    $oNormalizer = Transliterator::createFromRules(":: NFD (); [:Nonspacing Mark:] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
+    $oNormalizer = Transliterator::createFromRules(CONST_Term_Normalization_Rules);
     $aPairs = array();
 
     $sLanguageIn = CONST_Languages ? CONST_Languages :
@@ -32,7 +32,11 @@ if ($aCMDResult['wiki-import']) {
         if (preg_match_all('#\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([^|]+) \\|\\| ([\\-YN])#', $sWikiPageXML, $aMatches, PREG_SET_ORDER)) {
             foreach ($aMatches as $aMatch) {
                 $sLabel = trim($aMatch[1]);
-                $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel));
+                if ($oNormalizer !== null) {
+                    $sTrans = pg_escape_string($oNormalizer->transliterate($sLabel));
+                } else {
+                    $sTrans = null;
+                }
                 $sClass = trim($aMatch[2]);
                 $sType = trim($aMatch[3]);
                 // hack around a bug where building=yes was imported with