git.openstreetmap.org Git - nominatim.git/commitdiff
improve handling of multi-word partials in SearchDescription
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 25 Nov 2020 10:44:25 +0000 (11:44 +0100)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 25 Nov 2020 11:07:04 +0000 (12:07 +0100)
Multi-word partial terms had an undue advantage over separate partial
terms because they only needed to pay the penalty once. This changes
the behaviour by setting the penalty according to the number of
words in the token. This should get rid of search interpretations
with a low chance of matching.

This also fixes handling of exact term matching. We now match against
all exact terms of the query, not just a couple of them collected
while building the interpretations.

Also adds a penalty to very short postcodes (fewer than four characters).

lib/Geocode.php
lib/SearchContext.php
lib/SearchDescription.php
lib/TokenList.php
lib/TokenWord.php
test/php/Nominatim/TokenListTest.php

index 69b6f41ca46e1085d2917592dc2757780bbbc64a..ed02848eac4c2c846ea1a655089caa75aca879cd 100644 (file)
@@ -650,6 +650,8 @@ class Geocode
                     $this->oNormalizer
                 );
 
+                $oCtx->setFullNameWords($oValidTokens->getFullWordIDs());
+
                 // Try more interpretations for Tokens that could not be matched.
                 foreach ($aTokens as $sToken) {
                     if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
index 3d399bdcf62967d63fe96b5f3d614d315f1c9c4e..c2898d274b44266aa567fb74cddfd9790a67df13 100644 (file)
@@ -32,7 +32,18 @@ class SearchContext
     public $sqlCountryList = '';
     /// List of place IDs to exclude (as SQL).
     private $sqlExcludeList = '';
+    /// Subset of word ids of full words in the query.
+    private $aFullNameWords = array();
 
+    public function setFullNameWords($aWordList)
+    {
+        $this->aFullNameWords = $aWordList;
+    }
+
+    public function getFullNameTerms()
+    {
+        return $this->aFullNameWords;
+    }
 
     /**
      * Check if a reference point is defined.
index ad404528968ad94075bc414af0bc0f5a185873cc..94ba87ff76480e3e237010e0b7005f5fa835034b 100644 (file)
@@ -21,8 +21,6 @@ class SearchDescription
     private $bRareName = false;
     /// List of word ids making up the address of the object.
     private $aAddress = array();
-    /// Subset of word ids of full words making up the address.
-    private $aFullNameAddress = array();
     /// List of word ids that appear in the name but should be ignored.
     private $aNameNonSearch = array();
     /// List of word ids that appear in the address but should be ignored.
@@ -219,6 +217,9 @@ class SearchDescription
                 ) {
                     $oSearch = clone $this;
                     $oSearch->iSearchRank++;
+                    if (strlen($oSearchTerm->sPostcode) < 4) {
+                        $oSearch->iSearchRank += 4 - strlen($oSearchTerm->sPostcode);
+                    }
                     $oSearch->sPostcode = $oSearchTerm->sPostcode;
                     $aNewSearches[] = $oSearch;
                 }
@@ -283,11 +284,9 @@ class SearchDescription
             if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) {
                 if (($sPhraseType == '' || !$bFirstPhrase) && !$bHasPartial) {
                     $oSearch = clone $this;
-                    $oSearch->iSearchRank += 2;
+                    $oSearch->iSearchRank += 3 * $oSearchTerm->iTermCount;
                     $oSearch->aAddress[$iWordID] = $iWordID;
                     $aNewSearches[] = $oSearch;
-                } else {
-                    $this->aFullNameAddress[$iWordID] = $iWordID;
                 }
             } else {
                 $oSearch = clone $this;
@@ -333,16 +332,19 @@ class SearchDescription
         ) {
             if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) {
                 $oSearch = clone $this;
-                $oSearch->iSearchRank += 2;
+                $oSearch->iSearchRank += $oSearchTerm->iTermCount;
+                if (empty($this->aName)) {
+                    $oSearch->iSearchRank++;
+                }
+                if (preg_match('#^[0-9]+$#', $sToken)) {
+                    $oSearch->iSearchRank++;
+                }
                 $oSearch->aAddress[$iWordID] = $iWordID;
                 $aNewSearches[] = $oSearch;
             } else {
                 $oSearch = clone $this;
                 $oSearch->iSearchRank++;
                 $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
-                if (preg_match('#^[0-9]+$#', $sToken)) {
-                    $oSearch->iSearchRank += 2;
-                }
                 if (!empty($aFullTokens)) {
                     $oSearch->iSearchRank++;
                 }
@@ -352,7 +354,7 @@ class SearchDescription
                 foreach ($aFullTokens as $oSearchTermToken) {
                     if (is_a($oSearchTermToken, '\Nominatim\Token\Word')) {
                         $oSearch = clone $this;
-                        $oSearch->iSearchRank++;
+                        $oSearch->iSearchRank += 3;
                         $oSearch->aAddress[$oSearchTermToken->iId]
                             = $oSearchTermToken->iId;
                         $aNewSearches[] = $oSearch;
@@ -691,10 +693,11 @@ class SearchDescription
         $sImportanceSQL .= $this->oContext->viewboxImportanceSQL('centroid');
         $aOrder[] = "$sImportanceSQL DESC";
 
-        if (!empty($this->aFullNameAddress)) {
+        $aFullNameAddress = $this->oContext->getFullNameTerms();
+        if (!empty($aFullNameAddress)) {
             $sExactMatchSQL = ' ( ';
             $sExactMatchSQL .= ' SELECT count(*) FROM ( ';
-            $sExactMatchSQL .= '  SELECT unnest('.$oDB->getArraySQL($this->aFullNameAddress).')';
+            $sExactMatchSQL .= '  SELECT unnest('.$oDB->getArraySQL($aFullNameAddress).')';
             $sExactMatchSQL .= '    INTERSECT ';
             $sExactMatchSQL .= '  SELECT unnest(nameaddress_vector)';
             $sExactMatchSQL .= ' ) s';
index fce5f940b84513a6bc1850cbbbdb5e9fa043682c..78fa6444af63ccf3f90d787cb1498560fbb44a52 100644 (file)
@@ -80,6 +80,21 @@ class TokenList
         return isset($this->aTokens[$sWord]) ? $this->aTokens[$sWord] : array();
     }
 
+    public function getFullWordIDs()
+    {
+        $ids = array();
+
+        foreach($this->aTokens as $aTokenList) {
+            foreach($aTokenList as $oToken) {
+                if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) {
+                    $ids[$oToken->iId] = $oToken->iId;
+                }
+            }
+        }
+
+        return $ids;
+    }
+
     /**
      * Add token information from the word table in the database.
      *
@@ -151,7 +166,8 @@ class TokenList
                 $oToken = new Token\Word(
                     $iId,
                     $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count']
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
                 );
             }
 
index 54622cbcb081c05cd3bd3b67d663ef938455763c..fc28535d4582e459f5d88c72b8977efaf1930fa9 100644 (file)
@@ -13,12 +13,15 @@ class Word
     public $bPartial;
     /// Number of appearances in the database.
     public $iSearchNameCount;
+    /// Number of terms in the word.
+    public $iTermCount;
 
-    public function __construct($iId, $bPartial, $iSearchNameCount)
+    public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount)
     {
         $this->iId = $iId;
         $this->bPartial = $bPartial;
         $this->iSearchNameCount = $iSearchNameCount;
+        $this->iTermCount = $iTermCount;
     }
 
     public function debugInfo()
index 191a09dceeb6a002a5c6c4103cb0557ad06178a3..ca43aabb3d71bfa6780d50b24d26aa6f5cbcf81c 100644 (file)
@@ -121,6 +121,6 @@ class TokenTest extends \PHPUnit\Framework\TestCase
         $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051'));
         $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne'));
         $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286'));
-        $this->assertEquals(array(new Token\Word(999, true, 533)), $TL->get('darmstadt'));
+        $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt'));
     }
 }