From 0f87da017f83b321770044e52d7034894448dee6 Mon Sep 17 00:00:00 2001 From: Sarah Hoffmann Date: Wed, 25 Nov 2020 11:44:25 +0100 Subject: [PATCH] improve handling of multi-word partials in SearchDescription Multi-word partial terms had an undue advantage over separate partial terms because they only need to pay the penalty once. This changes the behaviour by setting the penalty according to the number of words in the token. This should get rid of search interpretations with low chance of matching. This also fixes handling of exact term matching. We now match against all exact terms of the query, not just a couple of them collected while building the interpretations. Also adds a penalty to very short postcodes. --- lib/Geocode.php | 2 ++ lib/SearchContext.php | 11 +++++++++++ lib/SearchDescription.php | 27 +++++++++++++++------------ lib/TokenList.php | 18 +++++++++++++++++- lib/TokenWord.php | 5 ++++- test/php/Nominatim/TokenListTest.php | 2 +- 6 files changed, 50 insertions(+), 15 deletions(-) diff --git a/lib/Geocode.php b/lib/Geocode.php index 69b6f41c..ed02848e 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -650,6 +650,8 @@ class Geocode $this->oNormalizer ); + $oCtx->setFullNameWords($oValidTokens->getFullWordIDs()); + // Try more interpretations for Tokens that could not be matched. foreach ($aTokens as $sToken) { if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { diff --git a/lib/SearchContext.php b/lib/SearchContext.php index 3d399bdc..c2898d27 100644 --- a/lib/SearchContext.php +++ b/lib/SearchContext.php @@ -32,7 +32,18 @@ class SearchContext public $sqlCountryList = ''; /// List of place IDs to exclude (as SQL). private $sqlExcludeList = ''; + /// Subset of word ids of full words in the query. + private $aFullNameWords = array(); + public function setFullNameWords($aWordList) + { + $this->aFullNameWords = $aWordList; + } + + public function getFullNameTerms() + { + return $this->aFullNameWords; + } /** * Check if a reference point is defined. diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index ad404528..94ba87ff 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -21,8 +21,6 @@ class SearchDescription private $bRareName = false; /// List of word ids making up the address of the object. private $aAddress = array(); - /// Subset of word ids of full words making up the address. - private $aFullNameAddress = array(); /// List of word ids that appear in the name but should be ignored. private $aNameNonSearch = array(); /// List of word ids that appear in the address but should be ignored. @@ -219,6 +217,9 @@ class SearchDescription ) { $oSearch = clone $this; $oSearch->iSearchRank++; + if (strlen($oSearchTerm->sPostcode) < 4) { + $oSearch->iSearchRank += 4 - strlen($oSearchTerm->sPostcode); + } $oSearch->sPostcode = $oSearchTerm->sPostcode; $aNewSearches[] = $oSearch; } @@ -283,11 +284,9 @@ class SearchDescription if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) { if (($sPhraseType == '' || !$bFirstPhrase) && !$bHasPartial) { $oSearch = clone $this; - $oSearch->iSearchRank += 2; + $oSearch->iSearchRank += 3 * $oSearchTerm->iTermCount; $oSearch->aAddress[$iWordID] = $iWordID; $aNewSearches[] = $oSearch; - } else { - $this->aFullNameAddress[$iWordID] = $iWordID; } } else { $oSearch = clone $this; @@ -333,16 +332,19 @@ class SearchDescription ) { if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) { $oSearch = clone $this; - $oSearch->iSearchRank += 2; + $oSearch->iSearchRank += $oSearchTerm->iTermCount; + if (empty($this->aName)) { + $oSearch->iSearchRank++; + } + if (preg_match('#^[0-9]+$#', $sToken)) { + $oSearch->iSearchRank++; + } $oSearch->aAddress[$iWordID] = $iWordID; $aNewSearches[] = $oSearch; } else { $oSearch = clone $this; $oSearch->iSearchRank++; $oSearch->aAddressNonSearch[$iWordID] = $iWordID; - if (preg_match('#^[0-9]+$#', $sToken)) { - $oSearch->iSearchRank += 2; - } if (!empty($aFullTokens)) { $oSearch->iSearchRank++; } @@ -352,7 +354,7 @@ class SearchDescription foreach ($aFullTokens as $oSearchTermToken) { if (is_a($oSearchTermToken, '\Nominatim\Token\Word')) { $oSearch = clone $this; - $oSearch->iSearchRank++; + $oSearch->iSearchRank += 3; $oSearch->aAddress[$oSearchTermToken->iId] = $oSearchTermToken->iId; $aNewSearches[] = $oSearch; @@ -691,10 +693,11 @@ class SearchDescription $sImportanceSQL .= $this->oContext->viewboxImportanceSQL('centroid'); $aOrder[] = "$sImportanceSQL DESC"; - if (!empty($this->aFullNameAddress)) { + $aFullNameAddress = $this->oContext->getFullNameTerms(); + if (!empty($aFullNameAddress)) { $sExactMatchSQL = ' ( '; $sExactMatchSQL .= ' SELECT count(*) FROM ( '; - $sExactMatchSQL .= ' SELECT unnest('.$oDB->getArraySQL($this->aFullNameAddress).')'; + $sExactMatchSQL .= ' SELECT unnest('.$oDB->getArraySQL($aFullNameAddress).')'; $sExactMatchSQL .= ' INTERSECT '; $sExactMatchSQL .= ' SELECT unnest(nameaddress_vector)'; $sExactMatchSQL .= ' ) s'; diff --git a/lib/TokenList.php b/lib/TokenList.php index fce5f940..78fa6444 100644 --- a/lib/TokenList.php +++ b/lib/TokenList.php @@ -80,6 +80,21 @@ class TokenList return isset($this->aTokens[$sWord]) ? $this->aTokens[$sWord] : array(); } + public function getFullWordIDs() + { + $ids = array(); + + foreach($this->aTokens as $aTokenList) { + foreach($aTokenList as $oToken) { + if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) { + $ids[$oToken->iId] = $oToken->iId; + } + } + } + + return $ids; + } + /** * Add token information from the word table in the database. * @@ -151,7 +166,8 @@ class TokenList $oToken = new Token\Word( $iId, $aWord['word_token'][0] != ' ', - (int) $aWord['count'] + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') ); } diff --git a/lib/TokenWord.php b/lib/TokenWord.php index 54622cbc..fc28535d 100644 --- a/lib/TokenWord.php +++ b/lib/TokenWord.php @@ -13,12 +13,15 @@ class Word public $bPartial; /// Number of appearances in the database. public $iSearchNameCount; + /// Number of terms in the word. + public $iTermCount; - public function __construct($iId, $bPartial, $iSearchNameCount) + public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount) { $this->iId = $iId; $this->bPartial = $bPartial; $this->iSearchNameCount = $iSearchNameCount; + $this->iTermCount = $iTermCount; } public function debugInfo() diff --git a/test/php/Nominatim/TokenListTest.php b/test/php/Nominatim/TokenListTest.php index 191a09dc..ca43aabb 100644 --- a/test/php/Nominatim/TokenListTest.php +++ b/test/php/Nominatim/TokenListTest.php @@ -121,6 +121,6 @@ class TokenTest extends \PHPUnit\Framework\TestCase $this->assertEquals(array(new Token\HouseNumber(999, '1051')), $TL->get('1051')); $this->assertEquals(array(new Token\Country(999, 'de')), $TL->get('alemagne')); $this->assertEquals(array(new Token\Postcode(999, '64286')), $TL->get('64286')); - $this->assertEquals(array(new Token\Word(999, true, 533)), $TL->get('darmstadt')); + $this->assertEquals(array(new Token\Word(999, true, 533, 0)), $TL->get('darmstadt')); } } -- 2.39.5