From: Sarah Hoffmann Date: Tue, 26 Oct 2021 15:02:16 +0000 (+0200) Subject: Revert "ICU: additional ranking by matching of normalised term" X-Git-Tag: deploy~139 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/aca6439eb7bc2cc6a6c2d7df0ebdb299c96c9c41?ds=sidebyside Revert "ICU: additional ranking by matching of normalised term" This reverts commit b6a831443c8f46bfe1be59ecd1cb7a6847fdf948. --- diff --git a/lib-php/TokenWord.php b/lib-php/TokenWord.php index b9e28d91..59456e35 100644 --- a/lib-php/TokenWord.php +++ b/lib-php/TokenWord.php @@ -13,15 +13,12 @@ class Word private $iSearchNameCount; /// Number of terms in the word. private $iTermCount; - /// Match score. - private $iMatchScore; - public function __construct($iId, $iSearchNameCount, $iTermCount, $iMatchScore = 1) + public function __construct($iId, $iSearchNameCount, $iTermCount) { $this->iId = $iId; $this->iSearchNameCount = $iSearchNameCount; $this->iTermCount = $iTermCount; - $this->iMatchScore = $iMatchScore; } public function getId() @@ -66,13 +63,13 @@ class Word if ($this->iTermCount > 1 && ($oPosition->isPhrase('') || !$oPosition->isFirstPhrase()) ) { - $oNewSearch = $oSearch->clone($this->iMatchScore); + $oNewSearch = $oSearch->clone(1); $oNewSearch->addAddressToken($this->iId); return array($oNewSearch); } } elseif (!$oSearch->hasName(true)) { - $oNewSearch = $oSearch->clone($this->iMatchScore); + $oNewSearch = $oSearch->clone(1); $oNewSearch->addNameToken( $this->iId, CONST_Search_NameOnlySearchFrequencyThreshold @@ -93,8 +90,7 @@ class Word 'Type' => 'word', 'Info' => array( 'count' => $this->iSearchNameCount, - 'terms' => $this->iTermCount, - 'score' => $this->iMatchScore + 'terms' => $this->iTermCount ) ); } diff --git a/lib-php/tokenizer/icu_tokenizer.php b/lib-php/tokenizer/icu_tokenizer.php index cebdac47..f4dd3aeb 100644 --- a/lib-php/tokenizer/icu_tokenizer.php +++ b/lib-php/tokenizer/icu_tokenizer.php @@ -89,26 +89,19 @@ class Tokenizer $aWordLists = array(); $aTokens = array(); foreach ($aPhrases as $iPhrase => $oPhrase) { - $sNormPhrase = $this->normalizeString($oPhrase->getPhrase()); - Debug::printVar('Phrase', $sNormPhrase); - - $oWordList = new SimpleWordList($sNormPhrase); - - foreach ($oWordList->getTokens() as $sToken) { - $sTransToken = $this->makeStandardWord($sToken); - if (!isset($aTokens[$sTransToken])) { - $aTokens[$sTransToken] = array(); - } - $aTokens[$sTransToken][$sToken] = $sToken; - } + $sNormQuery .= ','.$this->normalizeString($oPhrase->getPhrase()); + $sPhrase = $this->makeStandardWord($oPhrase->getPhrase()); + Debug::printVar('Phrase', $sPhrase); + $oWordList = new SimpleWordList($sPhrase); + $aTokens = array_merge($aTokens, $oWordList->getTokens()); $aWordLists[] = $oWordList; } Debug::printVar('Tokens', $aTokens); Debug::printVar('WordLists', $aWordLists); - $oValidTokens = $this->computeValidTokens($aTokens); + $oValidTokens = $this->computeValidTokens($aTokens, $sNormQuery); foreach ($aPhrases as $iPhrase => $oPhrase) { $oPhrase->setWordSets($aWordLists[$iPhrase]->getWordSets($oValidTokens)); @@ -118,16 +111,16 @@ class Tokenizer } - private function computeValidTokens($aTokens) + private function computeValidTokens($aTokens, $sNormQuery) { $oValidTokens = new TokenList(); if (!empty($aTokens)) { - $this->addTokensFromDB($oValidTokens, $aTokens); + $this->addTokensFromDB($oValidTokens, $aTokens, $sNormQuery); // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken => $aNormalized) { - if (!$oValidTokens->contains($sToken)) { + foreach ($aTokens as $sToken) { + if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) { if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { // US ZIP+4 codes - merge in the 5-digit ZIP code $oValidTokens->addToken( @@ -150,7 +143,7 @@ class Tokenizer } - private function addTokensFromDB(&$oValidTokens, $aTokens) + private function addTokensFromDB(&$oValidTokens, $aTokens, $sNormQuery) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, type, word,'; @@ -158,7 +151,7 @@ class Tokenizer $sSQL .= " info->>'class' as class, info->>'type' as ctype,"; $sSQL .= " info->>'count' as count"; $sSQL .= ' FROM word WHERE word_token in ('; - $sSQL .= join(',', $this->oDB->getDBQuotedList(array_keys($aTokens))).')'; + $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')'; Debug::printSQL($sSQL); @@ -167,23 +160,18 @@ class Tokenizer foreach ($aDBWords as $aWord) { $iId = (int) $aWord['word_id']; $sTok = $aWord['word_token']; - $aNorms = $aTokens[$sTok]; switch ($aWord['type']) { case 'C': // country name tokens if ($aWord['word'] !== null) { - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken( - $sNorm, - new Token\Country($iId, $aWord['word']) - ); - } + $oValidTokens->addToken( + $sTok, + new Token\Country($iId, $aWord['word']) + ); } break; case 'H': // house number tokens - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\HouseNumber($iId, $aWord['word_token'])); - } + $oValidTokens->addToken($sTok, new Token\HouseNumber($iId, $aWord['word_token'])); break; case 'P': // postcode tokens // Postcodes are not normalized, so they may have content @@ -193,48 +181,37 @@ class Tokenizer && pg_escape_string($aWord['word']) == $aWord['word'] ) { $sNormPostcode = $this->normalizeString($aWord['word']); - foreach ($aNorms as $sNorm) { - if ($sNormPostcode == $sNorm) { - $oValidTokens->addToken( - $sNorm, - new Token\Postcode($iId, $aWord['word'], null) - ); - } + if (strpos($sNormQuery, $sNormPostcode) !== false) { + $oValidTokens->addToken( + $sTok, + new Token\Postcode($iId, $aWord['word'], null) + ); } } break; case 'S': // tokens for classification terms (special phrases) if ($aWord['class'] !== null && $aWord['ctype'] !== null) { - foreach ($aNorms as $sNorm) { - if ($aWord['word'] == $sNorm) { - $oValidTokens->addToken($sTok, new Token\SpecialTerm( - $iId, - $aWord['class'], - $aWord['ctype'], - (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE - )); - } - } - } - break; - case 'W': // full-word tokens - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\Word( + $oValidTokens->addToken($sTok, new Token\SpecialTerm( $iId, - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') + 1, - levenshtein($aWord['word'], $sNorm) + 1 + $aWord['class'], + $aWord['ctype'], + (isset($aWord['operator'])) ? Operator::NEAR : Operator::NONE )); } break; + case 'W': // full-word tokens + $oValidTokens->addToken($sTok, new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + )); + break; case 'w': // partial word terms - foreach ($aNorms as $sNorm) { - $oValidTokens->addToken($sNorm, new Token\Partial( - $iId, - $aWord['word_token'], - (int) $aWord['count'] - )); - } + $oValidTokens->addToken($sTok, new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + )); break; default: break;