From: Sarah Hoffmann Date: Sun, 15 Oct 2017 16:07:55 +0000 (+0200) Subject: Merge pull request #814 from lonvia/phrase-as-a-class X-Git-Tag: v3.1.0~43 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/fcf7fcee03c8d9a67e1ecace61df81aa81201936?hp=7ea1ef3feb587b58f74784e6802c2e74d90e22cc Merge pull request #814 from lonvia/phrase-as-a-class Make phrases a class and add early checking of token validity --- diff --git a/lib/Geocode.php b/lib/Geocode.php index 16919bb8..be543012 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -3,6 +3,7 @@ namespace Nominatim; require_once(CONST_BasePath.'/lib/PlaceLookup.php'); +require_once(CONST_BasePath.'/lib/Phrase.php'); require_once(CONST_BasePath.'/lib/ReverseGeocode.php'); require_once(CONST_BasePath.'/lib/SearchDescription.php'); require_once(CONST_BasePath.'/lib/SearchContext.php'); @@ -668,7 +669,7 @@ class Geocode return $aSearchResults; } - public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery) + public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured) { /* Calculate all searches using aValidTokens i.e. @@ -683,15 +684,11 @@ class Geocode */ $iGlobalRank = 0; - foreach ($aPhrases as $iPhrase => $aPhrase) { + foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - if ($bStructuredPhrases) { - $sPhraseType = $aPhraseTypes[$iPhrase]; - } else { - $sPhraseType = ''; - } + $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; - foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) { + foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) { // Too many permutations - too expensive if ($iWordSet > 120) break; @@ -710,17 +707,8 @@ class Geocode // If the token is valid if (isset($aValidTokens[' '.$sToken])) { foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) { - // Recheck if the original word shows up in the query. 
- $bWordInQuery = false; - if (isset($aSearchTerm['word']) && $aSearchTerm['word']) { - $bWordInQuery = strpos( - $sNormQuery, - $this->normTerm($aSearchTerm['word']) - ) !== false; - } $aNewSearches = $oCurrentSearch->extendWithFullTerm( $aSearchTerm, - $bWordInQuery, isset($aValidTokens[$sToken]) && strpos($sToken, ' ') === false, $sPhraseType, @@ -746,9 +734,8 @@ class Geocode foreach ($aValidTokens[$sToken] as $aSearchTerm) { $aNewSearches = $oCurrentSearch->extendWithPartialTerm( $aSearchTerm, - $bStructuredPhrases, + $bIsStructured, $iPhrase, - $aWordFrequencyScores, isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array() ); @@ -806,7 +793,7 @@ class Geocode // Revisit searches, drop bad searches and give penalty to unlikely combinations. $aGroupedSearches = array(); foreach ($aSearches as $oSearch) { - if (!$oSearch->isValidSearch($this->aCountryCodes)) { + if (!$oSearch->isValidSearch()) { continue; } @@ -955,10 +942,10 @@ class Geocode // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split if ($this->aStructuredQuery) { - $aPhrases = $this->aStructuredQuery; + $aInPhrases = $this->aStructuredQuery; $bStructuredPhrases = true; } else { - $aPhrases = explode(',', $sQuery); + $aInPhrases = explode(',', $sQuery); $bStructuredPhrases = false; } @@ -967,25 +954,19 @@ class Geocode // Get all 'sets' of words // Generate a complete list of all $aTokens = array(); - foreach ($aPhrases as $iPhrase => $sPhrase) { - $aPhrase = chksql( - $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"), + $aPhrases = array(); + foreach ($aInPhrases as $iPhrase => $sPhrase) { + $sPhrase = chksql( + $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'), "Cannot normalize query string (is it a UTF-8 string?)" ); - if (trim($aPhrase['string'])) { - $aPhrases[$iPhrase] = $aPhrase; - $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']); - 
$aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0); - $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets'])); - } else { - unset($aPhrases[$iPhrase]); + if (trim($sPhrase)) { + $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : ''); + $oPhrase->addTokens($aTokens); + $aPhrases[] = $oPhrase; } } - // Reindex phrases - we make assumptions later on that they are numerically keyed in order - $aPhraseTypes = array_keys($aPhrases); - $aPhrases = array_values($aPhrases); - if (sizeof($aTokens)) { // Check which tokens we have, get the ID numbers $sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count'; @@ -999,22 +980,29 @@ class Geocode $this->oDB->getAll($sSQL), "Could not get word tokens." ); - $aPossibleMainWordIDs = array(); $aWordFrequencyScores = array(); foreach ($aDatabaseWords as $aToken) { - // Very special case - require 2 letter country param to match the country code found - if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country']) - && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code'] + // Filter country tokens that do not match restricted countries. + if ($this->aCountryCodes + && $aToken['country_code'] + && !in_array($aToken['country_code'], $this->aCountryCodes) ) { continue; } + // Special terms need to appear in their normalized form. 
+ if ($aToken['word'] && $aToken['class']) { + $sNormWord = $this->normTerm($aToken['word']); + if (strpos($sNormQuery, $sNormWord) === false) { + continue; + } + } + if (isset($aValidTokens[$aToken['word_token']])) { $aValidTokens[$aToken['word_token']][] = $aToken; } else { $aValidTokens[$aToken['word_token']] = array($aToken); } - if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1; $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1; } if (CONST_Debug) var_Dump($aPhrases, $aValidTokens); @@ -1046,19 +1034,18 @@ class Geocode // Any words that have failed completely? // TODO: suggestions - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases); if ($this->bReverseInPlan) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. 
$aPhrases = array_reverse($aPhrases); - $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0); + $aPhrases[0]->invertWordSets(); if (sizeof($aPhrases) > 1) { - $aFinalPhrase = end($aPhrases); - $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0); + $aPhrases[sizeof($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { @@ -1288,8 +1275,7 @@ class Geocode $aResult['name'] = $aResult['langaddress']; - if ($oCtx->hasNearPoint()) - { + if ($oCtx->hasNearPoint()) { $aResult['importance'] = 0.001; $aResult['foundorder'] = $aResult['addressimportance']; } else { diff --git a/lib/Phrase.php b/lib/Phrase.php new file mode 100644 index 00000000..b39079d9 --- /dev/null +++ b/lib/Phrase.php @@ -0,0 +1,116 @@ +sPhrase = trim($sPhrase); + $this->sPhraseType = $sPhraseType; + $this->aWords = explode(' ', $this->sPhrase); + $this->aWordSets = $this->createWordSets($this->aWords, 0); + } + + /** + * Return the element type of the phrase. + * + * @return string Phrase type if the phrase comes from a structured query + * or empty string otherwise. + */ + public function getPhraseType() + { + return $this->sPhraseType; + } + + /** + * Return the array of possible segmentations of the phrase. + * + * @return string[][] Array of segmentations, each consisting of an + * array of terms. + */ + public function getWordSets() + { + return $this->aWordSets; + } + + /** + * Add the tokens from this phrase to the given list of tokens. + * + * @param string[] $aTokens List of tokens to append. 
+ * + * @return void + */ + public function addTokens(&$aTokens) + { + foreach ($this->aWordSets as $aSet) { + foreach ($aSet as $sWord) { + $aTokens[' '.$sWord] = ' '.$sWord; + $aTokens[$sWord] = $sWord; + } + } + } + + /** + * Invert the set of possible segmentations. + * + * @return void + */ + public function invertWordSets() + { + $this->aWordSets = $this->createInverseWordSets($this->aWords, 0); + } + + private function createWordSets($aWords, $iDepth) + { + $aResult = array(array(join(' ', $aWords))); + $sFirstToken = ''; + if ($iDepth < Phrase::MAX_DEPTH) { + while (sizeof($aWords) > 1) { + $sWord = array_shift($aWords); + $sFirstToken .= ($sFirstToken?' ':'').$sWord; + $aRest = $this->createWordSets($aWords, $iDepth + 1); + foreach ($aRest as $aSet) { + $aResult[] = array_merge(array($sFirstToken), $aSet); + } + } + } + + return $aResult; + } + + private function createInverseWordSets($aWords, $iDepth) + { + $aResult = array(array(join(' ', $aWords))); + $sFirstToken = ''; + if ($iDepth < Phrase::MAX_DEPTH) { + while (sizeof($aWords) > 1) { + $sWord = array_pop($aWords); + $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; + $aRest = $this->createInverseWordSets($aWords, $iDepth + 1); + foreach ($aRest as $aSet) { + $aResult[] = array_merge(array($sFirstToken), $aSet); + } + } + } + + return $aResult; + } +} diff --git a/lib/SearchDescription.php b/lib/SearchDescription.php index 1f3765ab..eba5f6a9 100644 --- a/lib/SearchDescription.php +++ b/lib/SearchDescription.php @@ -155,22 +155,17 @@ class SearchDescription /** * Check if the combination of parameters is sensible. * - * @param string[] $aCountryCodes List of country codes. - * * @return bool True, if the search looks valid. 
*/ - public function isValidSearch(&$aCountryCodes) + public function isValidSearch() { if (!sizeof($this->aName)) { if ($this->sHouseNumber) { return false; } - } - if ($aCountryCodes - && $this->sCountryCode - && !in_array($this->sCountryCode, $aCountryCodes) - ) { - return false; + if (!$this->sClass && !$this->sCountryCode) { + return false; + } } return true; @@ -183,8 +178,6 @@ class SearchDescription * Derive new searches by adding a full term to the existing search. * * @param mixed[] $aSearchTerm Description of the token. - * @param bool $bWordInQuery True, if the normalised version of the word - * is contained in the query. * @param bool $bHasPartial True if there are also tokens of partial terms * with the same name. * @param string $sPhraseType Type of phrase the token is contained in. @@ -198,7 +191,7 @@ class SearchDescription * * @return SearchDescription[] List of derived search descriptions. */ - public function extendWithFullTerm($aSearchTerm, $bWordInQuery, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank) + public function extendWithFullTerm($aSearchTerm, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank) { $aNewSearches = array(); @@ -229,7 +222,8 @@ class SearchDescription // We need to try the case where the postal code is the primary element // (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode) // so try both. 
- if (!$this->sPostcode && $bWordInQuery + if (!$this->sPostcode + && $aSearchTerm['word'] && pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word'] ) { // If we have structured search or this is the first term, @@ -278,16 +272,8 @@ class SearchDescription } $aNewSearches[] = $oSearch; } - } elseif ($sPhraseType == '' - && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null - ) { - // require a normalized exact match of the term - // if we have the normalizer version of the query - // available - if ($this->iOperator == Operator::NONE - && (isset($aSearchTerm['word']) && $aSearchTerm['word']) - && $bWordInQuery - ) { + } elseif ($sPhraseType == '' && $aSearchTerm['class']) { + if ($this->iOperator == Operator::NONE) { $oSearch = clone $this; $oSearch->iSearchRank++; @@ -302,7 +288,10 @@ class SearchDescription $oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']); $aNewSearches[] = $oSearch; } - } elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) { + } elseif (isset($aSearchTerm['word_id']) + && $aSearchTerm['word_id'] + && $sPhraseType != 'country' + ) { $iWordID = $aSearchTerm['word_id']; if (sizeof($this->aName)) { if (($sPhraseType == '' || !$bFirstPhrase) @@ -330,17 +319,15 @@ class SearchDescription /** * Derive new searches by adding a partial term to the existing search. * - * @param mixed[] $aSearchTerm Description of the token. - * @param bool $bStructuredPhrases True if the search is structured. - * @param integer $iPhrase Number of the phrase the token is in. - * @param mixed[] $aWordFrequencyScores Number of times tokens appears - * overall in a planet database. - * @param array[] $aFullTokens List of full term tokens with the - * same name. + * @param mixed[] $aSearchTerm Description of the token. + * @param bool $bStructuredPhrases True if the search is structured. + * @param integer $iPhrase Number of the phrase the token is in. 
+ * @param array[] $aFullTokens List of full term tokens with the + * same name. * * @return SearchDescription[] List of derived search descriptions. */ - public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, &$aWordFrequencyScores, $aFullTokens) + public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens) { // Only allow name terms. if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) { @@ -354,7 +341,7 @@ class SearchDescription && sizeof($this->aName) && strpos($aSearchTerm['word_token'], ' ') === false ) { - if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) { + if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) { $oSearch = clone $this; $oSearch->iSearchRank++; $oSearch->aAddress[$iWordID] = $iWordID; @@ -397,7 +384,7 @@ class SearchDescription if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) { $oSearch->iSearchRank += 2; } - if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) { + if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) { $oSearch->aName[$iWordID] = $iWordID; } else { $oSearch->aNameNonSearch[$iWordID] = $iWordID; diff --git a/lib/lib.php b/lib/lib.php index b5fbee3e..76775d6c 100644 --- a/lib/lib.php +++ b/lib/lib.php @@ -60,54 +60,6 @@ function byImportance($a, $b) } -function getWordSets($aWords, $iDepth) -{ - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 7) { - while (sizeof($aWords) > 1) { - $sWord = array_shift($aWords); - $sFirstToken .= ($sFirstToken?' 
':'').$sWord; - $aRest = getWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; -} - -function getInverseWordSets($aWords, $iDepth) -{ - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < 8) { - while (sizeof($aWords) > 1) { - $sWord = array_pop($aWords); - $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; - $aRest = getInverseWordSets($aWords, $iDepth+1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } - return $aResult; -} - - -function getTokensFromSets($aSets) -{ - $aTokens = array(); - foreach ($aSets as $aSet) { - foreach ($aSet as $sWord) { - $aTokens[' '.$sWord] = ' '.$sWord; - $aTokens[$sWord] = $sWord; - } - } - return $aTokens; -} - - function getClassTypes() { return array( diff --git a/test/php/Nominatim/NominatimTest.php b/test/php/Nominatim/NominatimTest.php index 33bb6d32..cae3ebb8 100644 --- a/test/php/Nominatim/NominatimTest.php +++ b/test/php/Nominatim/NominatimTest.php @@ -66,76 +66,6 @@ class NominatimTest extends \PHPUnit_Framework_TestCase } - public function testGetWordSets() - { - // given an array of arrays like - // array( array('a','b'), array('c','d') ) - // returns a summary as string: '(a|b),(c|d)' - - - function serializeSets($aSets) - { - $aParts = array(); - foreach ($aSets as $aSet) { - $aParts[] = '(' . join('|', $aSet) . 
')'; - } - return join(',', $aParts); - } - - $this->assertEquals( - array(array('')), - getWordSets(array(), 0) - ); - - $this->assertEquals( - '(a)', - serializeSets(getWordSets(array("a"), 0)) - ); - - $this->assertEquals( - '(a b),(a|b)', - serializeSets(getWordSets(array('a', 'b'), 0)) - ); - - $this->assertEquals( - '(a b c),(a|b c),(a|b|c),(a b|c)', - serializeSets(getWordSets(array('a', 'b', 'c'), 0)) - ); - - $this->assertEquals( - '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', - serializeSets(getWordSets(array('a', 'b', 'c', 'd'), 0)) - ); - - - // Inverse - $this->assertEquals( - '(a b c),(c|a b),(c|b|a),(b c|a)', - serializeSets(getInverseWordSets(array('a', 'b', 'c'), 0)) - ); - - - // make sure we don't create too many sets - // 4 words => 8 sets - // 10 words => 511 sets - // 15 words => 12911 sets - // 18 words => 65536 sets - // 20 words => 169766 sets - // 22 words => 401930 sets - // 28 words => 3505699 sets (needs more than 4GB via 'phpunit -d memory_limit=' to run) - $this->assertEquals( - 8, - count(getWordSets(array_fill(0, 4, 'a'), 0)) - ); - - - $this->assertEquals( - 41226, - count(getWordSets(array_fill(0, 18, 'a'), 0)) - ); - } - - public function testCreatePointsAroundCenter() { // you might say we're creating a circle diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php new file mode 100644 index 00000000..db8d8b50 --- /dev/null +++ b/test/php/Nominatim/PhraseTest.php @@ -0,0 +1,87 @@ +assertEquals( + array(array('')), + $oPhrase->getWordSets() + ); + } + + + public function testSingleWordPhrase() + { + $oPhrase = new Phrase('a', ''); + + $this->assertEquals( + '(a)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testMultiWordPhrase() + { + $oPhrase = new Phrase('a b', ''); + $this->assertEquals( + '(a b),(a|b)', + $this->serializeSets($oPhrase->getWordSets()) + ); + + $oPhrase = new Phrase('a b c', ''); + $this->assertEquals( + '(a b 
c),(a|b c),(a|b|c),(a b|c)', + $this->serializeSets($oPhrase->getWordSets()) + ); + + $oPhrase = new Phrase('a b c d', ''); + $this->assertEquals( + '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testInverseWordSets() + { + $oPhrase = new Phrase('a b c', ''); + $oPhrase->invertWordSets(); + + $this->assertEquals( + '(a b c),(c|a b),(c|b|a),(b c|a)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testMaxDepth() + { + $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); + $this->assertEquals(8, count($oPhrase->getWordSets())); + $oPhrase->invertWordSets(); + $this->assertEquals(8, count($oPhrase->getWordSets())); + + $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); + $this->assertEquals(41226, count($oPhrase->getWordSets())); + $oPhrase->invertWordSets(); + $this->assertEquals(41226, count($oPhrase->getWordSets())); + } +}