From: Sarah Hoffmann Date: Sun, 30 Jun 2019 08:48:58 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~282 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/b8ebc169b104dae171731288a5a8a7b569d22530?hp=1dfa9684b0dbe0d312449fd3041df6ee2bc13616 Merge remote-tracking branch 'upstream/master' --- diff --git a/lib/Geocode.php b/lib/Geocode.php index e2d67686..12410acc 100644 --- a/lib/Geocode.php +++ b/lib/Geocode.php @@ -348,10 +348,7 @@ class Geocode $aNewPhraseSearches = array(); $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; - foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) { - // Too many permutations - too expensive - if ($iWordSet > 120) break; - + foreach ($oPhrase->getWordSets() as $aWordset) { $aWordsetSearches = $aSearches; // Add all words from this wordset @@ -641,7 +638,6 @@ class Geocode } } - Debug::printDebugTable('Phrases', $aPhrases); Debug::printVar('Tokens', $aTokens); $oValidTokens = new TokenList(); @@ -686,6 +682,11 @@ class Geocode Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo()); + foreach ($aPhrases as $oPhrase) { + $oPhrase->computeWordSets($oValidTokens); + } + Debug::printDebugTable('Phrases', $aPhrases); + Debug::newSection('Search candidates'); $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases); diff --git a/lib/Phrase.php b/lib/Phrase.php index 7cf3f297..2e90537e 100644 --- a/lib/Phrase.php +++ b/lib/Phrase.php @@ -9,7 +9,8 @@ namespace Nominatim; */ class Phrase { - const MAX_DEPTH = 7; + public const MAX_WORDSET_LEN = 20; + public const MAX_WORDSETS = 100; // Complete phrase as a string. private $sPhrase; @@ -20,13 +21,24 @@ class Phrase // Possible segmentations of the phrase. private $aWordSets; + public static function cmpByArraylen($aA, $aB) + { + $iALen = count($aA); + $iBLen = count($aB); + + if ($iALen == $iBLen) { + return 0; + } + + return ($iALen < $iBLen) ? -1 : 1; + } + public function __construct($sPhrase, $sPhraseType) { $this->sPhrase = trim($sPhrase); $this->sPhraseType = $sPhraseType; $this->aWords = explode(' ', $this->sPhrase); - $this->aWordSets = $this->createWordSets($this->aWords, 0); } /** @@ -60,10 +72,17 @@ class Phrase */ public function addTokens(&$aTokens) { - foreach ($this->aWordSets as $aSet) { - foreach ($aSet as $sWord) { - $aTokens[' '.$sWord] = ' '.$sWord; - $aTokens[$sWord] = $sWord; + $iNumWords = count($this->aWords); + + for ($i = 0; $i < $iNumWords; $i++) { + $sPhrase = $this->aWords[$i]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; + + for ($j = $i + 1; $j < $iNumWords; $j++) { + $sPhrase .= ' '.$this->aWords[$j]; + $aTokens[' '.$sPhrase] = ' '.$sPhrase; + $aTokens[$sPhrase] = $sPhrase; } } } @@ -75,45 +94,60 @@ class Phrase */ public function invertWordSets() { - $this->aWordSets = $this->createInverseWordSets($this->aWords, 0); + foreach ($this->aWordSets as $i => $aSet) { + $this->aWordSets[$i] = array_reverse($aSet); + } } - private function createWordSets($aWords, $iDepth) + public function computeWordSets($oTokens) { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < Phrase::MAX_DEPTH) { - while (count($aWords) > 1) { - $sWord = array_shift($aWords); - $sFirstToken .= ($sFirstToken?' ':'').$sWord; - $aRest = $this->createWordSets($aWords, $iDepth + 1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); - } - } - } + $iNumWords = count($this->aWords); + // Caches the word set for the partial phrase up to word i. + $aSetCache = array_fill(0, $iNumWords, array()); - return $aResult; - } + // Initialise first element of cache. There can only be the word. + if ($oTokens->containsAny($this->aWords[0])) { + $aSetCache[0][] = array($this->aWords[0]); + } - private function createInverseWordSets($aWords, $iDepth) - { - $aResult = array(array(join(' ', $aWords))); - $sFirstToken = ''; - if ($iDepth < Phrase::MAX_DEPTH) { - while (count($aWords) > 1) { - $sWord = array_pop($aWords); - $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken; - $aRest = $this->createInverseWordSets($aWords, $iDepth + 1); - foreach ($aRest as $aSet) { - $aResult[] = array_merge(array($sFirstToken), $aSet); + // Now do the next elements using what we already have. + for ($i = 1; $i < $iNumWords; $i++) { + for ($j = $i; $j > 0; $j--) { + $sPartial = $j == $i ? $this->aWords[$j] : $this->aWords[$j].' '.$sPartial; + if (!empty($aSetCache[$j - 1]) && $oTokens->containsAny($sPartial)) { + $aPartial = array($sPartial); + foreach ($aSetCache[$j - 1] as $aSet) { + if (count($aSet) < Phrase::MAX_WORDSET_LEN) { + $aSetCache[$i][] = array_merge($aSet, $aPartial); + } + } + if (count($aSetCache[$i]) > 2 * Phrase::MAX_WORDSETS) { + usort( + $aSetCache[$i], + array('\Nominatim\Phrase', 'cmpByArraylen') + ); + $aSetCache[$i] = array_slice( + $aSetCache[$i], + 0, + Phrase::MAX_WORDSETS + ); + } } } + + // finally the current full phrase + $sPartial = $this->aWords[0].' '.$sPartial; + if ($oTokens->containsAny($sPartial)) { + $aSetCache[$i][] = array($sPartial); + } } - return $aResult; + $this->aWordSets = $aSetCache[$iNumWords - 1]; + usort($this->aWordSets, array('\Nominatim\Phrase', 'cmpByArraylen')); + $this->aWordSets = array_slice($this->aWordSets, 0, Phrase::MAX_WORDSETS); } + public function debugInfo() { return array( diff --git a/lib/TokenList.php b/lib/TokenList.php index 84dc98d0..fce5f940 100644 --- a/lib/TokenList.php +++ b/lib/TokenList.php @@ -55,6 +55,18 @@ class TokenList return isset($this->aTokens[$sWord]); } + /** + * Check if there are partial or full tokens for the given word. + * + * @param string $sWord Token word to look for. + * + * @return bool True if there is one or more token for the token word. + */ + public function containsAny($sWord) + { + return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]); + } + /** * Get the list of tokens for the given token word. * diff --git a/test/php/Nominatim/PhraseTest.php b/test/php/Nominatim/PhraseTest.php index cd742715..c23d6483 100644 --- a/test/php/Nominatim/PhraseTest.php +++ b/test/php/Nominatim/PhraseTest.php @@ -4,6 +4,29 @@ namespace Nominatim; require_once(CONST_BasePath.'/lib/Phrase.php'); +class TokensFullSet +{ + public function containsAny($sTerm) + { + return true; + } +} + +// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses +class TokensPartialSet +{ + public function __construct($aTokens) + { + $this->aTokens = array_flip($aTokens); + } + + public function containsAny($sTerm) + { + return isset($this->aTokens[$sTerm]); + } +} + +// phpcs:ignore PSR1.Classes.ClassDeclaration.MultipleClasses class PhraseTest extends \PHPUnit\Framework\TestCase { @@ -21,6 +44,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testEmptyPhrase() { $oPhrase = new Phrase('', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals( array(array('')), @@ -32,6 +56,7 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testSingleWordPhrase() { $oPhrase = new Phrase('a', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals( '(a)', @@ -43,20 +68,23 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testMultiWordPhrase() { $oPhrase = new Phrase('a b', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals( '(a b),(a|b)', $this->serializeSets($oPhrase->getWordSets()) ); $oPhrase = new Phrase('a b c', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals( - '(a b c),(a|b c),(a|b|c),(a b|c)', + '(a b c),(a|b c),(a b|c),(a|b|c)', $this->serializeSets($oPhrase->getWordSets()) ); $oPhrase = new Phrase('a b c d', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals( - '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)', + '(a b c d),(a b c|d),(a b|c d),(a|b c d),(a b|c|d),(a|b c|d),(a|b|c d),(a|b|c|d)', $this->serializeSets($oPhrase->getWordSets()) ); } @@ -65,25 +93,47 @@ class PhraseTest extends \PHPUnit\Framework\TestCase public function testInverseWordSets() { $oPhrase = new Phrase('a b c', ''); + $oPhrase->computeWordSets(new TokensFullSet()); $oPhrase->invertWordSets(); $this->assertEquals( - '(a b c),(c|a b),(c|b|a),(b c|a)', + '(a b c),(b c|a),(c|a b),(c|b|a)', $this->serializeSets($oPhrase->getWordSets()) ); } - public function testMaxDepth() + public function testMaxWordSets() { $oPhrase = new Phrase(join(' ', array_fill(0, 4, 'a')), ''); + $oPhrase->computeWordSets(new TokensFullSet()); $this->assertEquals(8, count($oPhrase->getWordSets())); $oPhrase->invertWordSets(); $this->assertEquals(8, count($oPhrase->getWordSets())); $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); - $this->assertEquals(41226, count($oPhrase->getWordSets())); + $oPhrase->computeWordSets(new TokensFullSet()); + $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets())); $oPhrase->invertWordSets(); - $this->assertEquals(41226, count($oPhrase->getWordSets())); + $this->assertEquals(Phrase::MAX_WORDSETS, count($oPhrase->getWordSets())); + } + + + public function testPartialTokensShortTerm() + { + $oPhrase = new Phrase('a b c d', ''); + $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'b', 'd', 'b c', 'b c d'))); + $this->assertEquals( + '(a|b c d),(a|b c|d)', + $this->serializeSets($oPhrase->getWordSets()) + ); + } + + + public function testPartialTokensLongTerm() + { + $oPhrase = new Phrase(join(' ', array_fill(0, 18, 'a')), ''); + $oPhrase->computeWordSets(new TokensPartialSet(array('a', 'a a a a a'))); + $this->assertEquals(80, count($oPhrase->getWordSets())); } }