namespace Nominatim;
require_once(CONST_BasePath.'/lib/PlaceLookup.php');
+require_once(CONST_BasePath.'/lib/Phrase.php');
require_once(CONST_BasePath.'/lib/ReverseGeocode.php');
require_once(CONST_BasePath.'/lib/SearchDescription.php');
require_once(CONST_BasePath.'/lib/SearchContext.php');
return $aSearchResults;
}
- public function getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery)
+ public function getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bIsStructured)
{
/*
Calculate all searches using aValidTokens i.e.
*/
$iGlobalRank = 0;
- foreach ($aPhrases as $iPhrase => $aPhrase) {
+ foreach ($aPhrases as $iPhrase => $oPhrase) {
$aNewPhraseSearches = array();
- if ($bStructuredPhrases) {
- $sPhraseType = $aPhraseTypes[$iPhrase];
- } else {
- $sPhraseType = '';
- }
+ $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : '';
- foreach ($aPhrase['wordsets'] as $iWordSet => $aWordset) {
+ foreach ($oPhrase->getWordSets() as $iWordSet => $aWordset) {
// Too many permutations - too expensive
if ($iWordSet > 120) break;
// If the token is valid
if (isset($aValidTokens[' '.$sToken])) {
foreach ($aValidTokens[' '.$sToken] as $aSearchTerm) {
- // Recheck if the original word shows up in the query.
- $bWordInQuery = false;
- if (isset($aSearchTerm['word']) && $aSearchTerm['word']) {
- $bWordInQuery = strpos(
- $sNormQuery,
- $this->normTerm($aSearchTerm['word'])
- ) !== false;
- }
$aNewSearches = $oCurrentSearch->extendWithFullTerm(
$aSearchTerm,
- $bWordInQuery,
isset($aValidTokens[$sToken])
&& strpos($sToken, ' ') === false,
$sPhraseType,
foreach ($aValidTokens[$sToken] as $aSearchTerm) {
$aNewSearches = $oCurrentSearch->extendWithPartialTerm(
$aSearchTerm,
- $bStructuredPhrases,
+ $bIsStructured,
$iPhrase,
- $aWordFrequencyScores,
isset($aValidTokens[' '.$sToken]) ? $aValidTokens[' '.$sToken] : array()
);
// Revisit searches, drop bad searches and give penalty to unlikely combinations.
$aGroupedSearches = array();
foreach ($aSearches as $oSearch) {
- if (!$oSearch->isValidSearch($this->aCountryCodes)) {
+ if (!$oSearch->isValidSearch()) {
continue;
}
// Split query into phrases
// Commas are used to reduce the search space by indicating where phrases split
if ($this->aStructuredQuery) {
- $aPhrases = $this->aStructuredQuery;
+ $aInPhrases = $this->aStructuredQuery;
$bStructuredPhrases = true;
} else {
- $aPhrases = explode(',', $sQuery);
+ $aInPhrases = explode(',', $sQuery);
$bStructuredPhrases = false;
}
// Get all 'sets' of words
// Generate a complete list of all
$aTokens = array();
- foreach ($aPhrases as $iPhrase => $sPhrase) {
- $aPhrase = chksql(
- $this->oDB->getRow("SELECT make_standard_name('".pg_escape_string($sPhrase)."') as string"),
+ $aPhrases = array();
+ foreach ($aInPhrases as $iPhrase => $sPhrase) {
+ $sPhrase = chksql(
+ $this->oDB->getOne('SELECT make_standard_name('.getDBQuoted($sPhrase).')'),
"Cannot normalize query string (is it a UTF-8 string?)"
);
- if (trim($aPhrase['string'])) {
- $aPhrases[$iPhrase] = $aPhrase;
- $aPhrases[$iPhrase]['words'] = explode(' ', $aPhrases[$iPhrase]['string']);
- $aPhrases[$iPhrase]['wordsets'] = getWordSets($aPhrases[$iPhrase]['words'], 0);
- $aTokens = array_merge($aTokens, getTokensFromSets($aPhrases[$iPhrase]['wordsets']));
- } else {
- unset($aPhrases[$iPhrase]);
+ if (trim($sPhrase)) {
+ $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : '');
+ $oPhrase->addTokens($aTokens);
+ $aPhrases[] = $oPhrase;
}
}
- // Reindex phrases - we make assumptions later on that they are numerically keyed in order
- $aPhraseTypes = array_keys($aPhrases);
- $aPhrases = array_values($aPhrases);
-
if (sizeof($aTokens)) {
// Check which tokens we have, get the ID numbers
$sSQL = 'SELECT word_id, word_token, word, class, type, country_code, operator, search_name_count';
$this->oDB->getAll($sSQL),
"Could not get word tokens."
);
- $aPossibleMainWordIDs = array();
$aWordFrequencyScores = array();
foreach ($aDatabaseWords as $aToken) {
- // Very special case - require 2 letter country param to match the country code found
- if ($bStructuredPhrases && $aToken['country_code'] && !empty($this->aStructuredQuery['country'])
- && strlen($this->aStructuredQuery['country']) == 2 && strtolower($this->aStructuredQuery['country']) != $aToken['country_code']
+ // Filter country tokens that do not match restricted countries.
+ if ($this->aCountryCodes
+ && $aToken['country_code']
+ && !in_array($aToken['country_code'], $this->aCountryCodes)
) {
continue;
}
+ // Special terms need to appear in their normalized form.
+ if ($aToken['word'] && $aToken['class']) {
+ $sNormWord = $this->normTerm($aToken['word']);
+ if (strpos($sNormQuery, $sNormWord) === false) {
+ continue;
+ }
+ }
+
if (isset($aValidTokens[$aToken['word_token']])) {
$aValidTokens[$aToken['word_token']][] = $aToken;
} else {
$aValidTokens[$aToken['word_token']] = array($aToken);
}
- if (!$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1;
$aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1;
}
if (CONST_Debug) var_Dump($aPhrases, $aValidTokens);
// Any words that have failed completely?
// TODO: suggestions
- $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhraseTypes, $aPhrases, $aValidTokens, $aWordFrequencyScores, $bStructuredPhrases, $sNormQuery);
+ $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, $bStructuredPhrases);
if ($this->bReverseInPlan) {
// Reverse phrase array and also reverse the order of the wordsets in
// the first and final phrase. Don't bother about phrases in the middle
// because order in the address doesn't matter.
$aPhrases = array_reverse($aPhrases);
- $aPhrases[0]['wordsets'] = getInverseWordSets($aPhrases[0]['words'], 0);
+ $aPhrases[0]->invertWordSets();
if (sizeof($aPhrases) > 1) {
- $aFinalPhrase = end($aPhrases);
- $aPhrases[sizeof($aPhrases)-1]['wordsets'] = getInverseWordSets($aFinalPhrase['words'], 0);
+ $aPhrases[sizeof($aPhrases)-1]->invertWordSets();
}
- $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, null, $aPhrases, $aValidTokens, $aWordFrequencyScores, false, $sNormQuery);
+ $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $aValidTokens, false);
foreach ($aGroupedSearches as $aSearches) {
foreach ($aSearches as $aSearch) {
$aResult['name'] = $aResult['langaddress'];
- if ($oCtx->hasNearPoint())
- {
+ if ($oCtx->hasNearPoint()) {
$aResult['importance'] = 0.001;
$aResult['foundorder'] = $aResult['addressimportance'];
} else {
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+/**
+ * Segment of a query string.
+ *
+ * The parts of a query string are usually separated by commas. A phrase
+ * keeps its trimmed text, the space-separated words of that text and the
+ * list of possible segmentations ("word sets") of those words into terms.
+ */
+class Phrase
+{
+    // Maximum recursion depth when computing segmentations. Once it is
+    // reached, the remaining words are only returned as one joined term.
+    const MAX_DEPTH = 7;
+
+    // Complete phrase as a string.
+    private $sPhrase;
+    // Element type for structured searches.
+    private $sPhraseType;
+    // Space-separated words of the phrase.
+    private $aWords;
+    // Possible segmentations of the phrase.
+    private $aWordSets;
+
+
+    /**
+     * Create a phrase and compute its forward segmentations.
+     *
+     * @param string $sPhrase     Text of the phrase. Surrounding whitespace
+     *                            is trimmed before the text is split on
+     *                            single spaces.
+     * @param string $sPhraseType Element type of the phrase; stored as given
+     *                            and returned by getPhraseType().
+     */
+    public function __construct($sPhrase, $sPhraseType)
+    {
+        $this->sPhrase = trim($sPhrase);
+        $this->sPhraseType = $sPhraseType;
+        $this->aWords = explode(' ', $this->sPhrase);
+        $this->aWordSets = $this->createWordSets($this->aWords, 0);
+    }
+
+    /**
+     * Return the element type of the phrase.
+     *
+     * @return string Phrase type if the phrase comes from a structured query
+     *                or empty string otherwise.
+     */
+    public function getPhraseType()
+    {
+        return $this->sPhraseType;
+    }
+
+    /**
+     * Return the array of possible segmentations of the phrase.
+     *
+     * The first entry is always the complete phrase as a single term.
+     *
+     * @return string[][] Array of segmentations, each consisting of an
+     *                    array of terms.
+     */
+    public function getWordSets()
+    {
+        return $this->aWordSets;
+    }
+
+    /**
+     * Add the tokens from this phrase to the given list of tokens.
+     *
+     * Every term of every segmentation is added twice: once as is and once
+     * with a leading space. Entries are keyed by the token itself, so a
+     * token that is already in the list is not duplicated.
+     *
+     * @param string[] $aTokens List of tokens to append to (modified in place).
+     *
+     * @return void
+     */
+    public function addTokens(&$aTokens)
+    {
+        foreach ($this->aWordSets as $aSet) {
+            foreach ($aSet as $sWord) {
+                $aTokens[' '.$sWord] = ' '.$sWord;
+                $aTokens[$sWord] = $sWord;
+            }
+        }
+    }
+
+    /**
+     * Invert the set of possible segmentations.
+     *
+     * Replaces the current word sets with segmentations that are built
+     * starting from the last word of the phrase instead of the first.
+     *
+     * @return void
+     */
+    public function invertWordSets()
+    {
+        $this->aWordSets = $this->createInverseWordSets($this->aWords, 0);
+    }
+
+    /**
+     * Compute the segmentations of a list of words, front to back.
+     *
+     * The result always starts with all words joined into a single term.
+     * Below MAX_DEPTH, every proper prefix of the word list is then
+     * combined with each segmentation of the words that follow it.
+     *
+     * @param string[] $aWords List of words to segment.
+     * @param integer  $iDepth Current recursion depth.
+     *
+     * @return string[][] List of segmentations; terms are in phrase order.
+     */
+    private function createWordSets($aWords, $iDepth)
+    {
+        $aResult = array(array(join(' ', $aWords)));
+        $sFirstToken = '';
+        if ($iDepth < self::MAX_DEPTH) {
+            while (sizeof($aWords) > 1) {
+                $sWord = array_shift($aWords);
+                // Compare against '' explicitly: a plain truthiness test
+                // treats the word '0' as empty and would drop the space.
+                $sFirstToken = ($sFirstToken === '') ? $sWord : $sFirstToken.' '.$sWord;
+                $aRest = $this->createWordSets($aWords, $iDepth + 1);
+                foreach ($aRest as $aSet) {
+                    $aResult[] = array_merge(array($sFirstToken), $aSet);
+                }
+            }
+        }
+
+        return $aResult;
+    }
+
+    /**
+     * Compute the segmentations of a list of words, back to front.
+     *
+     * Works like createWordSets() but takes the words from the end of the
+     * list, so that each segmentation lists its terms from the last part of
+     * the phrase to the first. The words inside a term keep their order.
+     *
+     * @param string[] $aWords List of words to segment.
+     * @param integer  $iDepth Current recursion depth.
+     *
+     * @return string[][] List of segmentations; terms are in reverse
+     *                    phrase order.
+     */
+    private function createInverseWordSets($aWords, $iDepth)
+    {
+        $aResult = array(array(join(' ', $aWords)));
+        $sFirstToken = '';
+        if ($iDepth < self::MAX_DEPTH) {
+            while (sizeof($aWords) > 1) {
+                $sWord = array_pop($aWords);
+                // Compare against '' explicitly: a plain truthiness test
+                // treats the word '0' as empty and would drop the space.
+                $sFirstToken = ($sFirstToken === '') ? $sWord : $sWord.' '.$sFirstToken;
+                $aRest = $this->createInverseWordSets($aWords, $iDepth + 1);
+                foreach ($aRest as $aSet) {
+                    $aResult[] = array_merge(array($sFirstToken), $aSet);
+                }
+            }
+        }
+
+        return $aResult;
+    }
+}
/**
* Check if the combination of parameters is sensible.
*
- * @param string[] $aCountryCodes List of country codes.
- *
* @return bool True, if the search looks valid.
*/
- public function isValidSearch(&$aCountryCodes)
+ public function isValidSearch()
{
if (!sizeof($this->aName)) {
if ($this->sHouseNumber) {
return false;
}
- }
- if ($aCountryCodes
- && $this->sCountryCode
- && !in_array($this->sCountryCode, $aCountryCodes)
- ) {
- return false;
+ if (!$this->sClass && !$this->sCountryCode) {
+ return false;
+ }
}
return true;
* Derive new searches by adding a full term to the existing search.
*
* @param mixed[] $aSearchTerm Description of the token.
- * @param bool $bWordInQuery True, if the normalised version of the word
- * is contained in the query.
* @param bool $bHasPartial True if there are also tokens of partial terms
* with the same name.
* @param string $sPhraseType Type of phrase the token is contained in.
*
* @return SearchDescription[] List of derived search descriptions.
*/
- public function extendWithFullTerm($aSearchTerm, $bWordInQuery, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
+ public function extendWithFullTerm($aSearchTerm, $bHasPartial, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken, &$iGlobalRank)
{
$aNewSearches = array();
// We need to try the case where the postal code is the primary element
// (i.e. no way to tell if it is (postalcode, city) OR (city, postalcode)
// so try both.
- if (!$this->sPostcode && $bWordInQuery
+ if (!$this->sPostcode
+ && $aSearchTerm['word']
&& pg_escape_string($aSearchTerm['word']) == $aSearchTerm['word']
) {
// If we have structured search or this is the first term,
}
$aNewSearches[] = $oSearch;
}
- } elseif ($sPhraseType == ''
- && $aSearchTerm['class'] !== '' && $aSearchTerm['class'] !== null
- ) {
- // require a normalized exact match of the term
- // if we have the normalizer version of the query
- // available
- if ($this->iOperator == Operator::NONE
- && (isset($aSearchTerm['word']) && $aSearchTerm['word'])
- && $bWordInQuery
- ) {
+ } elseif ($sPhraseType == '' && $aSearchTerm['class']) {
+ if ($this->iOperator == Operator::NONE) {
$oSearch = clone $this;
$oSearch->iSearchRank++;
$oSearch->setPoiSearch($iOp, $aSearchTerm['class'], $aSearchTerm['type']);
$aNewSearches[] = $oSearch;
}
- } elseif (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) {
+ } elseif (isset($aSearchTerm['word_id'])
+ && $aSearchTerm['word_id']
+ && $sPhraseType != 'country'
+ ) {
$iWordID = $aSearchTerm['word_id'];
if (sizeof($this->aName)) {
if (($sPhraseType == '' || !$bFirstPhrase)
/**
* Derive new searches by adding a partial term to the existing search.
*
- * @param mixed[] $aSearchTerm Description of the token.
- * @param bool $bStructuredPhrases True if the search is structured.
- * @param integer $iPhrase Number of the phrase the token is in.
- * @param mixed[] $aWordFrequencyScores Number of times tokens appears
- * overall in a planet database.
- * @param array[] $aFullTokens List of full term tokens with the
- * same name.
+ * @param mixed[] $aSearchTerm Description of the token.
+ * @param bool $bStructuredPhrases True if the search is structured.
+ * @param integer $iPhrase Number of the phrase the token is in.
+ * @param array[] $aFullTokens List of full term tokens with the
+ * same name.
*
* @return SearchDescription[] List of derived search descriptions.
*/
- public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, &$aWordFrequencyScores, $aFullTokens)
+ public function extendWithPartialTerm($aSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
{
// Only allow name terms.
if (!(isset($aSearchTerm['word_id']) && $aSearchTerm['word_id'])) {
&& sizeof($this->aName)
&& strpos($aSearchTerm['word_token'], ' ') === false
) {
- if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
+ if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
$oSearch = clone $this;
$oSearch->iSearchRank++;
$oSearch->aAddress[$iWordID] = $iWordID;
if (preg_match('#^[0-9]+$#', $aSearchTerm['word_token'])) {
$oSearch->iSearchRank += 2;
}
- if ($aWordFrequencyScores[$iWordID] < CONST_Max_Word_Frequency) {
+ if ($aSearchTerm['search_name_count'] + 1 < CONST_Max_Word_Frequency) {
$oSearch->aName[$iWordID] = $iWordID;
} else {
$oSearch->aNameNonSearch[$iWordID] = $iWordID;
}
-function getWordSets($aWords, $iDepth)
-{
- $aResult = array(array(join(' ', $aWords)));
- $sFirstToken = '';
- if ($iDepth < 7) {
- while (sizeof($aWords) > 1) {
- $sWord = array_shift($aWords);
- $sFirstToken .= ($sFirstToken?' ':'').$sWord;
- $aRest = getWordSets($aWords, $iDepth+1);
- foreach ($aRest as $aSet) {
- $aResult[] = array_merge(array($sFirstToken), $aSet);
- }
- }
- }
- return $aResult;
-}
-
-function getInverseWordSets($aWords, $iDepth)
-{
- $aResult = array(array(join(' ', $aWords)));
- $sFirstToken = '';
- if ($iDepth < 8) {
- while (sizeof($aWords) > 1) {
- $sWord = array_pop($aWords);
- $sFirstToken = $sWord.($sFirstToken?' ':'').$sFirstToken;
- $aRest = getInverseWordSets($aWords, $iDepth+1);
- foreach ($aRest as $aSet) {
- $aResult[] = array_merge(array($sFirstToken), $aSet);
- }
- }
- }
- return $aResult;
-}
-
-
-function getTokensFromSets($aSets)
-{
- $aTokens = array();
- foreach ($aSets as $aSet) {
- foreach ($aSet as $sWord) {
- $aTokens[' '.$sWord] = ' '.$sWord;
- $aTokens[$sWord] = $sWord;
- }
- }
- return $aTokens;
-}
-
-
function getClassTypes()
{
return array(
}
- public function testGetWordSets()
- {
- // given an array of arrays like
- // array( array('a','b'), array('c','d') )
- // returns a summary as string: '(a|b),(c|d)'
-
-
- function serializeSets($aSets)
- {
- $aParts = array();
- foreach ($aSets as $aSet) {
- $aParts[] = '(' . join('|', $aSet) . ')';
- }
- return join(',', $aParts);
- }
-
- $this->assertEquals(
- array(array('')),
- getWordSets(array(), 0)
- );
-
- $this->assertEquals(
- '(a)',
- serializeSets(getWordSets(array("a"), 0))
- );
-
- $this->assertEquals(
- '(a b),(a|b)',
- serializeSets(getWordSets(array('a', 'b'), 0))
- );
-
- $this->assertEquals(
- '(a b c),(a|b c),(a|b|c),(a b|c)',
- serializeSets(getWordSets(array('a', 'b', 'c'), 0))
- );
-
- $this->assertEquals(
- '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
- serializeSets(getWordSets(array('a', 'b', 'c', 'd'), 0))
- );
-
-
- // Inverse
- $this->assertEquals(
- '(a b c),(c|a b),(c|b|a),(b c|a)',
- serializeSets(getInverseWordSets(array('a', 'b', 'c'), 0))
- );
-
-
- // make sure we don't create too many sets
- // 4 words => 8 sets
- // 10 words => 511 sets
- // 15 words => 12911 sets
- // 18 words => 65536 sets
- // 20 words => 169766 sets
- // 22 words => 401930 sets
- // 28 words => 3505699 sets (needs more than 4GB via 'phpunit -d memory_limit=' to run)
- $this->assertEquals(
- 8,
- count(getWordSets(array_fill(0, 4, 'a'), 0))
- );
-
-
- $this->assertEquals(
- 41226,
- count(getWordSets(array_fill(0, 18, 'a'), 0))
- );
- }
-
-
public function testCreatePointsAroundCenter()
{
// you might say we're creating a circle
--- /dev/null
+<?php
+
+namespace Nominatim;
+
+require_once '../../lib/Phrase.php';
+
+/**
+ * Unit tests for the segmentation logic of Phrase.
+ */
+class PhraseTest extends \PHPUnit_Framework_TestCase
+{
+    /**
+     * Render a list of word sets as a compact string.
+     *
+     * The terms of one set are joined with '|' and wrapped in parentheses,
+     * the sets themselves are joined with ','. For example
+     * array(array('a', 'b'), array('c', 'd')) becomes '(a|b),(c|d)'.
+     *
+     * @param string[][] $aSets List of word sets.
+     *
+     * @return string Serialized form of the word sets.
+     */
+    private function serializeSets($aSets)
+    {
+        $aRendered = array_map(
+            function ($aTerms) {
+                return '('.implode('|', $aTerms).')';
+            },
+            $aSets
+        );
+
+        return implode(',', $aRendered);
+    }
+
+
+    /**
+     * An empty phrase has exactly one word set holding one empty term.
+     */
+    public function testEmptyPhrase()
+    {
+        $aExpected = array(array(''));
+        $oEmpty = new Phrase('', '');
+
+        $this->assertEquals($aExpected, $oEmpty->getWordSets());
+    }
+
+
+    /**
+     * A single word can only be segmented into itself.
+     */
+    public function testSingleWordPhrase()
+    {
+        $oSingle = new Phrase('a', '');
+        $sActual = $this->serializeSets($oSingle->getWordSets());
+
+        $this->assertEquals('(a)', $sActual);
+    }
+
+
+    /**
+     * Phrases of two to four words yield all segmentations in a fixed order.
+     */
+    public function testMultiWordPhrase()
+    {
+        $aCases = array(
+            'a b' => '(a b),(a|b)',
+            'a b c' => '(a b c),(a|b c),(a|b|c),(a b|c)',
+            'a b c d' => '(a b c d),(a|b c d),(a|b|c d),(a|b|c|d),(a|b c|d),(a b|c d),(a b|c|d),(a b c|d)',
+        );
+
+        foreach ($aCases as $sInput => $sExpected) {
+            $oPhrase = new Phrase($sInput, '');
+            $this->assertEquals(
+                $sExpected,
+                $this->serializeSets($oPhrase->getWordSets())
+            );
+        }
+    }
+
+
+    /**
+     * Inverting the word sets lists the terms starting from the last word.
+     */
+    public function testInverseWordSets()
+    {
+        $oPhrase = new Phrase('a b c', '');
+        $oPhrase->invertWordSets();
+        $sActual = $this->serializeSets($oPhrase->getWordSets());
+
+        $this->assertEquals('(a b c),(c|a b),(c|b|a),(b c|a)', $sActual);
+    }
+
+
+    /**
+     * The number of word sets stays bounded for long phrases, for both the
+     * forward and the inverted segmentation.
+     */
+    public function testMaxDepth()
+    {
+        // Number of words in the phrase => expected number of word sets.
+        $aExpectedCounts = array(
+            4 => 8,
+            18 => 41226,
+        );
+
+        foreach ($aExpectedCounts as $iWords => $iExpected) {
+            $oPhrase = new Phrase(implode(' ', array_fill(0, $iWords, 'a')), '');
+            $this->assertEquals($iExpected, count($oPhrase->getWordSets()));
+
+            $oPhrase->invertWordSets();
+            $this->assertEquals($iExpected, count($oPhrase->getWordSets()));
+        }
+    }
+}