From: Sarah Hoffmann Date: Mon, 19 Jul 2021 07:42:37 +0000 (+0200) Subject: Merge pull request #2396 from lonvia/partial-word-token X-Git-Tag: v4.0.0~50 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/ee32315378868748fe3704616e9ca860cdc97da9?hp=bc8b2d4ae0dbaef64448ddcb530de9626da9d82d Merge pull request #2396 from lonvia/partial-word-token Reorganise code that build the SearchDescription --- diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index ec21a0dc..52b92c99 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/Phrase.php'); require_once(CONST_LibDir.'/ReverseGeocode.php'); require_once(CONST_LibDir.'/SearchDescription.php'); require_once(CONST_LibDir.'/SearchContext.php'); +require_once(CONST_LibDir.'/SearchPosition.php'); require_once(CONST_LibDir.'/TokenList.php'); require_once(CONST_TokenizerDir.'/tokenizer.php'); @@ -345,7 +346,11 @@ class Geocode */ foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - $sPhraseType = $oPhrase->getPhraseType(); + $oPosition = new SearchPosition( + $oPhrase->getPhraseType(), + $iPhrase, + count($aPhrases) + ); foreach ($oPhrase->getWordSets() as $aWordset) { $aWordsetSearches = $aSearches; @@ -353,37 +358,14 @@ class Geocode // Add all words from this wordset foreach ($aWordset as $iToken => $sToken) { $aNewWordsetSearches = array(); + $oPosition->setTokenPosition($iToken, count($aWordset)); foreach ($aWordsetSearches as $oCurrentSearch) { - // Tokens with full name matches. - foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) { - $aNewSearches = $oCurrentSearch->extendWithFullTerm( - $oSearchTerm, - $sPhraseType, - $iToken == 0 && $iPhrase == 0, - $iPhrase == 0, - $iToken + 1 == count($aWordset) - && $iPhrase + 1 == count($aPhrases) - ); - - foreach ($aNewSearches as $oSearch) { - if ($oSearch->getRank() < $this->iMaxRank) { - $aNewWordsetSearches[] = $oSearch; - } - } - } - // Look for partial matches. 
- // Note that there is no point in adding country terms here - // because country is omitted in the address. - if ($sPhraseType != 'country') { - // Allow searching for a word - but at extra cost - foreach ($oValidTokens->get($sToken) as $oSearchTerm) { - $aNewSearches = $oCurrentSearch->extendWithPartialTerm( - $sToken, - $oSearchTerm, - (bool) $sPhraseType, - $iPhrase, - $oValidTokens->get(' '.$sToken) + foreach ($oValidTokens->get($sToken) as $oSearchTerm) { + if ($oSearchTerm->isExtendable($oCurrentSearch, $oPosition)) { + $aNewSearches = $oSearchTerm->extendSearch( + $oCurrentSearch, + $oPosition ); foreach ($aNewSearches as $oSearch) { @@ -573,15 +555,15 @@ class Geocode if (!empty($aTokens)) { $aNewSearches = array(); + $oPosition = new SearchPosition('', 0, 1); + $oPosition->setTokenPosition(0, 1); + foreach ($aSearches as $oSearch) { foreach ($aTokens as $oToken) { - $oNewSearch = clone $oSearch; - $oNewSearch->setPoiSearch( - $oToken->iOperator, - $oToken->sClass, - $oToken->sType + $aNewSearches = array_merge( + $aNewSearches, + $oToken->extendSearch($oSearch, $oPosition) ); - $aNewSearches[] = $oNewSearch; } } $aSearches = $aNewSearches; diff --git a/lib-php/SearchDescription.php b/lib-php/SearchDescription.php index 6091fd61..4d944bfb 100644 --- a/lib-php/SearchDescription.php +++ b/lib-php/SearchDescription.php @@ -67,35 +67,6 @@ class SearchDescription return $this->iSearchRank; } - /** - * Make this search a POI search. - * - * In a POI search, objects are not (only) searched by their name - * but also by the primary OSM key/value pair (class and type in Nominatim). - * - * @param integer $iOperator Type of POI search - * @param string $sClass Class (or OSM tag key) of POI. - * @param string $sType Type (or OSM tag value) of POI. - * - * @return void - */ - public function setPoiSearch($iOperator, $sClass, $sType) - { - $this->iOperator = $iOperator; - $this->sClass = $sClass; - $this->sType = $sType; - } - - /** - * Check if any operator is set. 
- * - * @return bool True, if this is a special search operation. - */ - public function hasOperator() - { - return $this->iOperator != Operator::NONE; - } - /** * Extract key/value pairs from a query. * @@ -148,253 +119,234 @@ class SearchDescription /////////// Search building functions - /** - * Derive new searches by adding a full term to the existing search. + * Create a copy of this search description adding to search rank. * - * @param object $oSearchTerm Description of the token. - * @param string $sPhraseType Type of phrase the token is contained in. - * @param bool $bFirstToken True if the token is at the beginning of the - * query. - * @param bool $bFirstPhrase True if the token is in the first phrase of - * the query. - * @param bool $bLastToken True if the token is at the end of the query. + * @param integer $iTermCost Cost to add to the current search rank. * - * @return SearchDescription[] List of derived search descriptions. + * @return object Cloned search description. */ - public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken) + public function clone($iTermCost) { - $aNewSearches = array(); + $oSearch = clone $this; + $oSearch->iSearchRank += $iTermCost; - if (($sPhraseType == '' || $sPhraseType == 'country') - && is_a($oSearchTerm, '\Nominatim\Token\Country') - ) { - if (!$this->sCountryCode) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - $oSearch->sCountryCode = $oSearchTerm->sCountryCode; - // Country is almost always at the end of the string - // - increase score for finding it anywhere else (optimisation) - if (!$bLastToken) { - $oSearch->iSearchRank += 5; - $oSearch->iNamePhrase = -1; - } - $aNewSearches[] = $oSearch; - } - } elseif (($sPhraseType == '' || $sPhraseType == 'postalcode') - && is_a($oSearchTerm, '\Nominatim\Token\Postcode') - ) { - if (!$this->sPostcode) { - // If we have structured search or this is the first term, - // make the postcode the primary search element. 
- if ($this->iOperator == Operator::NONE && $bFirstToken) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - $oSearch->iOperator = Operator::POSTCODE; - $oSearch->aAddress = array_merge($this->aAddress, $this->aName); - $oSearch->aName = - array($oSearchTerm->iId => $oSearchTerm->sPostcode); - $aNewSearches[] = $oSearch; - } + return $oSearch; + } - // If we have a structured search or this is not the first term, - // add the postcode as an addendum. - if ($this->iOperator != Operator::POSTCODE - && ($sPhraseType == 'postalcode' || !empty($this->aName)) - ) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - $oSearch->iNamePhrase = -1; - if (strlen($oSearchTerm->sPostcode) < 4) { - $oSearch->iSearchRank += 4 - strlen($oSearchTerm->sPostcode); - } - $oSearch->sPostcode = $oSearchTerm->sPostcode; - $aNewSearches[] = $oSearch; - } - } - } elseif (($sPhraseType == '' || $sPhraseType == 'street') - && is_a($oSearchTerm, '\Nominatim\Token\HouseNumber') - ) { - if (!$this->sHouseNumber && $this->iOperator != Operator::POSTCODE) { - // sanity check: if the housenumber is not mainly made - // up of numbers, add a penalty - $iSearchCost = 1; - if (preg_match('/\\d/', $oSearchTerm->sToken) === 0 - || preg_match_all('/[^0-9]/', $oSearchTerm->sToken, $aMatches) > 2) { - $iSearchCost++; - } - if ($this->iOperator != Operator::NONE) { - $iSearchCost++; - } - if (empty($oSearchTerm->iId)) { - $iSearchCost++; - } - // also must not appear in the middle of the address - if (!empty($this->aAddress) - || (!empty($this->aAddressNonSearch)) - || $this->sPostcode - ) { - $iSearchCost++; - } + /** + * Check if the search currently includes a name. + * + * @param bool bIncludeNonNames If true stop-word tokens are taken into + * account, too. + * + * @return bool True, if search has a name. 
+ */ + public function hasName($bIncludeNonNames = false) + { + return !empty($this->aName) + || (!empty($this->aNameNonSearch) && $bIncludeNonNames); + } - $oSearch = clone $this; - $oSearch->iSearchRank += $iSearchCost; - $oSearch->iNamePhrase = -1; - $oSearch->sHouseNumber = $oSearchTerm->sToken; - $aNewSearches[] = $oSearch; - - // Housenumbers may appear in the name when the place has its own - // address terms. - if ($oSearchTerm->iId !== null - && ($this->iNamePhrase >= 0 || empty($this->aName)) - && empty($this->aAddress) - ) { - $oSearch = clone $this; - $oSearch->iSearchRank += $iSearchCost; - $oSearch->aAddress = $this->aName; - $oSearch->bRareName = false; - $oSearch->aName = array($oSearchTerm->iId => $oSearchTerm->iId); - $aNewSearches[] = $oSearch; - } - } - } elseif ($sPhraseType == '' - && is_a($oSearchTerm, '\Nominatim\Token\SpecialTerm') - ) { - if ($this->iOperator == Operator::NONE) { - $oSearch = clone $this; - $oSearch->iSearchRank += 2; - $oSearch->iNamePhrase = -1; - - $iOp = $oSearchTerm->iOperator; - if ($iOp == Operator::NONE) { - if (!empty($this->aName) || $this->oContext->isBoundedSearch()) { - $iOp = Operator::NAME; - } else { - $iOp = Operator::NEAR; - } - $oSearch->iSearchRank += 2; - } elseif (!$bFirstToken && !$bLastToken) { - $oSearch->iSearchRank += 2; - } - if ($this->sHouseNumber) { - $oSearch->iSearchRank++; - } + /** + * Check if the search currently includes an address term. + * + * @return bool True, if any address term is included, including stop-word + * terms. + */ + public function hasAddress() + { + return !empty($this->aAddress) || !empty($this->aAddressNonSearch); + } - $oSearch->setPoiSearch( - $iOp, - $oSearchTerm->sClass, - $oSearchTerm->sType - ); - $aNewSearches[] = $oSearch; - } - } elseif ($sPhraseType != 'country' - && is_a($oSearchTerm, '\Nominatim\Token\Word') - ) { - $iWordID = $oSearchTerm->iId; - // Full words can only be a name if they appear at the beginning - // of the phrase. 
In structured search the name must forcably in - // the first phrase. In unstructured search it may be in a later - // phrase when the first phrase is a house number. - if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) { - if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) { - $oSearch = clone $this; - $oSearch->iNamePhrase = -1; - $oSearch->iSearchRank += 1; - $oSearch->aAddress[$iWordID] = $iWordID; - $aNewSearches[] = $oSearch; - } - } elseif (empty($this->aNameNonSearch)) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - $oSearch->aName = array($iWordID => $iWordID); - if (CONST_Search_NameOnlySearchFrequencyThreshold) { - $oSearch->bRareName = - $oSearchTerm->iSearchNameCount - < CONST_Search_NameOnlySearchFrequencyThreshold; - } - $aNewSearches[] = $oSearch; - } - } + /** + * Check if a country restriction is currently included in the search. + * + * @return bool True, if a country restriction is set. + */ + public function hasCountry() + { + return $this->sCountryCode !== ''; + } - return $aNewSearches; + /** + * Check if a postcode is currently included in the search. + * + * @return bool True, if a postcode is set. + */ + public function hasPostcode() + { + return $this->sPostcode !== ''; } /** - * Derive new searches by adding a partial term to the existing search. + * Check if a house number is set for the search. * - * @param string $sToken Term for the token. - * @param object $oSearchTerm Description of the token. - * @param bool $bStructuredPhrases True if the search is structured. - * @param integer $iPhrase Number of the phrase the token is in. - * @param array[] $aFullTokens List of full term tokens with the - * same name. + * @return bool True, if a house number is set. + */ + public function hasHousenumber() + { + return $this->sHouseNumber !== ''; + } + + /** + * Check if a special type of place is requested. * - * @return SearchDescription[] List of derived search descriptions. 
+ * @param integer iOperator When set, check for the particular + * operator used for the special type. + * + * @return bool True, if special type is requested or, if requested, + * a special type with the given operator. */ - public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens) + public function hasOperator($iOperator = null) { - // Only allow name terms. - if (!(is_a($oSearchTerm, '\Nominatim\Token\Word')) - || strpos($sToken, ' ') !== false - ) { - return array(); + return $iOperator === null ? $this->iOperator != Operator::NONE : $this->iOperator == $iOperator; + } + + /** + * Add the given token to the list of terms to search for in the address. + * + * @param integer iID ID of term to add. + * @param bool bSearchable Term should be used to search for result + * (i.e. term is not a stop word). + */ + public function addAddressToken($iId, $bSearchable = true) + { + if ($bSearchable) { + $this->aAddress[$iId] = $iId; + } else { + $this->aAddressNonSearch[$iId] = $iId; } + } - $aNewSearches = array(); - $iWordID = $oSearchTerm->iId; + /** + * Add the given full-word token to the list of terms to search for in the + * name. + * + * @param integer iId ID of term to add. + * @param bool bRareName True if the term is infrequent enough to not + * require other constraints for efficient search. 
+ */ + public function addNameToken($iId, $bRareName) + { + $this->aName[$iId] = $iId; + $this->bRareName = $bRareName; + } - if ((!$bStructuredPhrases || $iPhrase > 0) - && (!empty($this->aName)) - ) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - if (preg_match('#^[0-9 ]+$#', $sToken)) { - $oSearch->iSearchRank++; - } - if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) { - $oSearch->aAddress[$iWordID] = $iWordID; - } else { - $oSearch->aAddressNonSearch[$iWordID] = $iWordID; - if (!empty($aFullTokens)) { - $oSearch->iSearchRank++; - } - } - $aNewSearches[] = $oSearch; + /** + * Add the given partial token to the list of terms to search for in + * the name. + * + * @param integer iID ID of term to add. + * @param bool bSearchable Term should be used to search for result + * (i.e. term is not a stop word). + * @param integer iPhraseNumber Index of phrase, where the partial term + * appears. + */ + public function addPartialNameToken($iId, $bSearchable, $iPhraseNumber) + { + if ($bSearchable) { + $this->aName[$iId] = $iId; + } else { + $this->aNameNonSearch[$iId] = $iId; } + $this->iNamePhrase = $iPhraseNumber; + } - if ((!$this->sPostcode && !$this->aAddress && !$this->aAddressNonSearch) - && ((empty($this->aName) && empty($this->aNameNonSearch)) || $this->iNamePhrase == $iPhrase) - ) { - $oSearch = clone $this; - $oSearch->iSearchRank++; - if (empty($this->aName) && empty($this->aNameNonSearch)) { - $oSearch->iSearchRank++; - } - if (preg_match('#^[0-9 ]+$#', $sToken)) { - $oSearch->iSearchRank++; - } - if ($oSearchTerm->iSearchNameCount < CONST_Max_Word_Frequency) { - if (empty($this->aName) - && CONST_Search_NameOnlySearchFrequencyThreshold - ) { - $oSearch->bRareName = - $oSearchTerm->iSearchNameCount - < CONST_Search_NameOnlySearchFrequencyThreshold; - } else { - $oSearch->bRareName = false; - } - $oSearch->aName[$iWordID] = $iWordID; - } else { - if (!empty($aFullTokens)) { - $oSearch->iSearchRank++; - } - 
- $oSearch->aNameNonSearch[$iWordID] = $iWordID; - } - $oSearch->iNamePhrase = $iPhrase; - $aNewSearches[] = $oSearch; - } + /** + * Set country restriction for the search. + * + * @param string sCountryCode Country code of country to restrict search to. + */ + public function setCountry($sCountryCode) + { + $this->sCountryCode = $sCountryCode; + $this->iNamePhrase = -1; + } + + /** + * Set postcode search constraint. + * + * @param string sPostcode Postcode the result should have. + */ + public function setPostcode($sPostcode) + { + $this->sPostcode = $sPostcode; + $this->iNamePhrase = -1; + } + + /** + * Make this search a search for a postcode object. + * + * @param integer iId Token Id for the postcode. + * @param string sPostcode Postcode to look for. + */ + public function setPostcodeAsName($iId, $sPostcode) + { + $this->iOperator = Operator::POSTCODE; + $this->aAddress = array_merge($this->aAddress, $this->aName); + $this->aName = array($iId => $sPostcode); + $this->bRareName = true; + $this->iNamePhrase = -1; + } + + /** + * Set house number search constraint. + * + * @param string sNumber House number the result should have. + */ + public function setHousenumber($sNumber) + { + $this->sHouseNumber = $sNumber; + $this->iNamePhrase = -1; + } + + /** + * Make this search a search for a house number. + * + * @param integer iId Token Id for the house number. + */ + public function setHousenumberAsName($iId) + { + $this->aAddress = array_merge($this->aAddress, $this->aName); + $this->bRareName = false; + $this->aName = array($iId => $iId); + $this->iNamePhrase = -1; + } + + /** + * Make this search a POI search. + * + * In a POI search, objects are not (only) searched by their name + * but also by the primary OSM key/value pair (class and type in Nominatim). + * + * @param integer $iOperator Type of POI search + * @param string $sClass Class (or OSM tag key) of POI. + * @param string $sType Type (or OSM tag value) of POI. 
+ * + * @return void + */ + public function setPoiSearch($iOperator, $sClass, $sType) + { + $this->iOperator = $iOperator; + $this->sClass = $sClass; + $this->sType = $sType; + $this->iNamePhrase = -1; + } + + public function getNamePhrase() + { + return $this->iNamePhrase; + } - return $aNewSearches; + /** + * Get the global search context. + * + * @return object Objects of global search constraints. + */ + public function getContext() + { + return $this->oContext; } /////////// Query functions diff --git a/lib-php/SearchPosition.php b/lib-php/SearchPosition.php new file mode 100644 index 00000000..e4260bf2 --- /dev/null +++ b/lib-php/SearchPosition.php @@ -0,0 +1,87 @@ +sPhraseType = $sPhraseType; + $this->iPhrase = $iPhrase; + $this->iNumPhrases = $iNumPhrases; + } + + public function setTokenPosition($iToken, $iNumTokens) + { + $this->iToken = $iToken; + $this->iNumTokens = $iNumTokens; + } + + /** + * Check if the phrase can be of the given type. + * + * @param string $sType Type of phrase requested. + * + * @return True if the phrase is untyped or of the given type. + */ + public function maybePhrase($sType) + { + return $this->sPhraseType == '' || $this->sPhraseType == $sType; + } + + /** + * Check if the phrase is exactly of the given type. + * + * @param string $sType Type of phrase requested. + * + * @return True if the phrase is of the given type. + */ + public function isPhrase($sType) + { + return $this->sPhraseType == $sType; + } + + /** + * Return true if the token is the very first in the query. + */ + public function isFirstToken() + { + return $this->iPhrase == 0 && $this->iToken == 0; + } + + /** + * Check if the token is the final one in the query. + */ + public function isLastToken() + { + return $this->iToken + 1 == $this->iNumTokens && $this->iPhrase + 1 == $this->iNumPhrases; + } + + /** + * Check if the current token is part of the first phrase in the query. 
+ */ + public function isFirstPhrase() + { + return $this->iPhrase == 0; + } + + /** + * Get the phrase position in the query. + */ + public function getPhrase() + { + return $this->iPhrase; + } +} diff --git a/lib-php/TokenCountry.php b/lib-php/TokenCountry.php index 518c0a31..c9b7b6af 100644 --- a/lib-php/TokenCountry.php +++ b/lib-php/TokenCountry.php @@ -8,9 +8,9 @@ namespace Nominatim\Token; class Country { /// Database word id, if available. - public $iId; + private $iId; /// Two-letter country code (lower-cased). - public $sCountryCode; + private $sCountryCode; public function __construct($iId, $sCountryCode) { @@ -18,6 +18,44 @@ class Country $this->sCountryCode = $sCountryCode; } + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return True if the token is compatible with the search configuration + * given the position. + */ + public function isExtendable($oSearch, $oPosition) + { + return !$oSearch->hasCountry() && $oPosition->maybePhrase('country'); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return SearchDescription[] List of derived search descriptions. + */ + public function extendSearch($oSearch, $oPosition) + { + $oNewSearch = $oSearch->clone($oPosition->isLastToken() ? 
1 : 6); + $oNewSearch->setCountry($this->sCountryCode); + + return array($oNewSearch); + } + public function debugInfo() { return array( @@ -26,4 +64,9 @@ class Country 'Info' => $this->sCountryCode ); } + + public function debugCode() + { + return 'C'; + } } diff --git a/lib-php/TokenHousenumber.php b/lib-php/TokenHousenumber.php index 5c7c6e9b..cd60d3ca 100644 --- a/lib-php/TokenHousenumber.php +++ b/lib-php/TokenHousenumber.php @@ -8,9 +8,9 @@ namespace Nominatim\Token; class HouseNumber { /// Database word id, if available. - public $iId; + private $iId; /// Normalized house number. - public $sToken; + private $sToken; public function __construct($iId, $sToken) { @@ -18,6 +18,80 @@ class HouseNumber $this->sToken = $sToken; } + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return True if the token is compatible with the search configuration + * given the position. + */ + public function isExtendable($oSearch, $oPosition) + { + return !$oSearch->hasHousenumber() + && !$oSearch->hasOperator(\Nominatim\Operator::POSTCODE) + && $oPosition->maybePhrase('street'); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return SearchDescription[] List of derived search descriptions. 
+ */ + public function extendSearch($oSearch, $oPosition) + { + $aNewSearches = array(); + + // sanity check: if the housenumber is not mainly made + // up of numbers, add a penalty + $iSearchCost = 1; + if (preg_match('/\\d/', $this->sToken) === 0 + || preg_match_all('/[^0-9]/', $this->sToken, $aMatches) > 2) { + $iSearchCost++; + } + if (!$oSearch->hasOperator(\Nominatim\Operator::NONE)) { + $iSearchCost++; + } + if (empty($this->iId)) { + $iSearchCost++; + } + // also must not appear in the middle of the address + if ($oSearch->hasAddress() || $oSearch->hasPostcode()) { + $iSearchCost++; + } + + $oNewSearch = $oSearch->clone($iSearchCost); + $oNewSearch->setHousenumber($this->sToken); + $aNewSearches[] = $oNewSearch; + + // Housenumbers may appear in the name when the place has its own + // address terms. + if ($this->iId !== null + && ($oSearch->getNamePhrase() >= 0 || !$oSearch->hasName()) + && !$oSearch->hasAddress() + ) { + $oNewSearch = $oSearch->clone($iSearchCost); + $oNewSearch->setHousenumberAsName($this->iId); + + $aNewSearches[] = $oNewSearch; + } + + return $aNewSearches; + } + + public function debugInfo() { return array( @@ -26,4 +100,9 @@ class HouseNumber 'Info' => array('nr' => $this->sToken) ); } + + public function debugCode() + { + return 'H'; + } } diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php index 2df9fe05..a599648c 100644 --- a/lib-php/TokenList.php +++ b/lib-php/TokenList.php @@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/TokenHousenumber.php'); require_once(CONST_LibDir.'/TokenPostcode.php'); require_once(CONST_LibDir.'/TokenSpecialTerm.php'); require_once(CONST_LibDir.'/TokenWord.php'); +require_once(CONST_LibDir.'/TokenPartial.php'); require_once(CONST_LibDir.'/SpecialSearchOperator.php'); /** @@ -17,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php'); * tokens do not have a common base class. 
All tokens need to have a field * with the word id that points to an entry in the `word` database table * but otherwise the information saved about a token can be very different. - * - * There are two different kinds of token words: full words and partial terms. - * - * Full words start with a space. They represent a complete name of a place. - * All special tokens are normally full words. - * - * Partial terms have no space at the beginning. They may represent a part of - * a name of a place (e.g. in the name 'World Trade Center' a partial term - * would be 'Trade' or 'Trade Center'). They are only used in TokenWord. */ class TokenList { @@ -64,7 +56,7 @@ class TokenList */ public function containsAny($sWord) { - return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]); + return isset($this->aTokens[$sWord]); } /** @@ -86,8 +78,8 @@ class TokenList foreach ($this->aTokens as $aTokenList) { foreach ($aTokenList as $oToken) { - if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) { - $ids[$oToken->iId] = $oToken->iId; + if (is_a($oToken, '\Nominatim\Token\Word')) { + $ids[$oToken->getId()] = $oToken->getId(); } } } @@ -117,9 +109,9 @@ class TokenList $aWordsIDs = array(); foreach ($this->aTokens as $sToken => $aWords) { foreach ($aWords as $aToken) { - if ($aToken->iId !== null) { - $aWordsIDs[$aToken->iId] = - '#'.$sToken.'('.$aToken->iId.')#'; + $iId = $aToken->getId(); + if ($iId !== null) { + $aWordsIDs[$iId] = '#'.$sToken.'('.$aToken->debugCode().' '.$iId.')#'; } } } diff --git a/lib-php/TokenPartial.php b/lib-php/TokenPartial.php new file mode 100644 index 00000000..131bb2a3 --- /dev/null +++ b/lib-php/TokenPartial.php @@ -0,0 +1,118 @@ +iId = $iId; + $this->bNumberToken = (bool) preg_match('#^[0-9 ]+$#', $sToken); + $this->iSearchNameCount = $iSearchNameCount; + } + + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. 
+ * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return True if the token is compatible with the search configuration + * given the position. + */ + public function isExtendable($oSearch, $oPosition) + { + return !$oPosition->isPhrase('country'); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return SearchDescription[] List of derived search descriptions. + */ + public function extendSearch($oSearch, $oPosition) + { + $aNewSearches = array(); + + // Partial token in Address. + if (($oPosition->isPhrase('') || !$oPosition->isFirstPhrase()) + && $oSearch->hasName() + ) { + $iSearchCost = $this->bNumberToken ? 2 : 1; + if ($this->iSearchNameCount >= CONST_Max_Word_Frequency) { + $iSearchCost += 1; + } + + $oNewSearch = $oSearch->clone($iSearchCost); + $oNewSearch->addAddressToken( + $this->iId, + $this->iSearchNameCount < CONST_Max_Word_Frequency + ); + + $aNewSearches[] = $oNewSearch; + } + + // Partial token in Name. 
+ if ((!$oSearch->hasPostcode() && !$oSearch->hasAddress()) + && (!$oSearch->hasName(true) + || $oSearch->getNamePhrase() == $oPosition->getPhrase()) + ) { + $iSearchCost = 1; + if (!$oSearch->hasName(true)) { + $iSearchCost += 1; + } + if ($this->bNumberToken) { + $iSearchCost += 1; + } + + $oNewSearch = $oSearch->clone($iSearchCost); + $oNewSearch->addPartialNameToken( + $this->iId, + $this->iSearchNameCount < CONST_Max_Word_Frequency, + $oPosition->getPhrase() + ); + + $aNewSearches[] = $oNewSearch; + } + + return $aNewSearches; + } + + + public function debugInfo() + { + return array( + 'ID' => $this->iId, + 'Type' => 'partial', + 'Info' => array( + 'count' => $this->iSearchNameCount + ) + ); + } + + public function debugCode() + { + return 'w'; + } +} diff --git a/lib-php/TokenPostcode.php b/lib-php/TokenPostcode.php index 8fa2ae80..c0b42fad 100644 --- a/lib-php/TokenPostcode.php +++ b/lib-php/TokenPostcode.php @@ -8,11 +8,11 @@ namespace Nominatim\Token; class Postcode { /// Database word id, if available. - public $iId; + private $iId; /// Full nomralized postcode (upper cased). - public $sPostcode; + private $sPostcode; // Optional country code the postcode belongs to (currently unused). - public $sCountryCode; + private $sCountryCode; public function __construct($iId, $sPostcode, $sCountryCode = '') { @@ -21,6 +21,67 @@ class Postcode $this->sCountryCode = empty($sCountryCode) ? '' : $sCountryCode; } + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return True if the token is compatible with the search configuration + * given the position. 
+ */ + public function isExtendable($oSearch, $oPosition) + { + return !$oSearch->hasPostcode() && $oPosition->maybePhrase('postalcode'); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return SearchDescription[] List of derived search descriptions. + */ + public function extendSearch($oSearch, $oPosition) + { + $aNewSearches = array(); + + // If we have structured search or this is the first term, + // make the postcode the primary search element. + if ($oSearch->hasOperator(\Nominatim\Operator::NONE) && $oPosition->isFirstToken()) { + $oNewSearch = $oSearch->clone(1); + $oNewSearch->setPostcodeAsName($this->iId, $this->sPostcode); + + $aNewSearches[] = $oNewSearch; + } + + // If we have a structured search or this is not the first term, + // add the postcode as an addendum. + if (!$oSearch->hasOperator(\Nominatim\Operator::POSTCODE) + && ($oPosition->isPhrase('postalcode') || $oSearch->hasName()) + ) { + $iPenalty = 1; + if (strlen($this->sPostcode) < 4) { + $iPenalty += 4 - strlen($this->sPostcode); + } + $oNewSearch = $oSearch->clone($iPenalty); + $oNewSearch->setPostcode($this->sPostcode); + + $aNewSearches[] = $oNewSearch; + } + + return $aNewSearches; + } + public function debugInfo() { return array( @@ -29,4 +90,9 @@ class Postcode 'Info' => $this->sPostcode.'('.$this->sCountryCode.')' ); } + + public function debugCode() + { + return 'P'; + } } diff --git a/lib-php/TokenSpecialTerm.php b/lib-php/TokenSpecialTerm.php index b2c312ec..5b2d4c70 100644 --- a/lib-php/TokenSpecialTerm.php +++ b/lib-php/TokenSpecialTerm.php @@ -10,13 +10,13 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php'); class SpecialTerm { /// Database word id, if applicable. - public $iId; + private $iId; /// Class (or OSM tag key) of the place to look for. 
- public $sClass; + private $sClass; /// Type (or OSM tag value) of the place to look for. - public $sType; + private $sType; /// Relationship of the operator to the object (see Operator class). - public $iOperator; + private $iOperator; public function __construct($iID, $sClass, $sType, $iOperator) { @@ -26,6 +26,62 @@ class SpecialTerm $this->iOperator = $iOperator; } + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return True if the token is compatible with the search configuration + * given the position. + */ + public function isExtendable($oSearch, $oPosition) + { + return !$oSearch->hasOperator() && $oPosition->isPhrase(''); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + the query. + * + * @return SearchDescription[] List of derived search descriptions. 
+ */ + public function extendSearch($oSearch, $oPosition) + { + $iSearchCost = 2; + + $iOp = $this->iOperator; + if ($iOp == \Nominatim\Operator::NONE) { + if ($oSearch->hasName() || $oSearch->getContext()->isBoundedSearch()) { + $iOp = \Nominatim\Operator::NAME; + } else { + $iOp = \Nominatim\Operator::NEAR; + } + $iSearchCost += 2; + } elseif (!$oPosition->isFirstToken() && !$oPosition->isLastToken()) { + $iSearchCost += 2; + } + if ($oSearch->hasHousenumber()) { + $iSearchCost ++; + } + + $oNewSearch = $oSearch->clone($iSearchCost); + $oNewSearch->setPoiSearch($iOp, $this->sClass, $this->sType); + + return array($oNewSearch); + } + + public function debugInfo() { return array( @@ -38,4 +94,9 @@ class SpecialTerm ) ); } + + public function debugCode() + { + return 'S'; + } } diff --git a/lib-php/TokenWord.php b/lib-php/TokenWord.php index fc28535d..59456e35 100644 --- a/lib-php/TokenWord.php +++ b/lib-php/TokenWord.php @@ -8,31 +8,95 @@ namespace Nominatim\Token; class Word { /// Database word id, if applicable. - public $iId; - /// If true, the word may represent only part of a place name. - public $bPartial; + private $iId; /// Number of appearances in the database. - public $iSearchNameCount; + private $iSearchNameCount; /// Number of terms in the word. - public $iTermCount; + private $iTermCount; - public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount) + public function __construct($iId, $iSearchNameCount, $iTermCount) { $this->iId = $iId; - $this->bPartial = $bPartial; $this->iSearchNameCount = $iSearchNameCount; $this->iTermCount = $iTermCount; } + public function getId() + { + return $this->iId; + } + + /** + * Check if the token can be added to the given search. + * Word tokens are rejected only when the position is a 'country' phrase. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + * the query.
+ * + * @return True if the token is compatible with the search configuration + * given the position. + */ + public function isExtendable($oSearch, $oPosition) + { + return !$oPosition->isPhrase('country'); + } + + /** + * Derive new searches by adding this token to an existing search. + * + * @param object $oSearch Partial search description derived so far. + * @param object $oPosition Description of the token position within + * the query. + * + * @return SearchDescription[] List of derived search descriptions. + */ + public function extendSearch($oSearch, $oPosition) + { + // Full words can only be a name if they appear at the beginning + // of the phrase. In structured search the name must necessarily be in + // the first phrase. In unstructured search it may be in a later + // phrase when the first phrase is a house number. + if ($oSearch->hasName() + || !($oPosition->isFirstPhrase() || $oPosition->isPhrase('')) + ) { + if ($this->iTermCount > 1 + && ($oPosition->isPhrase('') || !$oPosition->isFirstPhrase()) + ) { + $oNewSearch = $oSearch->clone(1); + $oNewSearch->addAddressToken($this->iId); + + return array($oNewSearch); + } + } elseif (!$oSearch->hasName(true)) { + $oNewSearch = $oSearch->clone(1); + $oNewSearch->addNameToken( + $this->iId, + CONST_Search_NameOnlySearchFrequencyThreshold + && $this->iSearchNameCount + < CONST_Search_NameOnlySearchFrequencyThreshold + ); + + return array($oNewSearch); + } + + return array(); + } + public function debugInfo() { return array( 'ID' => $this->iId, 'Type' => 'word', 'Info' => array( - 'partial' => $this->bPartial, - 'count' => $this->iSearchNameCount + 'count' => $this->iSearchNameCount, + 'terms' => $this->iTermCount ) ); } + + public function debugCode() + { + return 'W'; + } } diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php index 92dd7272..2c0884c8 100644 --- a/lib-php/tokenizer/legacy_icu_tokenizer.php +++ b/lib-php/tokenizer/legacy_icu_tokenizer.php @@ -120,14
+120,14 @@ class Tokenizer // Try more interpretations for Tokens that could not be matched. foreach ($aTokens as $sToken) { - if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { - if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { + if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) { + if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { // US ZIP+4 codes - merge in the 5-digit ZIP code $oValidTokens->addToken( $sToken, new Token\Postcode(null, $aData[1], 'us') ); - } elseif (preg_match('/^ [0-9]+$/', $sToken)) { + } elseif (preg_match('/^[0-9]+$/', $sToken)) { // Unknown single word token with a number. // Assume it is a house number. $oValidTokens->addToken( @@ -195,17 +195,28 @@ class Tokenizer ) { $oToken = new Token\Country($iId, $aWord['country_code']); } + } elseif ($aWord['word_token'][0] == ' ') { + // Token\Word takes (id, search name count, term count). + $oToken = new Token\Word( + $iId, + (int) $aWord['count'], + substr_count($aWord['word_token'], ' ') + ); } else { - $oToken = new Token\Word( + $oToken = new Token\Partial( $iId, - $aWord['word_token'][0] != ' ', - (int) $aWord['count'], - substr_count($aWord['word_token'], ' ') + $aWord['word_token'], + (int) $aWord['count'] ); } if ($oToken) { - $oValidTokens->addToken($aWord['word_token'], $oToken); + // remove any leading spaces + if ($aWord['word_token'][0] == ' ') { + $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken); + } else { + $oValidTokens->addToken($aWord['word_token'], $oToken); + } } } } diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php index 50207c31..064b4166 100644 --- a/lib-php/tokenizer/legacy_tokenizer.php +++ b/lib-php/tokenizer/legacy_tokenizer.php @@ -137,14 +137,14 @@ class Tokenizer // Try more interpretations for Tokens that could not be matched.
foreach ($aTokens as $sToken) { - if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { - if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { + if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) { + if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { // US ZIP+4 codes - merge in the 5-digit ZIP code $oValidTokens->addToken( $sToken, new Token\Postcode(null, $aData[1], 'us') ); - } elseif (preg_match('/^ [0-9]+$/', $sToken)) { + } elseif (preg_match('/^[0-9]+$/', $sToken)) { // Unknown single word token with a number. // Assume it is a house number. $oValidTokens->addToken( @@ -212,17 +212,29 @@ class Tokenizer ) { $oToken = new Token\Country($iId, $aWord['country_code']); } - } else { + } elseif ($aWord['word_token'][0] == ' ') { $oToken = new Token\Word( $iId, - $aWord['word_token'][0] != ' ', (int) $aWord['count'], substr_count($aWord['word_token'], ' ') ); + // For backward compatibility: ignore all partial tokens that consist + // of more than one word (i.e. contain a space). + } elseif (strpos($aWord['word_token'], ' ') === false) { + $oToken = new Token\Partial( + $iId, + $aWord['word_token'], + (int) $aWord['count'] + ); } if ($oToken) { - $oValidTokens->addToken($aWord['word_token'], $oToken); + // strip the single leading space (if any) before registering the token + if ($aWord['word_token'][0] == ' ') { + $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken); + } else { + $oValidTokens->addToken($aWord['word_token'], $oToken); + } } } }