X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/bd27310c68da663aadef9038ec600793444e786b..143ff1446656ee995356c7a7b5eaa624140c70d8:/lib-php/Geocode.php diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php index b475add2..eda6df54 100644 --- a/lib-php/Geocode.php +++ b/lib-php/Geocode.php @@ -8,17 +8,18 @@ require_once(CONST_LibDir.'/ReverseGeocode.php'); require_once(CONST_LibDir.'/SearchDescription.php'); require_once(CONST_LibDir.'/SearchContext.php'); require_once(CONST_LibDir.'/TokenList.php'); +require_once(CONST_TokenizerDir.'/tokenizer.php'); class Geocode { protected $oDB; protected $oPlaceLookup; + protected $oTokenizer; protected $aLangPrefOrder = array(); protected $aExcludePlaceIDs = array(); - protected $bReverseInPlan = true; protected $iLimit = 20; protected $iFinalLimit = 10; @@ -42,28 +43,12 @@ class Geocode protected $sQuery = false; protected $aStructuredQuery = false; - protected $oNormalizer = null; - public function __construct(&$oDB) { $this->oDB =& $oDB; $this->oPlaceLookup = new PlaceLookup($this->oDB); - $this->oNormalizer = \Transliterator::createFromRules(CONST_Term_Normalization_Rules); - } - - private function normTerm($sTerm) - { - if ($this->oNormalizer === null) { - return $sTerm; - } - - return $this->oNormalizer->transliterate($sTerm); - } - - public function setReverseInPlan($bReverse) - { - $this->bReverseInPlan = $bReverse; + $this->oTokenizer = new \Nominatim\Tokenizer($this->oDB); } public function setLanguagePreference($aLangPref) @@ -85,7 +70,9 @@ class Geocode $aParams['exclude_place_ids'] = implode(',', $this->aExcludePlaceIDs); } - if ($this->bBoundedSearch) $aParams['bounded'] = '1'; + if ($this->bBoundedSearch) { + $aParams['bounded'] = '1'; + } if ($this->aCountryCodes) { $aParams['countrycodes'] = implode(',', $this->aCountryCodes); @@ -100,8 +87,11 @@ class Geocode public function setLimit($iLimit = 10) { - if ($iLimit > 50) $iLimit = 50; - if ($iLimit < 1) $iLimit = 1; + if ($iLimit > 50) { + $iLimit = 50; + } elseif ($iLimit < 1) { + $iLimit = 1; + } $this->iFinalLimit = $iLimit; $this->iLimit = $iLimit + min($iLimit, 10); @@ -196,18 +186,24 @@ class Geocode if ($sExcluded) { foreach ($sExcluded as $iExcludedPlaceID) { $iExcludedPlaceID = (int)$iExcludedPlaceID; - if ($iExcludedPlaceID) + if ($iExcludedPlaceID) { $aExcludePlaceIDs[$iExcludedPlaceID] = $iExcludedPlaceID; + } } - if (isset($aExcludePlaceIDs)) + if (isset($aExcludePlaceIDs)) { $this->aExcludePlaceIDs = $aExcludePlaceIDs; + } } // Only certain ranks of feature $sFeatureType = $oParams->getString('featureType'); - if (!$sFeatureType) $sFeatureType = $oParams->getString('featuretype'); - if ($sFeatureType) $this->setFeatureType($sFeatureType); + if (!$sFeatureType) { + $sFeatureType = $oParams->getString('featuretype'); + } + if ($sFeatureType) { + $this->setFeatureType($sFeatureType); + } // Country code list $sCountries = $oParams->getStringList('countrycodes'); @@ -217,8 +213,9 @@ class Geocode $aCountries[] = strtolower($sCountryCode); } } - if (isset($aCountries)) + if (isset($aCountries)) { $this->aCountryCodes = $aCountries; + } } $aViewbox = $oParams->getStringList('viewboxlbrt'); @@ -262,7 +259,6 @@ class Geocode $oParams->getString('country'), $oParams->getString('postalcode') ); - $this->setReverseInPlan(false); } else { $this->setQuery($sQuery); } @@ -271,13 +267,17 @@ class Geocode public function loadStructuredAddressElement($sValue, $sKey, $iNewMinAddressRank, $iNewMaxAddressRank, $aItemListValues) { $sValue = trim($sValue); - if (!$sValue) return false; + if (!$sValue) { + return false; + } $this->aStructuredQuery[$sKey] = $sValue; if ($this->iMinAddressRank == 0 && $this->iMaxAddressRank == 30) { $this->iMinAddressRank = $iNewMinAddressRank; $this->iMaxAddressRank = $iNewMaxAddressRank; } - if ($aItemListValues) $this->aAddressRankList = array_merge($this->aAddressRankList, $aItemListValues); + if ($aItemListValues) { + $this->aAddressRankList = array_merge($this->aAddressRankList, $aItemListValues); + } return true; } @@ -311,11 +311,11 @@ class Geocode public function fallbackStructuredQuery() { - if (!$this->aStructuredQuery) return false; - $aParams = $this->aStructuredQuery; - if (count($aParams) == 1) return false; + if (!$aParams || count($aParams) == 1) { + return false; + } $aOrderToFallback = array('postalcode', 'street', 'city', 'county', 'state'); @@ -330,7 +330,7 @@ class Geocode return false; } - public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bIsStructured) + public function getGroupedSearches($aSearches, $aPhrases, $oValidTokens) { /* Calculate all searches using oValidTokens i.e. @@ -345,32 +345,25 @@ class Geocode */ foreach ($aPhrases as $iPhrase => $oPhrase) { $aNewPhraseSearches = array(); - $sPhraseType = $bIsStructured ? $oPhrase->getPhraseType() : ''; + $sPhraseType = $oPhrase->getPhraseType(); foreach ($oPhrase->getWordSets() as $aWordset) { $aWordsetSearches = $aSearches; // Add all words from this wordset foreach ($aWordset as $iToken => $sToken) { - //echo "
$sToken"; $aNewWordsetSearches = array(); foreach ($aWordsetSearches as $oCurrentSearch) { - //echo ""; - //var_dump($oCurrentSearch); - //echo ""; - - // Tokens with full name matches. - foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) { - $aNewSearches = $oCurrentSearch->extendWithFullTerm( + foreach ($oValidTokens->get($sToken) as $oSearchTerm) { + $aNewSearches = $oCurrentSearch->extendWithSearchTerm( + $sToken, $oSearchTerm, - $oValidTokens->contains($sToken) - && strpos($sToken, ' ') === false, $sPhraseType, $iToken == 0 && $iPhrase == 0, - $iPhrase == 0, $iToken + 1 == count($aWordset) - && $iPhrase + 1 == count($aPhrases) + && $iPhrase + 1 == count($aPhrases), + $iPhrase ); foreach ($aNewSearches as $oSearch) { @@ -379,33 +372,11 @@ class Geocode } } } - // Look for partial matches. - // Note that there is no point in adding country terms here - // because country is omitted in the address. - if ($sPhraseType != 'country') { - // Allow searching for a word - but at extra cost - foreach ($oValidTokens->get($sToken) as $oSearchTerm) { - $aNewSearches = $oCurrentSearch->extendWithPartialTerm( - $sToken, - $oSearchTerm, - $bIsStructured, - $iPhrase, - $oValidTokens->get(' '.$sToken) - ); - - foreach ($aNewSearches as $oSearch) { - if ($oSearch->getRank() < $this->iMaxRank) { - $aNewWordsetSearches[] = $oSearch; - } - } - } - } } // Sort and cut usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank')); $aWordsetSearches = array_slice($aNewWordsetSearches, 0, 50); } - //var_Dump('
',count($aWordsetSearches)); exit; $aNewPhraseSearches = array_merge($aNewPhraseSearches, $aNewWordsetSearches); usort($aNewPhraseSearches, array('Nominatim\SearchDescription', 'bySearchRank')); @@ -413,8 +384,11 @@ class Geocode $aSearchHash = array(); foreach ($aNewPhraseSearches as $iSearch => $aSearch) { $sHash = serialize($aSearch); - if (isset($aSearchHash[$sHash])) unset($aNewPhraseSearches[$iSearch]); - else $aSearchHash[$sHash] = 1; + if (isset($aSearchHash[$sHash])) { + unset($aNewPhraseSearches[$iSearch]); + } else { + $aSearchHash[$sHash] = 1; + } } $aNewPhraseSearches = array_slice($aNewPhraseSearches, 0, 50); @@ -435,10 +409,12 @@ class Geocode $iSearchCount = 0; $aSearches = array(); - foreach ($aGroupedSearches as $iScore => $aNewSearches) { + foreach ($aGroupedSearches as $aNewSearches) { $iSearchCount += count($aNewSearches); $aSearches = array_merge($aSearches, $aNewSearches); - if ($iSearchCount > 50) break; + if ($iSearchCount > 50) { + break; + } } } @@ -495,7 +471,9 @@ class Geocode public function lookup() { Debug::newFunction('Geocode::lookup'); - if (!$this->sQuery && !$this->aStructuredQuery) return array(); + if (!$this->sQuery && !$this->aStructuredQuery) { + return array(); + } Debug::printDebugArray('Geocode', $this); @@ -517,16 +495,10 @@ class Geocode if ($this->aCountryCodes) { $oCtx->setCountryList($this->aCountryCodes); } + $this->oTokenizer->setCountryRestriction($this->aCountryCodes); Debug::newSection('Query Preprocessing'); - $sNormQuery = $this->normTerm($this->sQuery); - Debug::printVar('Normalized query', $sNormQuery); - - $sLanguagePrefArraySQL = $this->oDB->getArraySQL( - $this->oDB->getDBQuotedList($this->aLangPrefOrder) - ); - $sQuery = $this->sQuery; if (!preg_match('//u', $sQuery)) { userError('Query string is not UTF-8 encoded.'); @@ -576,117 +548,62 @@ class Geocode } if ($sSpecialTerm && !$aSearches[0]->hasOperator()) { - $sSpecialTerm = pg_escape_string($sSpecialTerm); - $sToken = $this->oDB->getOne( - 'SELECT make_standard_name(:term)', - array(':term' => $sSpecialTerm), - 'Cannot decode query. Wrong encoding?' - ); - $sSQL = 'SELECT class, type FROM word '; - $sSQL .= ' WHERE word_token in (\' '.$sToken.'\')'; - $sSQL .= ' AND class is not null AND class not in (\'place\')'; - - Debug::printSQL($sSQL); - $aSearchWords = $this->oDB->getAll($sSQL); - $aNewSearches = array(); - foreach ($aSearches as $oSearch) { - foreach ($aSearchWords as $aSearchTerm) { - $oNewSearch = clone $oSearch; - $oNewSearch->setPoiSearch( - Operator::TYPE, - $aSearchTerm['class'], - $aSearchTerm['type'] - ); - $aNewSearches[] = $oNewSearch; + $aTokens = $this->oTokenizer->tokensForSpecialTerm($sSpecialTerm); + + if (!empty($aTokens)) { + $aNewSearches = array(); + foreach ($aSearches as $oSearch) { + foreach ($aTokens as $oToken) { + $oNewSearch = clone $oSearch; + $oNewSearch->setPoiSearch( + $oToken->iOperator, + $oToken->sClass, + $oToken->sType + ); + $aNewSearches[] = $oNewSearch; + } } + $aSearches = $aNewSearches; } - $aSearches = $aNewSearches; } // Split query into phrases // Commas are used to reduce the search space by indicating where phrases split + $aPhrases = array(); if ($this->aStructuredQuery) { - $aInPhrases = $this->aStructuredQuery; - $bStructuredPhrases = true; + foreach ($this->aStructuredQuery as $iPhrase => $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, $iPhrase); + } } else { - $aInPhrases = explode(',', $sQuery); - $bStructuredPhrases = false; + foreach (explode(',', $sQuery) as $sPhrase) { + $aPhrases[] = new Phrase($sPhrase, ''); + } } Debug::printDebugArray('Search context', $oCtx); Debug::printDebugArray('Base search', empty($aSearches) ? null : $aSearches[0]); - Debug::printVar('Final query phrases', $aInPhrases); - // Convert each phrase to standard form - // Create a list of standard words - // Get all 'sets' of words - // Generate a complete list of all Debug::newSection('Tokenization'); - $aTokens = array(); - $aPhrases = array(); - foreach ($aInPhrases as $iPhrase => $sPhrase) { - $sPhrase = $this->oDB->getOne( - 'SELECT make_standard_name(:phrase)', - array(':phrase' => $sPhrase), - 'Cannot normalize query string (is it a UTF-8 string?)' - ); - if (trim($sPhrase)) { - $oPhrase = new Phrase($sPhrase, is_string($iPhrase) ? $iPhrase : ''); - $oPhrase->addTokens($aTokens); - $aPhrases[] = $oPhrase; - } - } - - Debug::printVar('Tokens', $aTokens); - - $oValidTokens = new TokenList(); - - if (!empty($aTokens)) { - $oValidTokens->addTokensFromDB( - $this->oDB, - $aTokens, - $this->aCountryCodes, - $sNormQuery, - $this->oNormalizer - ); + $oValidTokens = $this->oTokenizer->extractTokensFromPhrases($aPhrases); + if ($oValidTokens->count() > 0) { $oCtx->setFullNameWords($oValidTokens->getFullWordIDs()); - // Try more interpretations for Tokens that could not be matched. - foreach ($aTokens as $sToken) { - if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) { - if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) { - // US ZIP+4 codes - merge in the 5-digit ZIP code - $oValidTokens->addToken( - $sToken, - new Token\Postcode(null, $aData[1], 'us') - ); - } elseif (preg_match('/^ [0-9]+$/', $sToken)) { - // Unknown single word token with a number. - // Assume it is a house number. - $oValidTokens->addToken( - $sToken, - new Token\HouseNumber(null, trim($sToken)) - ); - } - } - } + $aPhrases = array_filter($aPhrases, function ($oPhrase) { + return $oPhrase->getWordSets() !== null; + }); // Any words that have failed completely? // TODO: suggestions Debug::printGroupTable('Valid Tokens', $oValidTokens->debugInfo()); - - foreach ($aPhrases as $oPhrase) { - $oPhrase->computeWordSets($oValidTokens); - } Debug::printDebugTable('Phrases', $aPhrases); Debug::newSection('Search candidates'); - $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, $bStructuredPhrases); + $aGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); - if ($this->bReverseInPlan) { + if (!$this->aStructuredQuery) { // Reverse phrase array and also reverse the order of the wordsets in // the first and final phrase. Don't bother about phrases in the middle // because order in the address doesn't matter. @@ -695,7 +612,7 @@ class Geocode if (count($aPhrases) > 1) { $aPhrases[count($aPhrases)-1]->invertWordSets(); } - $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens, false); + $aReverseGroupedSearches = $this->getGroupedSearches($aSearches, $aPhrases, $oValidTokens); foreach ($aGroupedSearches as $aSearches) { foreach ($aSearches as $aSearch) { @@ -714,7 +631,9 @@ class Geocode $aGroupedSearches = array(); foreach ($aSearches as $aSearch) { if ($aSearch->getRank() < $this->iMaxRank) { - if (!isset($aGroupedSearches[$aSearch->getRank()])) $aGroupedSearches[$aSearch->getRank()] = array(); + if (!isset($aGroupedSearches[$aSearch->getRank()])) { + $aGroupedSearches[$aSearch->getRank()] = array(); + } $aGroupedSearches[$aSearch->getRank()][] = $aSearch; } } @@ -728,7 +647,9 @@ class Geocode $sHash = serialize($aSearch); if (isset($aSearchHash[$sHash])) { unset($aGroupedSearches[$iGroup][$iSearch]); - if (empty($aGroupedSearches[$iGroup])) unset($aGroupedSearches[$iGroup]); + if (empty($aGroupedSearches[$iGroup])) { + unset($aGroupedSearches[$iGroup]); + } } else { $aSearchHash[$sHash] = 1; } @@ -772,14 +693,17 @@ class Geocode } } - if ($iQueryLoop > 20) break; + if ($iQueryLoop > 20) { + break; + } } if (!empty($aResults)) { $aSplitResults = Result::splitResults($aResults); Debug::printVar('Split results', $aSplitResults); if ($iGroupLoop <= 4 - && reset($aSplitResults['head'])->iResultRank > 0) { + && reset($aSplitResults['head'])->iResultRank > 0 + && $iGroupedRank !== array_key_last($aGroupedSearches)) { // Haven't found an exact match for the query yet. // Therefore add result from the next group level. $aNextResults = $aSplitResults['head']; @@ -837,7 +761,6 @@ class Geocode foreach ($aResults as $oResult) { if (($this->iMaxAddressRank == 30 && ($oResult->iTable == Result::TABLE_OSMLINE - || $oResult->iTable == Result::TABLE_AUX || $oResult->iTable == Result::TABLE_TIGER)) || in_array($oResult->iId, $aFilteredIDs) ) { @@ -847,9 +770,9 @@ class Geocode $aResults = $tempIDs; } - if (!empty($aResults)) break; - if ($iGroupLoop > 4) break; - if ($iQueryLoop > 30) break; + if (!empty($aResults) || $iGroupLoop > 4 || $iQueryLoop > 30) { + break; + } } } else { // Just interpret as a reverse geocode @@ -867,10 +790,8 @@ class Geocode // No results? Done if (empty($aResults)) { - if ($this->bFallback) { - if ($this->fallbackStructuredQuery()) { - return $this->lookup(); - } + if ($this->bFallback && $this->fallbackStructuredQuery()) { + return $this->lookup(); } return array(); @@ -889,7 +810,9 @@ class Geocode $aRecheckWords = preg_split('/\b[\s,\\-]*/u', $sQuery); foreach ($aRecheckWords as $i => $sWord) { - if (!preg_match('/[\pL\pN]/', $sWord)) unset($aRecheckWords[$i]); + if (!preg_match('/[\pL\pN]/', $sWord)) { + unset($aRecheckWords[$i]); + } } Debug::printVar('Recheck words', $aRecheckWords); @@ -949,7 +872,9 @@ class Geocode foreach ($aRecheckWords as $i => $sWord) { if (stripos($sAddress, $sWord)!==false) { $iCountWords++; - if (preg_match('/(^|,)\s*'.preg_quote($sWord, '/').'\s*(,|$)/', $sAddress)) $iCountWords += 0.1; + if (preg_match('/(^|,)\s*'.preg_quote($sWord, '/').'\s*(,|$)/', $sAddress)) { + $iCountWords += 0.1; + } } } @@ -966,15 +891,8 @@ class Geocode $aToFilter = $aSearchResults; $aSearchResults = array(); - $bFirst = true; foreach ($aToFilter as $aResult) { $this->aExcludePlaceIDs[$aResult['place_id']] = $aResult['place_id']; - if ($bFirst) { - $fLat = $aResult['lat']; - $fLon = $aResult['lon']; - if (isset($aResult['zoom'])) $iZoom = $aResult['zoom']; - $bFirst = false; - } if (!$this->oPlaceLookup->doDeDupe() || (!isset($aOSMIDDone[$aResult['osm_type'].$aResult['osm_id']]) && !isset($aClassTypeNameDone[$aResult['osm_type'].$aResult['class'].$aResult['type'].$aResult['name'].$aResult['admin_level']])) ) { @@ -984,7 +902,9 @@ class Geocode } // Absolute limit on number of results - if (count($aSearchResults) >= $this->iFinalLimit) break; + if (count($aSearchResults) >= $this->iFinalLimit) { + break; + } } Debug::printVar('Post-filter results', $aSearchResults); @@ -998,7 +918,6 @@ class Geocode 'Structured query' => $this->aStructuredQuery, 'Name keys' => Debug::fmtArrayVals($this->aLangPrefOrder), 'Excluded place IDs' => Debug::fmtArrayVals($this->aExcludePlaceIDs), - 'Try reversed query'=> $this->bReverseInPlan, 'Limit (for searches)' => $this->iLimit, 'Limit (for results)'=> $this->iFinalLimit, 'Country codes' => Debug::fmtArrayVals($this->aCountryCodes),