From: Sarah Hoffmann Date: Thu, 31 Jan 2013 19:57:31 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' X-Git-Tag: deploy~642 X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/ea04658bde5c61965442562889292351b8df79ec?ds=inline;hp=-c Merge remote-tracking branch 'upstream/master' --- ea04658bde5c61965442562889292351b8df79ec diff --combined website/search.php index 77a3d135,be74cc9b..c3edbcd6 --- a/website/search.php +++ b/website/search.php @@@ -71,7 -71,6 +71,7 @@@ if (isset($aLangPrefOrder['name:de'])) $bReverseInPlan = true; if (isset($aLangPrefOrder['name:ru'])) $bReverseInPlan = true; if (isset($aLangPrefOrder['name:ja'])) $bReverseInPlan = true; + if (isset($aLangPrefOrder['name:pl'])) $bReverseInPlan = true; $sLanguagePrefArraySQL = "ARRAY[".join(',',array_map("getDBQuoted",$aLangPrefOrder))."]"; @@@ -287,6 -286,7 +287,7 @@@ // Start with a blank search $aSearches = array( array('iSearchRank' => 0, 'iNamePhrase' => -1, 'sCountryCode' => false, 'aName'=>array(), 'aAddress'=>array(), + 'aNameNonSearch'=>array(), 'aAddressNonSearch'=>array(), 'sOperator'=>'', 'aFeatureName' => array(), 'sClass'=>'', 'sType'=>'', 'sHouseNumber'=>'', 'fLat'=>'', 'fLon'=>'', 'fRadius'=>'') ); @@@ -398,7 -398,7 +399,7 @@@ // Check which tokens we have, get the ID numbers $sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator, search_name_count'; $sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')'; - $sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency; + // $sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency; // $sSQL .= ' group by word_token, word, class, type, location, country_code'; if (CONST_Debug) var_Dump($sSQL); @@@ -413,8 -413,16 +414,16 @@@ failInternalError("Could not get word tokens.", $sSQL, $aDatabaseWords); } $aPossibleMainWordIDs = array(); + $aWordFrequencyScores = array(); foreach($aDatabaseWords as $aToken) { + // Very special case - require 2 letter country param to match the country code found + if ($bStructuredPhrases && $aToken['country_code'] && !empty($aStructuredQuery['country']) + && strlen($aStructuredQuery['country']) == 2 && strtolower($aStructuredQuery['country']) != $aToken['country_code']) + { + continue; + } + if (isset($aValidTokens[$aToken['word_token']])) { $aValidTokens[$aToken['word_token']][] = $aToken; @@@ -423,7 -431,8 +432,8 @@@ { $aValidTokens[$aToken['word_token']] = array($aToken); } - if ($aToken['word_token'][0]==' ' && !$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1 + $aToken['search_name_count']; + if ($aToken['word_token'][0]==' ' && !$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1; + $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1; } if (CONST_Debug) var_Dump($aPhrases, $aValidTokens); @@@ -599,7 -608,7 +609,7 @@@ { if (sizeof($aSearch['aName'])) { - if (($sPhraseType != 'street' && $sPhraseType != 'country') && (!isset($aValidTokens[$sToken]) || strlen($sToken) < 4 || strpos($sToken, ' ') !== false)) + if ((!$bStructuredPhrases || $iPhrase > 0) && $sPhraseType != 'country' && (!isset($aValidTokens[$sToken]) || strlen($sToken) < 4 || strpos($sToken, ' ') !== false)) { $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; } @@@ -624,12 -633,35 +634,35 @@@ { if (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) { - if (($sPhraseType != 'street') && sizeof($aCurrentSearch['aName']) && strlen($sToken) >= 4) + if ((!$bStructuredPhrases || $iPhrase > 0) && sizeof($aCurrentSearch['aName']) && strlen($sToken) >= 4) { $aSearch = $aCurrentSearch; $aSearch['iSearchRank'] += 1; - $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; - if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; + if ($aWordFrequencyScores[$aSearchTerm['word_id']] < CONST_Max_Word_Frequency) + { + $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; + } + elseif (isset($aValidTokens[' '.$sToken])) // revert to the token version? + { + foreach($aValidTokens[' '.$sToken] as $aSearchTermToken) + { + if (empty($aSearchTermToken['country_code']) + && empty($aSearchTermToken['lat']) + && empty($aSearchTermToken['class'])) + { + $aSearch = $aCurrentSearch; + $aSearch['iSearchRank'] += 1; + $aSearch['aAddress'][$aSearchTermToken['word_id']] = $aSearchTermToken['word_id']; + if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; + } + } + } + else + { + $aSearch['aAddressNonSearch'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; + } } if (!sizeof($aCurrentSearch['aName']) || $aCurrentSearch['iNamePhrase'] == $iPhrase) @@@ -637,7 -669,10 +670,10 @@@ $aSearch = $aCurrentSearch; $aSearch['iSearchRank'] += 2; if (preg_match('#^[0-9]+$#', $sToken)) $aSearch['iSearchRank'] += 2; - $aSearch['aName'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + if ($aWordFrequencyScores[$aSearchTerm['word_id']] < CONST_Max_Word_Frequency) + $aSearch['aName'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + else + $aSearch['aNameNonSearch'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; $aSearch['iNamePhrase'] = $iPhrase; if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@@ -885,18 -920,20 +921,20 @@@ // TODO: filter out the pointless search terms (2 letter name tokens and less) // they might be right - but they are just too darned expensive to run if (sizeof($aSearch['aName'])) $aTerms[] = "name_vector @> ARRAY[".join($aSearch['aName'],",")."]"; + if (sizeof($aSearch['aNameNonSearch'])) $aTerms[] = "array_cat(name_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aNameNonSearch'],",")."]"; if (sizeof($aSearch['aAddress']) && $aSearch['aName'] != $aSearch['aAddress']) { // For infrequent name terms disable index usage for address if (CONST_Search_NameOnlySearchFrequencyThreshold && sizeof($aSearch['aName']) == 1 && - $aPossibleMainWordIDs[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold) + $aWordFrequencyScores[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold) { - $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddress'],",")."]"; + $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join(array_merge($aSearch['aAddress'],$aSearch['aAddressNonSearch']),",")."]"; } else { $aTerms[] = "nameaddress_vector @> ARRAY[".join($aSearch['aAddress'],",")."]"; + if (sizeof($aSearch['aAddressNonSearch'])) $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddressNonSearch'],",")."]"; } } if ($aSearch['sCountryCode']) $aTerms[] = "country_code = '".pg_escape_string($aSearch['sCountryCode'])."'"; @@@ -937,17 -974,12 +975,17 @@@ else $sSQL .= " limit ".$iLimit; - if (CONST_Debug) { var_dump($sSQL); } + if (CONST_Debug) var_dump($sSQL); + $iStartTime = time(); $aViewBoxPlaceIDs = $oDB->getAll($sSQL); if (PEAR::IsError($aViewBoxPlaceIDs)) { failInternalError("Could not get places for search terms.", $sSQL, $aViewBoxPlaceIDs); } + if (time() - $iStartTime > 60) { + file_put_contents(CONST_BasePath.'/log/long_queries.log', date('Y-m-d H:i:s', $iStartTime).' '.$sSQL."\n", FILE_APPEND); + } + //var_dump($aViewBoxPlaceIDs); // Did we have an viewbox matches? $aPlaceIDs = array(); @@@ -1294,7 -1326,7 +1332,7 @@@ //var_Dump($aSearchResults); //exit; $aClassType = getClassTypesWithImportance(); - $aRecheckWords = preg_split('/\b/',$sQuery); + $aRecheckWords = preg_split('/\b/u',$sQuery); foreach($aRecheckWords as $i => $sWord) { if (!$sWord) unset($aRecheckWords[$i]); @@@ -1498,7 -1530,7 +1536,7 @@@ if (sizeof($aSearchResults) >= $iFinalLimit) break; } - $sDataDate = $oDB->getOne("select TO_CHAR(lastimportdate - '1 day'::interval,'YYYY/MM/DD') from import_status limit 1"); + $sDataDate = $oDB->getOne("select TO_CHAR(lastimportdate - '2 minutes'::interval,'YYYY/MM/DD HH24:MI')||' GMT' from import_status limit 1"); if (isset($_GET['nearlat']) && isset($_GET['nearlon'])) {