From 68e4fddbc694fe0268bac91c86107d67507b3462 Mon Sep 17 00:00:00 2001 From: Brian Quinion Date: Thu, 24 Jan 2013 18:00:29 +0000 Subject: [PATCH] =?utf8?q?Change=20round=20how=20CONST=5FMax=5FWord=5FFreq?= =?utf8?q?uency=20is=20applied.=20=20Was=20causing=20some=20valid=20combin?= =?utf8?q?ations=20to=20be=20skipped=20(e.g.=20Casal=20De=20S=C3=A3o=20Tom?= =?utf8?q?=C3=A9).=20=20Change=20structured=20queries=20so=20the=20first?= =?utf8?q?=20search=20term=20can't=20end=20up=20in=20the=20address=20term?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- website/search.php | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/website/search.php b/website/search.php index a1860f25..80a05a64 100755 --- a/website/search.php +++ b/website/search.php @@ -286,6 +286,7 @@ // Start with a blank search $aSearches = array( array('iSearchRank' => 0, 'iNamePhrase' => -1, 'sCountryCode' => false, 'aName'=>array(), 'aAddress'=>array(), + 'aNameNonSearch'=>array(), 'aAddressNonSearch'=>array(), 'sOperator'=>'', 'aFeatureName' => array(), 'sClass'=>'', 'sType'=>'', 'sHouseNumber'=>'', 'fLat'=>'', 'fLon'=>'', 'fRadius'=>'') ); @@ -397,7 +398,7 @@ // Check which tokens we have, get the ID numbers $sSQL = 'select word_id,word_token, word, class, type, location, country_code, operator, search_name_count'; $sSQL .= ' from word where word_token in ('.join(',',array_map("getDBQuoted",$aTokens)).')'; - $sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency; +// $sSQL .= ' and search_name_count < '.CONST_Max_Word_Frequency; // $sSQL .= ' group by word_token, word, class, type, location, country_code'; if (CONST_Debug) var_Dump($sSQL); @@ -412,6 +413,7 @@ failInternalError("Could not get word tokens.", $sSQL, $aDatabaseWords); } $aPossibleMainWordIDs = array(); + $aWordFrequencyScores = array(); foreach($aDatabaseWords as $aToken) { if (isset($aValidTokens[$aToken['word_token']])) @@ -422,7 +424,8 @@ { $aValidTokens[$aToken['word_token']] = array($aToken); } - if ($aToken['word_token'][0]==' ' && !$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1 + $aToken['search_name_count']; + if ($aToken['word_token'][0]==' ' && !$aToken['class'] && !$aToken['country_code']) $aPossibleMainWordIDs[$aToken['word_id']] = 1; + $aWordFrequencyScores[$aToken['word_id']] = $aToken['search_name_count'] + 1; } if (CONST_Debug) var_Dump($aPhrases, $aValidTokens); @@ -598,7 +601,7 @@ { if (sizeof($aSearch['aName'])) { - if (($sPhraseType != 'street' && $sPhraseType != 'country') && (!isset($aValidTokens[$sToken]) || strlen($sToken) < 4 || strpos($sToken, ' ') !== false)) + if ((!$bStructuredPhrases || $iPhrase > 0) && $sPhraseType != 'country' && (!isset($aValidTokens[$sToken]) || strlen($sToken) < 4 || strpos($sToken, ' ') !== false)) { $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; } @@ -623,11 +626,14 @@ { if (isset($aSearchTerm['word_id']) && $aSearchTerm['word_id']) { - if (($sPhraseType != 'street') && sizeof($aCurrentSearch['aName']) && strlen($sToken) >= 4) + if ((!$bStructuredPhrases || $iPhrase > 0) && sizeof($aCurrentSearch['aName']) && strlen($sToken) >= 4) { $aSearch = $aCurrentSearch; $aSearch['iSearchRank'] += 1; - $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + if ($aWordFrequencyScores[$aSearchTerm['word_id']] < CONST_Max_Word_Frequency) + $aSearch['aAddress'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + else + $aSearch['aAddressNonSearch'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -636,7 +642,10 @@ $aSearch = $aCurrentSearch; $aSearch['iSearchRank'] += 2; if (preg_match('#^[0-9]+$#', $sToken)) $aSearch['iSearchRank'] += 2; - $aSearch['aName'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + if ($aWordFrequencyScores[$aSearchTerm['word_id']] < CONST_Max_Word_Frequency) + $aSearch['aName'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; + else + $aSearch['aNameNonSearch'][$aSearchTerm['word_id']] = $aSearchTerm['word_id']; $aSearch['iNamePhrase'] = $iPhrase; if ($aSearch['iSearchRank'] < $iMaxRank) $aNewWordsetSearches[] = $aSearch; } @@ -889,7 +898,7 @@ // For infrequent name terms disable index usage for address if (CONST_Search_NameOnlySearchFrequencyThreshold && sizeof($aSearch['aName']) == 1 && - $aPossibleMainWordIDs[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold) + $aWordFrequencyScores[$aSearch['aName'][reset($aSearch['aName'])]] < CONST_Search_NameOnlySearchFrequencyThreshold) { $aTerms[] = "array_cat(nameaddress_vector,ARRAY[]::integer[]) @> ARRAY[".join($aSearch['aAddress'],",")."]"; } -- 2.39.5