From: Sarah Hoffmann
Date: Tue, 13 Jul 2021 14:54:51 +0000 (+0200)
Subject: introduce a separate token type for partials
X-Git-Tag: v4.0.0~50^2~8
X-Git-Url: https://git.openstreetmap.org/nominatim.git/commitdiff_plain/6070c3d1d58fb8737b387e8a3ef1f17fb1eb5d54

introduce a separate token type for partials

This means that the leading space can be removed as a partial word indicator.
---
Review notes (not part of the commit message):
 * lib-php/tokenizer/legacy_icu_tokenizer.php: Token\Word::__construct() is
   reduced to ($iId, $iSearchNameCount, $iTermCount) by this patch, but the
   ICU hunk still passed the old partial flag as second argument. PHP
   silently ignores surplus arguments, so $iSearchNameCount received a
   boolean, $iTermCount received the count and the real term count was
   dropped. The stale argument is removed here (new-side hunk length
   27 -> 26) so the call matches the one in legacy_tokenizer.php.
 * lib-php/TokenPartial.php: the beginning of the new file (everything up to
   "iId = $iId;") was missing from this copy of the patch. It is rebuilt
   from the declared hunk length (31 lines), the constructor call sites and
   the debugInfo() body. TODO: confirm the namespace and the comment wording
   against the repository.
 * Line breaks and indentation were restored by hand; if context lines do
   not match the tree exactly, apply with "git apply --ignore-whitespace".
   The post-image blob ids on the "index" lines of the two files above no
   longer correspond to the resulting content.

diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php
index 2df9fe05..f310306d 100644
--- a/lib-php/TokenList.php
+++ b/lib-php/TokenList.php
@@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/TokenHousenumber.php');
 require_once(CONST_LibDir.'/TokenPostcode.php');
 require_once(CONST_LibDir.'/TokenSpecialTerm.php');
 require_once(CONST_LibDir.'/TokenWord.php');
+require_once(CONST_LibDir.'/TokenPartial.php');
 require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 
 /**
diff --git a/lib-php/TokenPartial.php b/lib-php/TokenPartial.php
new file mode 100644
index 00000000..477ef9c5
--- /dev/null
+++ b/lib-php/TokenPartial.php
@@ -0,0 +1,31 @@
+<?php
+
+namespace Nominatim\Token;
+
+/**
+ * A partial word token.
+ */
+class Partial
+{
+    /// Database word id, if applicable.
+    public $iId;
+    /// Number of appearances in the database.
+    public $iSearchNameCount;
+
+    public function __construct($iId, $iSearchNameCount)
+    {
+        $this->iId = $iId;
+        $this->iSearchNameCount = $iSearchNameCount;
+    }
+
+    public function debugInfo()
+    {
+        return array(
+                'ID' => $this->iId,
+                'Type' => 'partial',
+                'Info' => array(
+                           'count' => $this->iSearchNameCount
+                          )
+               );
+    }
+}
diff --git a/lib-php/TokenWord.php b/lib-php/TokenWord.php
index fc28535d..6de58422 100644
--- a/lib-php/TokenWord.php
+++ b/lib-php/TokenWord.php
@@ -9,17 +9,14 @@ class Word
 {
     /// Database word id, if applicable.
     public $iId;
-    /// If true, the word may represent only part of a place name.
-    public $bPartial;
     /// Number of appearances in the database.
     public $iSearchNameCount;
     /// Number of terms in the word.
     public $iTermCount;
 
-    public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount)
+    public function __construct($iId, $iSearchNameCount, $iTermCount)
     {
         $this->iId = $iId;
-        $this->bPartial = $bPartial;
         $this->iSearchNameCount = $iSearchNameCount;
         $this->iTermCount = $iTermCount;
     }
@@ -30,8 +27,8 @@ class Word
                 'ID' => $this->iId,
                 'Type' => 'word',
                 'Info' => array(
-                           'partial' => $this->bPartial,
-                           'count' => $this->iSearchNameCount
+                           'count' => $this->iSearchNameCount,
+                           'terms' => $this->iTermCount
                           )
                );
     }
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php
index 92dd7272..8cff6f32 100644
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -195,17 +195,26 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
+            } elseif ($aWord['word_token'][0] == ' ') {
+                $oToken = new Token\Word(
+                    $iId,
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
             } else {
-                $oToken = new Token\Word(
+                $oToken = new Token\Partial(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
+                    (int) $aWord['count']
                 );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php
index 50207c31..ec2d7e68 100644
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -212,17 +212,26 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
-            } else {
+            } elseif ($aWord['word_token'][0] == ' ') {
                 $oToken = new Token\Word(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
                     (int) $aWord['count'],
                     substr_count($aWord['word_token'], ' ')
                 );
+            } else {
+                $oToken = new Token\Partial(
+                    $iId,
+                    (int) $aWord['count']
+                );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }