remove special status of partial tokens

author Sarah Hoffmann <lonvia@denofr.de>

Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)
diff --git a/lib-php/Geocode.php b/lib-php/Geocode.php

index ec21a0dcd79d498d6f3943ae8e77ce78977b87bf..eda6df5492c0e9c160e5c4ae7507b2fa1c96cd9b 100644 (file)
--- a/lib-php/Geocode.php
+++ b/lib-php/Geocode.php
@@ -355,15 +355,15 @@ class Geocode
                      $aNewWordsetSearches = array();
  
                      foreach ($aWordsetSearches as $oCurrentSearch) {
-                        // Tokens with full name matches.
-                        foreach ($oValidTokens->get(' '.$sToken) as $oSearchTerm) {
-                            $aNewSearches = $oCurrentSearch->extendWithFullTerm(
+                        foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
+                            $aNewSearches = $oCurrentSearch->extendWithSearchTerm(
+                                $sToken,
                                  $oSearchTerm,
                                  $sPhraseType,
                                  $iToken == 0 && $iPhrase == 0,
-                                $iPhrase == 0,
                                  $iToken + 1 == count($aWordset)
-                                  && $iPhrase + 1 == count($aPhrases)
+                                  && $iPhrase + 1 == count($aPhrases),
+                                $iPhrase
                              );
  
                              foreach ($aNewSearches as $oSearch) {
@@ -372,27 +372,6 @@ class Geocode
                                  }
                              }
                          }
-                        // Look for partial matches.
-                        // Note that there is no point in adding country terms here
-                        // because country is omitted in the address.
-                        if ($sPhraseType != 'country') {
-                            // Allow searching for a word - but at extra cost
-                            foreach ($oValidTokens->get($sToken) as $oSearchTerm) {
-                                $aNewSearches = $oCurrentSearch->extendWithPartialTerm(
-                                    $sToken,
-                                    $oSearchTerm,
-                                    (bool) $sPhraseType,
-                                    $iPhrase,
-                                    $oValidTokens->get(' '.$sToken)
-                                );
-
-                                foreach ($aNewSearches as $oSearch) {
-                                    if ($oSearch->getRank() < $this->iMaxRank) {
-                                        $aNewWordsetSearches[] = $oSearch;
-                                    }
-                                }
-                            }
-                        }
                      }
                      // Sort and cut
                      usort($aNewWordsetSearches, array('Nominatim\SearchDescription', 'bySearchRank'));
diff --git a/lib-php/SearchDescription.php b/lib-php/SearchDescription.php

index 6091fd617396b38cf55bd242fcf95d70de67c5c3..938beb61206d457cc577c3ade63bc072c2b8ed17 100644 (file)
--- a/lib-php/SearchDescription.php
+++ b/lib-php/SearchDescription.php
@@ -152,17 +152,17 @@ class SearchDescription
      /**
       * Derive new searches by adding a full term to the existing search.
       *
-     * @param object $oSearchTerm  Description of the token.
-     * @param string $sPhraseType  Type of phrase the token is contained in.
-     * @param bool   $bFirstToken  True if the token is at the beginning of the
-     *                             query.
-     * @param bool   $bFirstPhrase True if the token is in the first phrase of
-     *                             the query.
-     * @param bool   $bLastToken   True if the token is at the end of the query.
+     * @param string  $sToken       Term for the token.
+     * @param object  $oSearchTerm  Description of the token.
+     * @param string  $sPhraseType  Type of phrase the token is contained in.
+     * @param bool    $bFirstToken  True if the token is at the beginning of the
+     *                              query.
+     * @param bool    $bLastToken   True if the token is at the end of the query.
+     * @param integer $iPhrase      Number of the phrase the token is in.
       *
       * @return SearchDescription[] List of derived search descriptions.
       */
-    public function extendWithFullTerm($oSearchTerm, $sPhraseType, $bFirstToken, $bFirstPhrase, $bLastToken)
+    public function extendWithSearchTerm($sToken, $oSearchTerm, $sPhraseType, $bFirstToken, $bLastToken, $iPhrase)
      {
          $aNewSearches = array();
  
@@ -295,8 +295,8 @@ class SearchDescription
              // of the phrase. In structured search the name must forcably in
              // the first phrase. In unstructured search it may be in a later
              // phrase when the first phrase is a house number.
-            if (!empty($this->aName) || !($bFirstPhrase || $sPhraseType == '')) {
-                if (($sPhraseType == '' || !$bFirstPhrase) && $oSearchTerm->iTermCount > 1) {
+            if (!empty($this->aName) || !($iPhrase == 0 || $sPhraseType == '')) {
+                if (($sPhraseType == '' || $iPhrase > 0) && $oSearchTerm->iTermCount > 1) {
                      $oSearch = clone $this;
                      $oSearch->iNamePhrase = -1;
                      $oSearch->iSearchRank += 1;
@@ -314,6 +314,16 @@ class SearchDescription
                  }
                  $aNewSearches[] = $oSearch;
              }
+        } elseif ($sPhraseType != 'country'
+                  && is_a($oSearchTerm, '\Nominatim\Token\Partial')
+                  && strpos($sToken, ' ') === false
+        ) {
+            $aNewSearches = $this->extendWithPartialTerm(
+                $sToken,
+                $oSearchTerm,
+                (bool) $sPhraseType,
+                $iPhrase
+            );
          }
  
          return $aNewSearches;
@@ -326,20 +336,11 @@ class SearchDescription
       * @param object  $oSearchTerm        Description of the token.
       * @param bool    $bStructuredPhrases True if the search is structured.
       * @param integer $iPhrase            Number of the phrase the token is in.
-     * @param array[] $aFullTokens        List of full term tokens with the
-     *                                    same name.
       *
       * @return SearchDescription[] List of derived search descriptions.
       */
-    public function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase, $aFullTokens)
+    private function extendWithPartialTerm($sToken, $oSearchTerm, $bStructuredPhrases, $iPhrase)
      {
-        // Only allow name terms.
-        if (!(is_a($oSearchTerm, '\Nominatim\Token\Word'))
-            || strpos($sToken, ' ') !== false
-        ) {
-            return array();
-        }
-
          $aNewSearches = array();
          $iWordID = $oSearchTerm->iId;
  
@@ -355,9 +356,6 @@ class SearchDescription
                  $oSearch->aAddress[$iWordID] = $iWordID;
              } else {
                  $oSearch->aAddressNonSearch[$iWordID] = $iWordID;
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
              }
              $aNewSearches[] = $oSearch;
          }
@@ -385,9 +383,6 @@ class SearchDescription
                  }
                  $oSearch->aName[$iWordID] = $iWordID;
              } else {
-                if (!empty($aFullTokens)) {
-                    $oSearch->iSearchRank++;
-                }
                  $oSearch->aNameNonSearch[$iWordID] = $iWordID;
              }
              $oSearch->iNamePhrase = $iPhrase;
diff --git a/lib-php/TokenList.php b/lib-php/TokenList.php

index f310306d81e22963b45ff0bb1ac9dd322284eb3a..bc8f9c3f1b06d9f24038342d7c8215f1b2dc3dea 100644 (file)
--- a/lib-php/TokenList.php
+++ b/lib-php/TokenList.php
@@ -18,15 +18,6 @@ require_once(CONST_LibDir.'/SpecialSearchOperator.php');
   * tokens do not have a common base class. All tokens need to have a field
   * with the word id that points to an entry in the `word` database table
   * but otherwise the information saved about a token can be very different.
- *
- * There are two different kinds of token words: full words and partial terms.
- *
- * Full words start with a space. They represent a complete name of a place.
- * All special tokens are normally full words.
- *
- * Partial terms have no space at the beginning. They may represent a part of
- * a name of a place (e.g. in the name 'World Trade Center' a partial term
- * would be 'Trade' or 'Trade Center'). They are only used in TokenWord.
   */
  class TokenList
  {
@@ -65,7 +56,7 @@ class TokenList
       */
      public function containsAny($sWord)
      {
-        return isset($this->aTokens[$sWord]) || isset($this->aTokens[' '.$sWord]);
+        return isset($this->aTokens[$sWord]);
      }
  
      /**
@@ -87,7 +78,7 @@ class TokenList
  
          foreach ($this->aTokens as $aTokenList) {
              foreach ($aTokenList as $oToken) {
-                if (is_a($oToken, '\Nominatim\Token\Word') && !$oToken->bPartial) {
+                if (is_a($oToken, '\Nominatim\Token\Word')) {
                      $ids[$oToken->iId] = $oToken->iId;
                  }
              }
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php

index 8cff6f322410366d2e0ca2ceaf143d2b2035ce64..96a1d8a659fda9ca45d0c9753b2439e66ec00ab4 100644 (file)
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -120,14 +120,14 @@ class Tokenizer
  
              // Try more interpretations for Tokens that could not be matched.
              foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                          // US ZIP+4 codes - merge in the 5-digit ZIP code
                          $oValidTokens->addToken(
                              $sToken,
                              new Token\Postcode(null, $aData[1], 'us')
                          );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                          // Unknown single word token with a number.
                          // Assume it is a house number.
                          $oValidTokens->addToken(
diff --git a/lib-php/tokenizer/legacy_tokenizer.php b/lib-php/tokenizer/legacy_tokenizer.php

index ec2d7e68cbeb5ed4baa011db99ec13295f9cc1aa..238fbcf45e48a2120d4b0f2b566032999bc963be 100644 (file)
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -137,14 +137,14 @@ class Tokenizer
  
              // Try more interpretations for Tokens that could not be matched.
              foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                          // US ZIP+4 codes - merge in the 5-digit ZIP code
                          $oValidTokens->addToken(
                              $sToken,
                              new Token\Postcode(null, $aData[1], 'us')
                          );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                          // Unknown single word token with a number.
                          // Assume it is a house number.
                          $oValidTokens->addToken(
author	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 14 Jul 2021 20:17:17 +0000 (22:17 +0200)
lib-php/Geocode.php		patch | blob | history
lib-php/SearchDescription.php		patch | blob | history
lib-php/TokenList.php		patch | blob | history
lib-php/tokenizer/legacy_icu_tokenizer.php		patch | blob | history
lib-php/tokenizer/legacy_tokenizer.php		patch | blob | history