git.openstreetmap.org Git - nominatim.git/commitdiff
introduce a separate token type for partials
author: Sarah Hoffmann <lonvia@denofr.de>
Tue, 13 Jul 2021 14:54:51 +0000 (16:54 +0200)
committer: Sarah Hoffmann <lonvia@denofr.de>
Tue, 13 Jul 2021 14:57:12 +0000 (16:57 +0200)
This means that the leading space can be removed as a partial
word indicator.

lib-php/TokenList.php
lib-php/TokenPartial.php [new file with mode: 0644]
lib-php/TokenWord.php
lib-php/tokenizer/legacy_icu_tokenizer.php
lib-php/tokenizer/legacy_tokenizer.php

index 2df9fe0586710f120c821b09f809f286cd616f44..f310306d81e22963b45ff0bb1ac9dd322284eb3a 100644 (file)
@@ -7,6 +7,7 @@ require_once(CONST_LibDir.'/TokenHousenumber.php');
 require_once(CONST_LibDir.'/TokenPostcode.php');
 require_once(CONST_LibDir.'/TokenSpecialTerm.php');
 require_once(CONST_LibDir.'/TokenWord.php');
 require_once(CONST_LibDir.'/TokenPostcode.php');
 require_once(CONST_LibDir.'/TokenSpecialTerm.php');
 require_once(CONST_LibDir.'/TokenWord.php');
+require_once(CONST_LibDir.'/TokenPartial.php');
 require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 
 /**
 require_once(CONST_LibDir.'/SpecialSearchOperator.php');
 
 /**
diff --git a/lib-php/TokenPartial.php b/lib-php/TokenPartial.php
new file mode 100644 (file)
index 0000000..477ef9c
--- /dev/null
@@ -0,0 +1,31 @@
+<?php
+
+namespace Nominatim\Token;
+
+/**
+ * A partial word token.
+ *
+ * Holds the database word id and the search-name count of a token that
+ * the tokenizers classify as a partial rather than as a full word.
+ * It stores no term count.
+ *
+ * The constructor stores both values unchanged; debugInfo() returns them
+ * as an array with the keys 'ID', 'Type' (always 'partial') and 'Info'
+ * (containing 'count').
+ */
+class Partial
+{
+    /// Database word id, if applicable.
+    public $iId;
+    /// Number of appearances in the database.
+    public $iSearchNameCount;
+
+    public function __construct($iId, $iSearchNameCount)
+    {
+        $this->iId = $iId;
+        $this->iSearchNameCount = $iSearchNameCount;
+    }
+
+    public function debugInfo()
+    {
+        return array(
+                'ID' => $this->iId,
+                'Type' => 'partial',
+                'Info' => array(
+                           'count' => $this->iSearchNameCount
+                          )
+               );
+    }
+}
index fc28535d4582e459f5d88c72b8977efaf1930fa9..6de584229b47f2e0b2af2cf0cceb96924c8be483 100644 (file)
@@ -9,17 +9,14 @@ class Word
 {
     /// Database word id, if applicable.
     public $iId;
 {
     /// Database word id, if applicable.
     public $iId;
-    /// If true, the word may represent only part of a place name.
-    public $bPartial;
     /// Number of appearances in the database.
     public $iSearchNameCount;
     /// Number of terms in the word.
     public $iTermCount;
 
     /// Number of appearances in the database.
     public $iSearchNameCount;
     /// Number of terms in the word.
     public $iTermCount;
 
-    public function __construct($iId, $bPartial, $iSearchNameCount, $iTermCount)
+    public function __construct($iId, $iSearchNameCount, $iTermCount)
     {
         $this->iId = $iId;
     {
         $this->iId = $iId;
-        $this->bPartial = $bPartial;
         $this->iSearchNameCount = $iSearchNameCount;
         $this->iTermCount = $iTermCount;
     }
         $this->iSearchNameCount = $iSearchNameCount;
         $this->iTermCount = $iTermCount;
     }
@@ -30,8 +27,8 @@ class Word
                 'ID' => $this->iId,
                 'Type' => 'word',
                 'Info' => array(
                 'ID' => $this->iId,
                 'Type' => 'word',
                 'Info' => array(
-                           'partial' => $this->bPartial,
-                           'count' => $this->iSearchNameCount
+                           'count' => $this->iSearchNameCount,
+                           'terms' => $this->iTermCount
                           )
                );
     }
                           )
                );
     }
index 92dd727283019ea3454b20ee7232f0234f583b0c..8cff6f322410366d2e0ca2ceaf143d2b2035ce64 100644 (file)
@@ -195,17 +195,27 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
+            } elseif ($aWord['word_token'][0] == ' ') {
+                // A leading space marks a full word. Token\Word no longer
+                // takes a partial flag, so pass only id, count and term count;
+                // a stale extra argument would shift the count into the
+                // term-count slot.
+                $oToken = new Token\Word(
+                    $iId,
+                    (int) $aWord['count'],
+                    substr_count($aWord['word_token'], ' ')
+                );
             } else {
             } else {
-                $oToken = new Token\Word(
+                $oToken = new Token\Partial(
                     $iId,
                     $iId,
-                    $aWord['word_token'][0] != ' ',
-                    (int) $aWord['count'],
-                    substr_count($aWord['word_token'], ' ')
+                    (int) $aWord['count']
                 );
             }
 
             if ($oToken) {
                 );
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }
             }
         }
     }
index 50207c31785d6c3579418fa42931f8d6a39be81c..ec2d7e68cbeb5ed4baa011db99ec13295f9cc1aa 100644 (file)
@@ -212,17 +212,26 @@ class Tokenizer
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
                 ) {
                     $oToken = new Token\Country($iId, $aWord['country_code']);
                 }
-            } else {
+            } elseif ($aWord['word_token'][0] == ' ') {
                 $oToken = new Token\Word(
                     $iId,
                 $oToken = new Token\Word(
                     $iId,
-                    $aWord['word_token'][0] != ' ',
                     (int) $aWord['count'],
                     substr_count($aWord['word_token'], ' ')
                 );
                     (int) $aWord['count'],
                     substr_count($aWord['word_token'], ' ')
                 );
+            } else {
+                $oToken = new Token\Partial(
+                    $iId,
+                    (int) $aWord['count']
+                );
             }
 
             if ($oToken) {
             }
 
             if ($oToken) {
-                $oValidTokens->addToken($aWord['word_token'], $oToken);
+                // remove any leading spaces
+                if ($aWord['word_token'][0] == ' ') {
+                    $oValidTokens->addToken(substr($aWord['word_token'], 1), $oToken);
+                } else {
+                    $oValidTokens->addToken($aWord['word_token'], $oToken);
+                }
             }
         }
     }
             }
         }
     }