switch special phrases to new word table format

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 20 Jul 2021 19:11:01 +0000 (21:11 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 20 Jul 2021 19:11:01 +0000 (21:11 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
diff --git a/lib-php/tokenizer/legacy_icu_tokenizer.php b/lib-php/tokenizer/legacy_icu_tokenizer.php

index 2461a1fd828698bb6786e16778b62c1b8b6077f2..70358976f15a0e7f6a67d485d8fe5eb6977bb1bb 100644 (file)
--- a/lib-php/tokenizer/legacy_icu_tokenizer.php
+++ b/lib-php/tokenizer/legacy_icu_tokenizer.php
@@ -147,7 +147,9 @@ class Tokenizer
      {
          // Check which tokens we have, get the ID numbers
          $sSQL = 'SELECT word_id, word_token, type';
-        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode";
+        $sSQL .= "      info->>'cc' as country, info->>'postcode' as postcode,";
+        $sSQL .= "      info->>'word' as word, info->>'op' as operator,";
+        $sSQL .= "      info->>'class' as class, info->>'type' as type";
          $sSQL .= ' FROM word WHERE word_token in (';
          $sSQL .= join(',', $this->oDB->getDBQuotedList($aTokens)).')';
  
@@ -180,7 +182,26 @@ class Tokenizer
                      ) {
                         continue;
                      }
+                    $sNormPostcode = $this->normalizeString($aWord['postcode']);
+                    if (strpos($sNormQuery, $sNormPostcode) === false) {
+                        continue;
+                    }
                      $oToken = new Token\Postcode($iId, $aWord['postcode'], null);
+                    break;
+                'S':  // tokens for classification terms (special phrases)
+                    if ($aWord['class'] === null || $aWord['type'] === null
+                        || $aWord['word'] === null
+                        || strpos($sNormQuery, $aWord['word']) === false
+                    ) {
+                        continue;
+                    }
+                    $oToken = new Token\SpecialTerm(
+                        $iId,
+                        $aWord['class'],
+                        $aWord['type'],
+                        $aWord['op'] ? Operator::NEAR : Operator::NONE
+                    );
+                    break;
                  default:
                      continue;
              }
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index e0fd3a023d5453fc82838af563c01d9d4f759181..a645b598df3d546c222f9c8f4e0e7c8275dc6be4 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -299,6 +299,9 @@ class LegacyICUNameAnalyzer:
  
      def update_special_phrases(self, phrases, should_replace):
          """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
          """
          norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                              for p in phrases))
@@ -306,11 +309,10 @@ class LegacyICUNameAnalyzer:
          with self.conn.cursor() as cur:
              # Get the old phrases.
              existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
+            cur.execute("SELECT info FROM word WHERE type = 'S'")
+            for (info, ) in cur:
+                existing_phrases.add((info['word'], info['class'], info['type'],
+                                      info.get('op') or '-'))
  
              added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
              if should_replace:
@@ -333,13 +335,13 @@ class LegacyICUNameAnalyzer:
              for word, cls, typ, oper in to_add:
                  term = self.name_processor.get_search_normalized(word)
                  if term:
-                    copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near') else None, 0)
+                    copystr.add(term, 'S',
+                                {'word': word, 'class': cls, 'type': typ,
+                                 'op': oper if oper in ('in', 'near') else None})
                      added += 1
  
              copystr.copy_out(cursor, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'operator', 'search_name_count'])
+                             columns=['word_token', 'type', 'info'])
  
          return added
  
@@ -354,9 +356,10 @@ class LegacyICUNameAnalyzer:
          if to_delete:
              cursor.execute_values(
                  """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE word = name and class = in_class and type = in_type
-                          and ((op = '-' and operator is null) or op = operator)""",
-                to_delete)
+                    WHERE info->>'word' = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
  
          return len(to_delete)
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 20 Jul 2021 19:11:01 +0000 (21:11 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Wed, 28 Jul 2021 09:31:47 +0000 (11:31 +0200)
lib-php/tokenizer/legacy_icu_tokenizer.php		patch \| blob \| history
nominatim/tokenizer/legacy_icu_tokenizer.py		patch \| blob \| history