update unit tests for adapted abbreviation code

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 20932144412d42809c363c2c7f2722414ff4a51a..f3eb7b4ef4fd9fae8bfcf6f7ef538ded91cfa08e 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,16 +3,13 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
-import functools
  import io
  import itertools
-import json
  import logging
  import re
  from textwrap import dedent
  from pathlib import Path
  
-from icu import Transliterator
  import psycopg2.extras
  
  from nominatim.db.connection import connect
@@ -103,9 +100,7 @@ class LegacyICUTokenizer:
          """
          self.init_from_project()
  
-        if self.normalization is None\
-           or self.transliteration is None\
-           or self.abbreviations is None:
+        if self.naming_rules is None:
              return "Configuration for tokenizer 'legacy_icu' are missing."
  
          return None
@@ -320,40 +315,64 @@ class LegacyICUNameAnalyzer:
              for label, cls, typ, oper in cur:
                  existing_phrases.add((label, cls, typ, oper or '-'))
  
-            to_add = norm_phrases - existing_phrases
-            to_delete = existing_phrases - norm_phrases
-
-            if to_add:
-                copystr = io.StringIO()
-                for word, cls, typ, oper in to_add:
-                    term = self.name_processor.get_search_normalized(word)
-                    if term:
-                        copystr.write(word)
-                        copystr.write('\t ')
-                        copystr.write(term)
-                        copystr.write('\t')
-                        copystr.write(cls)
-                        copystr.write('\t')
-                        copystr.write(typ)
-                        copystr.write('\t')
-                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
-                        copystr.write('\t0\n')
+            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
+            if should_replace:
+                deleted = self._remove_special_phrases(cur, norm_phrases,
+                                                       existing_phrases)
+            else:
+                deleted = 0
  
-                copystr.seek(0)
-                cur.copy_from(copystr, 'word',
-                              columns=['word', 'word_token', 'class', 'type',
-                                       'operator', 'search_name_count'])
+        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
+                 len(norm_phrases), added, deleted)
  
-            if to_delete and should_replace:
-                psycopg2.extras.execute_values(
-                    cur,
-                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                        WHERE word = name and class = in_class and type = in_type
-                              and ((op = '-' and operator is null) or op = operator)""",
-                    to_delete)
  
-        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
-                 len(norm_phrases), len(to_add), len(to_delete))
+    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Add all phrases to the database that are not yet there.
+        """
+        to_add = new_phrases - existing_phrases
+
+        copystr = io.StringIO()
+        added = 0
+        for word, cls, typ, oper in to_add:
+            term = self.name_processor.get_search_normalized(word)
+            if term:
+                copystr.write(word)
+                copystr.write('\t ')
+                copystr.write(term)
+                copystr.write('\t')
+                copystr.write(cls)
+                copystr.write('\t')
+                copystr.write(typ)
+                copystr.write('\t')
+                copystr.write(oper if oper in ('in', 'near')  else '\\N')
+                copystr.write('\t0\n')
+                added += 1
+
+
+        if copystr.tell() > 0:
+            copystr.seek(0)
+            cursor.copy_from(copystr, 'word',
+                             columns=['word', 'word_token', 'class', 'type',
+                                      'operator', 'search_name_count'])
+
+        return added
+
+
+    def _remove_special_phrases(self, cursor, new_phrases, existing_phrases):
+        """ Remove all phrases from the database that are no longer in the
+            new phrase list.
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
  
  
      def add_country_names(self, country_code, names):
@@ -451,7 +470,8 @@ class LegacyICUNameAnalyzer:
          return full_tokens, partial_tokens
  
  
-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
          """ Return the set of all full name word ids to be used with the
              given dictionary of names.
          """
@@ -534,7 +554,7 @@ class _TokenInfo:
          self.data['hnr'] = ';'.join(hnrs)
  
  
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
          """ Add addr:street match terms.
          """
          if fulls: