From 2f6e4edcdb0f919f6a0481dd02664e4b2e507aaa Mon Sep 17 00:00:00 2001
From: Sarah Hoffmann
Date: Wed, 9 Jun 2021 15:07:36 +0200
Subject: [PATCH] update unit tests for adapted abbreviation code

---
 .pylintrc                                   |  2 +-
 nominatim/tokenizer/icu_name_processor.py   |  3 +
 nominatim/tokenizer/icu_rule_loader.py      |  8 +-
 nominatim/tokenizer/legacy_icu_tokenizer.py | 98 +++++++++++++--------
 test/python/test_tokenizer_legacy.py        |  6 +-
 test/python/test_tokenizer_legacy_icu.py    |  8 +-
 test/python/test_tools_database_import.py   |  2 +-
 7 files changed, 77 insertions(+), 50 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 28ce1ff4..e283292a 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,7 +1,7 @@
 [MASTER]
 
 extension-pkg-whitelist=osmium
-ignored-modules=icu
+ignored-modules=icu,datrie
 
 [MESSAGES CONTROL]
 
diff --git a/nominatim/tokenizer/icu_name_processor.py b/nominatim/tokenizer/icu_name_processor.py
index a0f22974..4d5975c3 100644
--- a/nominatim/tokenizer/icu_name_processor.py
+++ b/nominatim/tokenizer/icu_name_processor.py
@@ -58,6 +58,9 @@ class ICUNameProcessorRules:
 
 
 class ICUNameProcessor:
+    """ Collects the different transformation rules for normalisation of names
+        and provides the functions to apply the transformations.
+    """
 
     def __init__(self, rules):
         self.normalizer = Transliterator.createFromRules("icu_normalization",
diff --git a/nominatim/tokenizer/icu_rule_loader.py b/nominatim/tokenizer/icu_rule_loader.py
index 2597656b..a11b9bd8 100644
--- a/nominatim/tokenizer/icu_rule_loader.py
+++ b/nominatim/tokenizer/icu_rule_loader.py
@@ -2,11 +2,11 @@
 Helper class to create ICU rules from a configuration file.
 """
 import io
-import yaml
 import logging
 from collections import defaultdict
 import itertools
 
+import yaml
 from icu import Transliterator
 
 from nominatim.errors import UsageError
@@ -20,6 +20,8 @@ class ICURuleLoader:
 
     def __init__(self, configfile):
         self.configfile = configfile
+        self.compound_suffixes = set()
+        self.abbreviations = defaultdict()
 
         if configfile.suffix == '.yaml':
             self._load_from_yaml()
@@ -42,7 +44,7 @@
             suffixes.add(suffix)
             suffixes.update(self.abbreviations.get(suffix, []))
 
-        for suffix in sorted(suffixes, key=lambda x:len(x), reverse=True):
+        for suffix in sorted(suffixes, key=len, reverse=True):
             rules.write("'{0} ' > ' {0} ';".format(suffix))
 
         # Finally add transliteration.
@@ -85,7 +87,7 @@
                 synonyms[abbr + ' '].add(' ' + abbr + ' ')
 
         # sort the resulting list by descending length (longer matches are prefered).
-        sorted_keys = sorted(synonyms.keys(), key=lambda x: len(x), reverse=True)
+        sorted_keys = sorted(synonyms.keys(), key=len, reverse=True)
 
         return [(k, list(synonyms[k])) for k in sorted_keys]
 
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 20932144..f3eb7b4e 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,16 +3,13 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
""" from collections import Counter -import functools import io import itertools -import json import logging import re from textwrap import dedent from pathlib import Path -from icu import Transliterator import psycopg2.extras from nominatim.db.connection import connect @@ -103,9 +100,7 @@ class LegacyICUTokenizer: """ self.init_from_project() - if self.normalization is None\ - or self.transliteration is None\ - or self.abbreviations is None: + if self.naming_rules is None: return "Configuration for tokenizer 'legacy_icu' are missing." return None @@ -320,40 +315,64 @@ class LegacyICUNameAnalyzer: for label, cls, typ, oper in cur: existing_phrases.add((label, cls, typ, oper or '-')) - to_add = norm_phrases - existing_phrases - to_delete = existing_phrases - norm_phrases - - if to_add: - copystr = io.StringIO() - for word, cls, typ, oper in to_add: - term = self.name_processor.get_search_normalized(word) - if term: - copystr.write(word) - copystr.write('\t ') - copystr.write(term) - copystr.write('\t') - copystr.write(cls) - copystr.write('\t') - copystr.write(typ) - copystr.write('\t') - copystr.write(oper if oper in ('in', 'near') else '\\N') - copystr.write('\t0\n') + added = self._add_special_phrases(cur, norm_phrases, existing_phrases) + if should_replace: + deleted = self._remove_special_phrases(cur, norm_phrases, + existing_phrases) + else: + deleted = 0 - copystr.seek(0) - cur.copy_from(copystr, 'word', - columns=['word', 'word_token', 'class', 'type', - 'operator', 'search_name_count']) + LOG.info("Total phrases: %s. Added: %s. Deleted: %s", + len(norm_phrases), added, deleted) - if to_delete and should_replace: - psycopg2.extras.execute_values( - cur, - """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) - WHERE word = name and class = in_class and type = in_type - and ((op = '-' and operator is null) or op = operator)""", - to_delete) - LOG.info("Total phrases: %s. Added: %s. Deleted: %s", - len(norm_phrases), len(to_add), len(to_delete)) + def _add_special_phrases(self, cursor, new_phrases, existing_phrases): + """ Add all phrases to the database that are not yet there. + """ + to_add = new_phrases - existing_phrases + + copystr = io.StringIO() + added = 0 + for word, cls, typ, oper in to_add: + term = self.name_processor.get_search_normalized(word) + if term: + copystr.write(word) + copystr.write('\t ') + copystr.write(term) + copystr.write('\t') + copystr.write(cls) + copystr.write('\t') + copystr.write(typ) + copystr.write('\t') + copystr.write(oper if oper in ('in', 'near') else '\\N') + copystr.write('\t0\n') + added += 1 + + + if copystr.tell() > 0: + copystr.seek(0) + cursor.copy_from(copystr, 'word', + columns=['word', 'word_token', 'class', 'type', + 'operator', 'search_name_count']) + + return added + + + def _remove_special_phrases(self, cursor, new_phrases, existing_phrases): + """ Remove all phrases from the databse that are no longer in the + new phrase list. 
+        """
+        to_delete = existing_phrases - new_phrases
+
+        if to_delete:
+            psycopg2.extras.execute_values(
+                cursor,
+                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
+                    WHERE word = name and class = in_class and type = in_type
+                          and ((op = '-' and operator is null) or op = operator)""",
+                to_delete)
+
+        return len(to_delete)
 
 
     def add_country_names(self, country_code, names):
@@ -451,7 +470,8 @@
 
         return full_tokens, partial_tokens
 
-    def _compute_full_names(self, names):
+    @staticmethod
+    def _compute_full_names(names):
         """ Return the set of all full name word ids to be used with the
             given dictionary of names.
         """
@@ -534,7 +554,7 @@
 
         self.data['hnr'] = ';'.join(hnrs)
 
-    def add_street(self, fulls, partials):
+    def add_street(self, fulls, _):
         """ Add addr:street match terms.
         """
         if fulls:
diff --git a/test/python/test_tokenizer_legacy.py b/test/python/test_tokenizer_legacy.py
index 35e24ca4..4dd3a141 100644
--- a/test/python/test_tokenizer_legacy.py
+++ b/test/python/test_tokenizer_legacy.py
@@ -260,7 +260,9 @@ def test_update_special_phrase_modify(analyzer, word_table, make_standard_name):
 
 
 def test_add_country_names(analyzer, word_table, make_standard_name):
-    analyzer.add_country_names('de', ['Germany', 'Deutschland', 'germany'])
+    analyzer.add_country_names('de', {'name': 'Germany',
+                                      'name:de': 'Deutschland',
+                                      'short_name': 'germany'})
 
     assert word_table.get_country() \
             == {('de', ' #germany#'),
@@ -272,7 +274,7 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
     word_table.add_country('it', ' #italy#')
     word_table.add_country('it', ' #itala#')
 
-    analyzer.add_country_names('it', ['Italy', 'IT'])
+    analyzer.add_country_names('it', {'name': 'Italy', 'ref': 'IT'})
 
     assert word_table.get_country() \
             == {('fr', ' #france#'),
diff --git a/test/python/test_tokenizer_legacy_icu.py b/test/python/test_tokenizer_legacy_icu.py
index f7558dac..0f9230ac 100644
--- a/test/python/test_tokenizer_legacy_icu.py
+++ b/test/python/test_tokenizer_legacy_icu.py
@@ -212,14 +212,14 @@ def test_update_postcodes_from_db_add_and_remove(analyzer, table_factory, word_t
 
 
 def test_update_special_phrase_empty_table(analyzer, word_table):
     with analyzer() as anl:
         anl.update_special_phrases([
-            ("König bei", "amenity", "royal", "near"),
-            ("Könige", "amenity", "royal", "-"),
+            ("König bei", "amenity", "royal", "near"),
+            ("Könige ", "amenity", "royal", "-"),
             ("street", "highway", "primary", "in")
         ], True)
 
     assert word_table.get_special() \
-            == {(' KÖNIG BEI', 'könig bei', 'amenity', 'royal', 'near'),
-                (' KÖNIGE', 'könige', 'amenity', 'royal', None),
+            == {(' KÖNIG BEI', 'König bei', 'amenity', 'royal', 'near'),
+                (' KÖNIGE', 'Könige', 'amenity', 'royal', None),
                 (' STREET', 'street', 'highway', 'primary', 'in')}
diff --git a/test/python/test_tools_database_import.py b/test/python/test_tools_database_import.py
index 621610cf..2291c166 100644
--- a/test/python/test_tools_database_import.py
+++ b/test/python/test_tools_database_import.py
@@ -180,7 +180,7 @@ def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cur
 
     assert len(tokenizer.analyser_cache['countries']) == 2
 
-    result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+    result_set = {k: set(v.values()) for k, v in tokenizer.analyser_cache['countries']}
 
     if languages:
         assert result_set == {'us' : set(('us', 'us1', 'United States')),
-- 
2.45.1
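Note (illustration only, not part of the patch): the reworked update_special_phrases()
now just computes the set difference between the incoming and the stored phrases and
hands the two halves to the new helpers, with should_replace gating the deletion. A
toy sketch of that split, using invented phrase tuples and no database connection:

    # Phrases are (word, class, type, operator) tuples, as in the word table.
    existing = {('foo', 'amenity', 'prison', '-')}
    incoming = {('foo', 'amenity', 'prison', '-'),
                ('bar', 'highway', 'primary', 'in')}

    to_add = incoming - existing       # written by _add_special_phrases()
    to_delete = existing - incoming    # removed by _remove_special_phrases(),
                                       # but only when should_replace is True

    assert to_add == {('bar', 'highway', 'primary', 'in')}
    assert not to_delete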
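Similarly, the icu_rule_loader.py hunks only replace the lambda with the plain len
builtin; the generated transliteration rules are unchanged. A standalone sketch of
the suffix ordering, with made-up suffix values:

    suffixes = {'strasse', 'str', 'weg'}
    rules = ["'{0} ' > ' {0} ';".format(s)
             for s in sorted(suffixes, key=len, reverse=True)]
    # Longest suffixes come first, so 'strasse' is rewritten before 'str'.
    assert rules[0] == "'strasse ' > ' strasse ';"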