git.openstreetmap.org Git - nominatim.git/commitdiff
enable BDD tests for different tokenizers
author Sarah Hoffmann <lonvia@denofr.de>
Wed, 5 May 2021 08:00:34 +0000 (10:00 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Wed, 5 May 2021 08:31:51 +0000 (10:31 +0200)
The tokenizer to be used can be chosen with -DTOKENIZER.

Adapt all tests so that they work with the legacy_icu tokenizer.
Move the lookup in the word table into a function of the tokenizer.
Special phrases are temporarily imported from the wiki until
we have an implementation that can import them from a file. TIGER
tests do not work yet.
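
The tokenizer for a test run can be picked on the behave command line,
for example "behave -DTOKENIZER=legacy_icu test/bdd/db". Below is a
minimal sketch of the new lookup path used by the BDD steps; the import
path and the helper name are illustrative only, while the analyzer calls
are the ones added by this commit:

    from nominatim.tokenizer import factory as tokenizer_factory

    def lookup_tokens(context, words):
        # Resolve word tokens through the active tokenizer instead of
        # calling make_standard_name() directly in SQL, so the same
        # step code works with the legacy and legacy_icu tokenizers.
        tokenizer = tokenizer_factory.get_tokenizer_for_db(
            context.nominatim.get_test_config())
        with tokenizer.name_analyzer() as analyzer:
            # Returns tuples of (original word, word token, word id);
            # words prefixed with '#' are looked up as full names.
            return analyzer.get_word_token_info(context.db, words)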

nominatim/tokenizer/legacy_icu_tokenizer.py
nominatim/tokenizer/legacy_tokenizer.py
test/bdd/api/search/queries.feature
test/bdd/db/import/naming.feature
test/bdd/environment.py
test/bdd/steps/nominatim_environment.py
test/bdd/steps/steps_db_ops.py

index 6fba6c8d73833bbdd87d796a21d1b3cf711aa2d5..696004f5136d74a51a6b79985b2baf561f4a6371 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -35,7 +35,7 @@ def create(dsn, data_dir):
 class LegacyICUTokenizer:
     """ This tokenizer uses libICU to covert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
     """
 
     def __init__(self, dsn, data_dir):
@@ -228,6 +228,35 @@ class LegacyICUNameAnalyzer:
             self.conn = None
 
 
+    def get_word_token_info(self, conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name,
+            otherwise it is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and is not necessarily efficient.
+        """
+        tokens = {}
+        for word in words:
+            if word.startswith('#'):
+                tokens[word] = ' ' + self.make_standard_word(word[1:])
+            else:
+                tokens[word] = self.make_standard_word(word)
+
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = t.term
+                                 and class is null and country_code is null""",
+                        (list(tokens.values()), ))
+            ids = {r[0]: r[1] for r in cur}
+
+        return [(k, v, ids[v]) for k, v in tokens.items()]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
@@ -236,7 +265,7 @@ class LegacyICUNameAnalyzer:
 
     @functools.lru_cache(maxsize=1024)
     def make_standard_word(self, name):
-        """ Create the normalised version of the name.
+        """ Create the normalised version of the input.
         """
         norm = ' ' + self.transliterator.transliterate(name) + ' '
         for full, abbr in self.abbreviations:
@@ -333,24 +362,25 @@ class LegacyICUNameAnalyzer:
         """
         full_names = set((self.make_standard_word(n) for n in names))
         full_names.discard('')
-        self._add_normalised_country_names(country_code, full_names)
+        self._add_normalized_country_names(country_code, full_names)
 
 
-    def _add_normalised_country_names(self, country_code, names):
+    def _add_normalized_country_names(self, country_code, names):
         """ Add names for the given country to the search index.
         """
+        word_tokens = set((' ' + name for name in names))
         with self.conn.cursor() as cur:
             # Get existing names
             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                         (country_code, ))
-            new_names = names.difference((t[0] for t in cur))
+            word_tokens.difference_update((t[0] for t in cur))
 
-            if new_names:
+            if word_tokens:
                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                  search_name_count)
                                (SELECT nextval('seq_word'), token, '{}', 0
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(new_names),))
+                            """.format(country_code), (list(word_tokens),))
 
 
     def process_place(self, place):
@@ -371,7 +401,7 @@ class LegacyICUNameAnalyzer:
 
             country_feature = place.get('country_feature')
             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalised_country_names(country_feature.lower(),
+                self._add_normalized_country_names(country_feature.lower(),
                                                    full_names)
 
         address = place.get('address')
index 2f060b84aa98761b653f15a41f68566dc31cb2d8..438a5aff9ed3861995606c5d8409ff0c7ac13c35 100644 (file)
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -271,6 +271,33 @@ class LegacyNameAnalyzer:
             self.conn = None
 
 
+    @staticmethod
+    def get_word_token_info(conn, words):
+        """ Return token information for the given list of words.
+            If a word starts with # it is assumed to be a full name,
+            otherwise it is a partial name.
+
+            The function returns a list of tuples with
+            (original word, word token, word id).
+
+            The function is used for testing and debugging only
+            and is not necessarily efficient.
+        """
+        with conn.cursor() as cur:
+            cur.execute("""SELECT t.term, word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = (CASE
+                                   WHEN left(t.term, 1) = '#' THEN
+                                     ' ' || make_standard_name(substring(t.term from 2))
+                                   ELSE
+                                     make_standard_name(t.term)
+                                   END)
+                                 and class is null and country_code is null""",
+                        (words, ))
+
+            return [(r[0], r[1], r[2]) for r in cur]
+
+
     def normalize(self, phrase):
         """ Normalize the given phrase, i.e. remove all properties that
             are irrelevant for search.
index ea353f4568ad46f83f8d62d7511ef400e7234e67..6d697ef96fe7ea3a3cded93f086d8e4f96ab1174 100644 (file)
--- a/test/bdd/api/search/queries.feature
+++ b/test/bdd/api/search/queries.feature
@@ -163,7 +163,7 @@ Feature: Search queries
         Then exactly 0 results are returned
 
     Scenario: Ignore country searches when query is restricted to countries
-        When sending json search query "de"
+        When sending json search query "fr"
             | countrycodes |
             | li  |
         Then exactly 0 results are returned
index bb29d2a314a46f7f917e298c4cf37fa311eb340c..9e11eacd827902e22dfe7a90ebba45c34eb3c07a 100644 (file)
--- a/test/bdd/db/import/naming.feature
+++ b/test/bdd/db/import/naming.feature
@@ -55,6 +55,4 @@ Feature: Import and search of names
         | Вологда |
         | Αθήνα |
         | القاهرة |
-        | រាជធានីភ្នំពេញ |
         | 東京都 |
-        | ပုဗ္ဗသီရိ |
index 30ea30a261c873cd10fecf334ce1dca9a5d90cf6..f179c8f13da343283b0aaf4deb855587a471cd6f 100644 (file)
--- a/test/bdd/environment.py
+++ b/test/bdd/environment.py
@@ -20,6 +20,7 @@ userconfig = {
     'API_TEST_DB' : 'test_api_nominatim',
     'API_TEST_FILE'  : (TEST_BASE_DIR / 'testdb' / 'apidb-test-data.pbf').resolve(),
     'SERVER_MODULE_PATH' : None,
+    'TOKENIZER' : None, # Test with a custom tokenizer
     'PHPCOV' : False, # set to output directory to enable code coverage
 }
 
index f6e3a03945d0a8b834fbf8500874c74693648b7c..de02e3460b420a8991c49be208b5171b339463c1 100644 (file)
--- a/test/bdd/steps/nominatim_environment.py
+++ b/test/bdd/steps/nominatim_environment.py
@@ -28,6 +28,7 @@ class NominatimEnvironment:
         self.test_db = config['TEST_DB']
         self.api_test_db = config['API_TEST_DB']
         self.api_test_file = config['API_TEST_FILE']
+        self.tokenizer = config['TOKENIZER']
         self.server_module_path = config['SERVER_MODULE_PATH']
         self.reuse_template = not config['REMOVE_TEMPLATE']
         self.keep_scenario_db = config['KEEP_TEST_DB']
@@ -96,6 +97,8 @@ class NominatimEnvironment:
         self.test_env['NOMINATIM_DATABASE_MODULE_SRC_PATH'] = str((self.build_dir / 'module').resolve())
         self.test_env['NOMINATIM_OSM2PGSQL_BINARY'] = str((self.build_dir / 'osm2pgsql' / 'osm2pgsql').resolve())
         self.test_env['NOMINATIM_NOMINATIM_TOOL'] = str((self.build_dir / 'nominatim').resolve())
+        if self.tokenizer is not None:
+            self.test_env['NOMINATIM_TOKENIZER'] = self.tokenizer
 
         if self.server_module_path:
             self.test_env['NOMINATIM_DATABASE_MODULE_PATH'] = self.server_module_path
@@ -189,11 +192,19 @@ class NominatimEnvironment:
 
                 try:
                     self.run_nominatim('import', '--osm-file', str(self.api_test_file))
-                    self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
+                    if self.tokenizer != 'legacy_icu':
+                        self.run_nominatim('add-data', '--tiger-data', str((testdata / 'tiger').resolve()))
                     self.run_nominatim('freeze')
 
-                    phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
-                    run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                    if self.tokenizer != 'legacy_icu':
+                        phrase_file = str((testdata / 'specialphrases_testdb.sql').resolve())
+                        run_script(['psql', '-d', self.api_test_db, '-f', phrase_file])
+                    else:
+                        # XXX Temporarily use the wiki while there is no CSV import
+                        # available.
+                        self.test_env['NOMINATIM_LANGUAGES'] = 'en'
+                        self.run_nominatim('special-phrases', '--import-from-wiki')
+                        del self.test_env['NOMINATIM_LANGUAGES']
                 except:
                     self.db_drop_database(self.api_test_db)
                     raise
index 52a50a511c8a095c0c9ce5bd8bd9cb9f7e0783fb..6d7bc188905c597211110c13b92952e336bab088 100644 (file)
--- a/test/bdd/steps/steps_db_ops.py
+++ b/test/bdd/steps/steps_db_ops.py
@@ -199,44 +199,35 @@ def check_search_name_contents(context, exclude):
         have an identifier of the form '<NRW><osm id>[:<class>]'. All
         expected rows must be present with at least one database row.
     """
-    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
-        for row in context.table:
-            nid = NominatimID(row['object'])
-            nid.row_by_place_id(cur, 'search_name',
-                                ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
-            assert cur.rowcount > 0, "No rows found for " + row['object']
+    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())
+
+    with tokenizer.name_analyzer() as analyzer:
+        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            for row in context.table:
+                nid = NominatimID(row['object'])
+                nid.row_by_place_id(cur, 'search_name',
+                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
+                assert cur.rowcount > 0, "No rows found for " + row['object']
+
+                for res in cur:
+                    db_row = DBRow(nid, res, context)
+                    for name, value in zip(row.headings, row.cells):
+                        if name in ('name_vector', 'nameaddress_vector'):
+                            items = [x.strip() for x in value.split(',')]
+                            tokens = analyzer.get_word_token_info(context.db, items)
 
-            for res in cur:
-                db_row = DBRow(nid, res, context)
-                for name, value in zip(row.headings, row.cells):
-                    if name in ('name_vector', 'nameaddress_vector'):
-                        items = [x.strip() for x in value.split(',')]
-                        with context.db.cursor() as subcur:
-                            subcur.execute(""" SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                              UNION
-                                               SELECT word_id, word_token
-                                               FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                                               WHERE word_token = ' ' || make_standard_name(t.term)
-                                                     and class is null and country_code is null
-                                                     and operator is null
-                                           """,
-                                           (list(filter(lambda x: not x.startswith('#'), items)),
-                                            list(filter(lambda x: x.startswith('#'), items))))
                             if not exclude:
-                                assert subcur.rowcount >= len(items), \
-                                    "No word entry found for {}. Entries found: {!s}".format(value, subcur.rowcount)
-                            for wid in subcur:
-                                present = wid[0] in res[name]
+                                assert len(tokens) >= len(items), \
+                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
+                            for word, token, wid in tokens:
                                 if exclude:
-                                    assert not present, "Found term for {}/{}: {}".format(row['object'], name, wid[1])
+                                    assert wid not in res[name], \
+                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                 else:
-                                    assert present, "Missing term for {}/{}: {}".format(row['object'], name, wid[1])
-                    elif name != 'object':
-                        assert db_row.contains(name, value), db_row.assert_msg(name, value)
+                                    assert wid in res[name], \
+                                           "Missing term for {}/{}: {}".format(nid, name, wid)
+                        elif name != 'object':
+                            assert db_row.contains(name, value), db_row.assert_msg(name, value)
 
 @then("search_name has no entry for (?P<oid>.*)")
 def check_search_name_has_entry(context, oid):