]> git.openstreetmap.org Git - nominatim.git/commitdiff
move default country name creation to tokenizer
authorSarah Hoffmann <lonvia@denofr.de>
Tue, 27 Apr 2021 09:37:18 +0000 (11:37 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Fri, 30 Apr 2021 09:30:51 +0000 (11:30 +0200)
The new function is also used when a country is updated. All SQL
functions related to country names have been removed.

lib-sql/tokenizer/legacy_tokenizer.sql
nominatim/clicmd/setup.py
nominatim/tokenizer/legacy_tokenizer.py
nominatim/tools/database_import.py
test/python/conftest.py
test/python/dummy_tokenizer.py
test/python/test_tools_database_import.py
test/python/test_tools_refresh_create_functions.py

index 5e80b414565e13a6e88df8cdd922adaeb757370a..2bbf3ed73b4bd94c06885e8f5f5ed6e2f0daca7c 100644 (file)
@@ -202,29 +202,6 @@ $$
 LANGUAGE plpgsql;
 
 
-CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                                               lookup_country_code varchar(2))
-  RETURNS INTEGER
-  AS $$
-DECLARE
-  lookup_token TEXT;
-  return_word_id INTEGER;
-BEGIN
-  lookup_token := ' '||trim(lookup_word);
-  SELECT min(word_id) FROM word
-    WHERE word_token = lookup_token and country_code=lookup_country_code
-    INTO return_word_id;
-  IF return_word_id IS NULL THEN
-    return_word_id := nextval('seq_word');
-    INSERT INTO word VALUES (return_word_id, lookup_token, null,
-                             null, null, lookup_country_code, 0);
-  END IF;
-  RETURN return_word_id;
-END;
-$$
-LANGUAGE plpgsql;
-
-
 CREATE OR REPLACE FUNCTION getorcreate_amenity(lookup_word TEXT, normalized_word TEXT,
                                                lookup_class text, lookup_type text)
   RETURNS INTEGER
@@ -363,36 +340,6 @@ $$
 LANGUAGE plpgsql STABLE STRICT;
 
 
-CREATE OR REPLACE FUNCTION create_country(src HSTORE, country_code varchar(2))
-  RETURNS VOID
-  AS $$
-DECLARE
-  s TEXT;
-  w INTEGER;
-  words TEXT[];
-  item RECORD;
-  j INTEGER;
-BEGIN
-  FOR item IN SELECT (each(src)).* LOOP
-
-    s := make_standard_name(item.value);
-    w := getorcreate_country(s, country_code);
-
-    words := regexp_split_to_array(item.value, E'[,;()]');
-    IF array_upper(words, 1) != 1 THEN
-      FOR j IN 1..array_upper(words, 1) LOOP
-        s := make_standard_name(words[j]);
-        IF s != '' THEN
-          w := getorcreate_country(s, country_code);
-        END IF;
-      END LOOP;
-    END IF;
-  END LOOP;
-END;
-$$
-LANGUAGE plpgsql;
-
-
 CREATE OR REPLACE FUNCTION make_keywords(src HSTORE)
   RETURNS INTEGER[]
   AS $$
index 1ce9cf3e0039642641c0c8ead82f52ca9b2b7d2c..0f19d0975d460b6a243151e9c902ec460ed7afbd 100644 (file)
@@ -133,7 +133,8 @@ class SetupAll:
             database_import.create_search_indices(conn, args.config,
                                                   drop=args.no_updates)
             LOG.warning('Create search index for default country names.')
-            database_import.create_country_names(conn, args.config)
+            database_import.create_country_names(conn, tokenizer,
+                                                 args.config.LANGUAGES)
 
         webdir = args.project_dir / 'website'
         LOG.warning('Setup website at %s', webdir)
index 7d27306469b0fb077cd7f599b2ac7884682668d5..d4068aea83410f1b78665badee27247eabaa81c1 100644 (file)
@@ -223,6 +223,21 @@ class LegacyNameAnalyzer:
                            FROM (SELECT distinct(postcode) as pc
                                  FROM location_postcode) x""")
 
+
+    def add_country_names(self, country_code, names):
+        """ Add names for the given country to the search index.
+        """
+        with self.conn.cursor() as cur:
+            cur.execute(
+                """INSERT INTO word (word_id, word_token, country_code)
+                   (SELECT nextval('seq_word'), lookup_token, %s
+                      FROM (SELECT ' ' || make_standard_name(n) as lookup_token
+                            FROM unnest(%s)n) y
+                      WHERE NOT EXISTS(SELECT * FROM word
+                                       WHERE word_token = lookup_token and country_code = %s))
+                """, (country_code, names, country_code))
+
+
     def process_place(self, place):
         """ Determine tokenizer information about the given place.
 
@@ -231,7 +246,14 @@ class LegacyNameAnalyzer:
         """
         token_info = _TokenInfo(self._cache)
 
-        token_info.add_names(self.conn, place.get('name'), place.get('country_feature'))
+        names = place.get('name')
+
+        if names:
+            token_info.add_names(self.conn, names)
+
+            country_feature = place.get('country_feature')
+            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
+                self.add_country_names(country_feature.lower(), list(names.values()))
 
         address = place.get('address')
 
@@ -279,22 +301,14 @@ class _TokenInfo:
         self.data = {}
 
 
-    def add_names(self, conn, names, country_feature):
+    def add_names(self, conn, names):
         """ Add token information for the names of the place.
         """
-        if not names:
-            return
-
         with conn.cursor() as cur:
             # Create the token IDs for all names.
             self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
                                             (names, ))
 
-            # Add country tokens to word table if necessary.
-            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                cur.execute("SELECT create_country(%s, %s)",
-                            (names, country_feature.lower()))
-
 
     def add_housenumbers(self, conn, hnrs):
         """ Extract housenumber information from the address.
@@ -334,7 +348,8 @@ class _TokenInfo:
         """
         def _get_place(name):
             with conn.cursor() as cur:
-                cur.execute("""SELECT (addr_ids_from_name(%s) || getorcreate_name_id(make_standard_name(%s), ''))::text,
+                cur.execute("""SELECT (addr_ids_from_name(%s)
+                                       || getorcreate_name_id(make_standard_name(%s), ''))::text,
                                       word_ids_from_name(%s)::text""",
                             (name, name, name))
                 return cur.fetchone()
index 400ce7a5bbd0f3a49d366b22a8ed1c32d445b544..664d3c6b39ed2fafb57e45960c052ccd76401505 100644 (file)
@@ -8,6 +8,7 @@ import subprocess
 from pathlib import Path
 
 import psutil
+import psycopg2.extras
 
 from nominatim.db.connection import connect, get_pg_env
 from nominatim.db import utils as db_utils
@@ -250,34 +251,37 @@ def create_search_indices(conn, config, drop=False):
 
     sql.run_sql_file(conn, 'indices.sql', drop=drop)
 
-def create_country_names(conn, config):
-    """ Create search index for default country names.
+def create_country_names(conn, tokenizer, languages=None):
+    """ Add default country names to search index. `languages` is a comma-
+        separated list of language codes as used in OSM. If `languages` is not
+        empty then only name translations for the given languages are added
+        to the index.
     """
+    if languages:
+        languages = languages.split(',')
+
+    def _include_key(key):
+        return key == 'name' or \
+               (key.startswith('name:') \
+                and (not languages or key[5:] in languages))
 
     with conn.cursor() as cur:
-        cur.execute("""SELECT getorcreate_country(make_standard_name('uk'), 'gb')""")
-        cur.execute("""SELECT getorcreate_country(make_standard_name('united states'), 'us')""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(country_code),
-                       country_code) FROM country_name WHERE country_code is not null) AS x""")
-        cur.execute("""SELECT COUNT(*) FROM
-                       (SELECT getorcreate_country(make_standard_name(name->'name'), country_code) 
-                       FROM country_name WHERE name ? 'name') AS x""")
-        sql_statement = """SELECT COUNT(*) FROM (SELECT getorcreate_country(make_standard_name(v),
-                           country_code) FROM (SELECT country_code, skeys(name)
-                           AS k, svals(name) AS v FROM country_name) x WHERE k"""
-
-        languages = config.LANGUAGES
-
-        if languages:
-            sql_statement = "{} IN (".format(sql_statement)
-            delim = ''
-            for language in languages.split(','):
-                sql_statement = "{}{}'name:{}'".format(sql_statement, delim, language)
-                delim = ', '
-            sql_statement = '{})'.format(sql_statement)
-        else:
-            sql_statement = "{} LIKE 'name:%'".format(sql_statement)
-        sql_statement = "{}) v".format(sql_statement)
-        cur.execute(sql_statement)
+        psycopg2.extras.register_hstore(cur)
+        cur.execute("""SELECT country_code, name FROM country_name
+                       WHERE country_code is not null""")
+
+        with tokenizer.name_analyzer() as analyzer:
+            for code, name in cur:
+                names = [code]
+                if code == 'gb':
+                    names.append('UK')
+                if code == 'us':
+                    names.append('United States')
+
+                # country names (only in languages as provided)
+                if name:
+                    names.extend((v for k, v in name.items() if _include_key(k)))
+
+                analyzer.add_country_names(code, names)
+
     conn.commit()
index f43f09d074c7ab56ce3837872ff2539ba3f18f86..493620c45ece19c5abf3a7477e41886f7a36c192 100644 (file)
@@ -121,9 +121,8 @@ def table_factory(temp_db_cursor):
     def mk_table(name, definition='id INT', content=None):
         temp_db_cursor.execute('CREATE TABLE {} ({})'.format(name, definition))
         if content is not None:
-            if not isinstance(content, str):
-                content = '),('.join([str(x) for x in content])
-            temp_db_cursor.execute("INSERT INTO {} VALUES ({})".format(name, content))
+            psycopg2.extras.execute_values(
+                temp_db_cursor, "INSERT INTO {} VALUES %s".format(name), content)
 
     return mk_table
 
@@ -290,7 +289,7 @@ def osm2pgsql_options(temp_db):
 
 @pytest.fixture
 def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+    table_factory('country_name', 'partition INT', ((0, ), (1, ), (2, )))
     cfg = Configuration(None, SRC_DIR.resolve() / 'settings')
     cfg.set_libdirs(module='.', osm2pgsql='.', php=SRC_DIR / 'lib-php',
                     sql=tmp_path, data=SRC_DIR / 'data')
@@ -299,9 +298,10 @@ def sql_preprocessor(temp_db_conn, tmp_path, monkeypatch, table_factory):
 
 
 @pytest.fixture
-def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
+def tokenizer_mock(monkeypatch, property_table, temp_db_conn, tmp_path):
     """ Sets up the configuration so that the test dummy tokenizer will be
-        loaded.
+        loaded when the tokenizer factory is used. Also returns a factory
+        with which a new dummy tokenizer may be created.
     """
     monkeypatch.setenv('NOMINATIM_TOKENIZER', 'dummy')
 
@@ -310,3 +310,8 @@ def tokenizer_mock(monkeypatch, property_table, temp_db_conn, dsn):
 
     monkeypatch.setattr(importlib, "import_module", _import_dummy)
     properties.set_property(temp_db_conn, 'tokenizer', 'dummy')
+
+    def _create_tokenizer():
+        return dummy_tokenizer.DummyTokenizer(None, None)
+
+    return _create_tokenizer
index ceea4a7ededdaa1e829e6ebf902ef42455e33e47..79197dfb5b8d0fe83cb0a07fdadbbc2c7cab6b51 100644 (file)
@@ -13,6 +13,7 @@ class DummyTokenizer:
         self.dsn = dsn
         self.data_dir = data_dir
         self.init_state = None
+        self.analyser_cache = {}
 
 
     def init_new_db(self, config):
@@ -26,7 +27,7 @@ class DummyTokenizer:
 
 
     def name_analyzer(self):
-        return DummyNameAnalyzer()
+        return DummyNameAnalyzer(self.analyser_cache)
 
 
 class DummyNameAnalyzer:
@@ -38,18 +39,20 @@ class DummyNameAnalyzer:
         self.close()
 
 
+    def __init__(self, cache):
+        self.analyser_cache = cache
+        cache['countries'] = []
+
+
     def close(self):
-        """ Free all resources used by the analyzer.
-        """
         pass
 
     def add_postcodes_from_db(self):
         pass
 
-    def process_place(self, place):
-        """ Determine tokenizer information about the given place.
 
-            Returns a JSON-serialisable structure that will be handed into
-            the database via the token_info field.
-        """
+    def add_country_names(self, code, names):
+        self.analyser_cache['countries'].append((code, names))
+
+    def process_place(self, place):
         return {}
index e31017e89bdf174db2bdc8d3a390d2fcbe1b052b..ceac7a2421dc43afd23ecc1cf3a4c3066c7b02f1 100644 (file)
@@ -143,7 +143,8 @@ def test_truncate_database_tables(temp_db_conn, temp_db_cursor, table_factory):
               'location_property_tiger', 'location_property_osmline',
               'location_postcode', 'search_name', 'location_road_23')
     for table in tables:
-        table_factory(table, content=(1, 2, 3))
+        table_factory(table, content=((1, ), (2, ), (3, )))
+        assert temp_db_cursor.table_rows(table) == 3
 
     database_import.truncate_data_tables(temp_db_conn)
 
@@ -168,31 +169,28 @@ def test_load_data(dsn, src_dir, place_row, placex_table, osmline_table, word_ta
     assert temp_db_cursor.table_rows('placex') == 30
     assert temp_db_cursor.table_rows('location_property_osmline') == 1
 
-@pytest.mark.parametrize("languages", (False, True))
-def test_create_country_names(temp_db_conn, temp_db_cursor, def_config,
-                              temp_db_with_extensions, monkeypatch, languages):
-    if languages:
-        monkeypatch.setenv('NOMINATIM_LANGUAGES', 'fr,en')
-    temp_db_cursor.execute("""CREATE FUNCTION make_standard_name (name TEXT)
-                                  RETURNS TEXT AS $$ SELECT 'a'::TEXT $$ LANGUAGE SQL
-                               """)
-    temp_db_cursor.execute('CREATE TABLE country_name (country_code varchar(2), name hstore)')
-    temp_db_cursor.execute('CREATE TABLE word (code varchar(2))')
-    temp_db_cursor.execute("""INSERT INTO country_name VALUES ('us',
-                              '"name"=>"us","name:af"=>"us"')""")
-    temp_db_cursor.execute("""CREATE OR REPLACE FUNCTION getorcreate_country(lookup_word TEXT,
-                            lookup_country_code varchar(2))
-                            RETURNS INTEGER
-                            AS $$
-                            BEGIN
-                                INSERT INTO word VALUES (lookup_country_code);
-                                RETURN 5;
-                            END;
-                            $$
-                            LANGUAGE plpgsql;
-                               """)
-    database_import.create_country_names(temp_db_conn, def_config)
+
+@pytest.mark.parametrize("languages", (None, ' fr,en'))
+def test_create_country_names(temp_db_with_extensions, temp_db_conn, temp_db_cursor,
+                              table_factory, tokenizer_mock, languages):
+
+    table_factory('country_name', 'country_code varchar(2), name hstore',
+                  content=(('us', '"name"=>"us1","name:af"=>"us2"'),
+                           ('fr', '"name"=>"Fra", "name:en"=>"Fren"')))
+
+    assert temp_db_cursor.scalar("SELECT count(*) FROM country_name") == 2
+
+    tokenizer = tokenizer_mock()
+
+    database_import.create_country_names(temp_db_conn, tokenizer, languages)
+
+    assert len(tokenizer.analyser_cache['countries']) == 2
+
+    result_set = {k: set(v) for k, v in tokenizer.analyser_cache['countries']}
+
     if languages:
-        assert temp_db_cursor.table_rows('word') == 4
+        assert result_set == {'us' : set(('us', 'us1', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
     else:
-        assert temp_db_cursor.table_rows('word') == 5
+        assert result_set == {'us' : set(('us', 'us1', 'us2', 'United States')),
+                              'fr' : set(('fr', 'Fra', 'Fren'))}
index 53ea2b520a2cb207c0d9ac85ddada67604be5d5e..3f9bccbdd4162f81165dcdddd64f88879b179ad7 100644 (file)
@@ -11,9 +11,7 @@ def sql_tmp_path(tmp_path, def_config):
     return tmp_path
 
 @pytest.fixture
-def conn(temp_db_conn, table_factory, monkeypatch):
-    monkeypatch.setenv('NOMINATIM_DATABASE_MODULE_PATH', '.')
-    table_factory('country_name', 'partition INT', (0, 1, 2))
+def conn(sql_preprocessor, temp_db_conn):
     return temp_db_conn