+++ /dev/null
-DROP TABLE IF EXISTS word_frequencies;
-CREATE TABLE word_frequencies AS
- SELECT unnest(name_vector) as id, count(*) FROM search_name GROUP BY id;
-
-CREATE INDEX idx_word_frequencies ON word_frequencies(id);
-
-UPDATE word SET search_name_count = count
- FROM word_frequencies
- WHERE word_token like ' %' and word_id = id;
-
-DROP TABLE word_frequencies;
"Postcode updates on a frozen database is not possible.")
if args.word_counts:
- LOG.warning('Recompute frequency of full-word search terms')
- refresh.recompute_word_counts(args.config.get_libpq_dsn(), args.sqllib_dir)
+ LOG.warning('Recompute word statistics')
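+            # The layout of the word table is tokenizer-specific, so the
+            # actual statistics computation is delegated to the tokenizer.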
+ self._get_tokenizer(args.config).update_statistics()
if args.address_levels:
cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
pass
+ @abstractmethod
+ def update_statistics(self) -> None:
+ """ Recompute any tokenizer statistics necessary for efficient lookup.
+ This function is meant to be called from time to time by the user
+            to improve performance. However, the tokenizer must not rely on
+            it being called in order to work.
+ """
+ pass
+
+
@abstractmethod
def name_analyzer(self) -> AbstractAnalyzer:
""" Create a new analyzer for tokenizing names and queries
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
-from collections import Counter
import itertools
import json
import logging
return None
+ def update_statistics(self):
+ """ Recompute frequencies for all name words.
+ """
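+        # Aggregate the counts into an indexed temporary table first, so the
+        # word table is updated with a single join instead of row by row.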
+ with connect(self.dsn) as conn:
+ with conn.cursor() as cur:
+ cur.drop_table("word_frequencies")
+ LOG.info("Computing word frequencies")
+ cur.execute("""CREATE TEMP TABLE word_frequencies AS
+ SELECT unnest(name_vector) as id, count(*)
+ FROM search_name GROUP BY id""")
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
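+                # The ICU word table keeps per-token metadata in the JSONB
+                # 'info' column; merge the fresh count into it.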
+ cur.execute("""UPDATE word
+ SET info = info || jsonb_build_object('count', count)
+ FROM word_frequencies WHERE word_id = id""")
+ cur.drop_table("word_frequencies")
+ conn.commit()
+
+
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
conn.commit()
- LOG.warning("Precomputing word tokens")
-
- # get partial words and their frequencies
- words = self._count_partial_terms(conn)
-
- # copy them back into the word table
- with CopyBuffer() as copystr:
- for term, cnt in words.items():
- copystr.add('w', term, json.dumps({'count': cnt}))
-
- with conn.cursor() as cur:
- copystr.copy_out(cur, 'word',
- columns=['type', 'word_token', 'info'])
- cur.execute("""UPDATE word SET word_id = nextval('seq_word')
- WHERE word_id is null and type = 'w'""")
-
- conn.commit()
-
- def _count_partial_terms(self, conn):
- """ Count the partial terms from the names in the place table.
- """
- words = Counter()
- analysis = self.loader.make_token_analysis()
-
- with conn.cursor(name="words") as cur:
- cur.execute(""" SELECT v, count(*) FROM
- (SELECT svals(name) as v FROM place)x
- WHERE length(v) < 75 GROUP BY v""")
-
- for name, cnt in cur:
- word = analysis.search.transliterate(name)
- if word and ' ' in word:
- for term in set(word.split()):
- words[term] += cnt
-
- return words
-
class LegacyICUNameAnalyzer(AbstractAnalyzer):
""" The legacy analyzer uses the ICU library for splitting names.
self._save_config(conn, config)
+ def update_statistics(self):
+ """ Recompute the frequency of full words.
+ """
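+        # Same approach as the ICU tokenizer: aggregate the counts into an
+        # indexed temporary table, then update the word table in one join.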
+ with connect(self.dsn) as conn:
+ with conn.cursor() as cur:
+ cur.drop_table("word_frequencies")
+ LOG.info("Computing word frequencies")
+ cur.execute("""CREATE TEMP TABLE word_frequencies AS
+ SELECT unnest(name_vector) as id, count(*)
+ FROM search_name GROUP BY id""")
+ cur.execute("CREATE INDEX ON word_frequencies(id)")
+ LOG.info("Update word table with recomputed frequencies")
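+                # Only full words get a search_name_count; in the legacy
+                # scheme their tokens are marked by a leading blank.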
+ cur.execute("""UPDATE word SET search_name_count = count
+ FROM word_frequencies
+ WHERE word_token like ' %' and word_id = id""")
+ cur.drop_table("word_frequencies")
+ conn.commit()
+
def name_analyzer(self):
""" Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
LOG = logging.getLogger()
-def recompute_word_counts(dsn, sql_dir):
- """ Compute the frequency of full-word search terms.
- """
- execute_file(dsn, sql_dir / 'words_from_search_name.sql')
-
-
def _add_address_level_rows_from_entry(rows, entry):
""" Converts a single entry from the JSON format for address rank
descriptions into a flat format suitable for inserting into a
def __init__(self, *args, **kwargs):
self.update_sql_functions_called = False
self.finalize_import_called = False
+ self.update_statistics_called = False
def update_sql_functions(self, *args):
self.update_sql_functions_called = True
def finalize_import(self, *args):
self.finalize_import_called = True
+ def update_statistics(self):
+ self.update_statistics_called = True
+
+
tok = DummyTokenizer()
monkeypatch.setattr(nominatim.tokenizer.factory, 'get_tokenizer_for_db',
lambda *args: tok)
assert func.called == 1
@pytest.mark.parametrize("command,func", [
- ('word-counts', 'recompute_word_counts'),
('address-levels', 'load_address_levels_from_file'),
('wiki-data', 'import_wikipedia_articles'),
('importance', 'recompute_importance'),
assert func_mock.called == 1
+ def test_refresh_word_count(self):
+        assert self.call_nominatim('refresh', '--word-counts') == 0
+ assert self.tokenizer_mock.update_statistics_called
+
+
def test_refresh_postcodes(self, mock_func_factory, place_table):
func_mock = mock_func_factory(nominatim.tools.postcodes, 'update_postcodes')
idx_mock = mock_func_factory(nominatim.indexer.indexer.Indexer, 'index_postcodes')
assert db_prop(icu_tokenizer.DBCFG_TERM_NORMALIZATION) == ':: lower();'
-def test_init_word_table(tokenizer_factory, test_config, place_row, word_table):
+def test_init_word_table(tokenizer_factory, test_config, place_row, temp_db_cursor):
place_row(names={'name' : 'Test Area', 'ref' : '52'})
place_row(names={'name' : 'No Area'})
place_row(names={'name' : 'Holzstrasse'})
tok = tokenizer_factory()
tok.init_new_db(test_config)
- assert word_table.get_partial_words() == {('test', 1),
- ('no', 1), ('area', 2)}
+ assert temp_db_cursor.table_exists('word')
def test_init_from_project(monkeypatch, test_config, tokenizer_factory):