icu: no longer precompute terms

author Sarah Hoffmann <lonvia@denofr.de>

Tue, 19 Oct 2021 09:50:06 +0000 (11:50 +0200)

committer Sarah Hoffmann <lonvia@denofr.de>

Tue, 19 Oct 2021 09:52:28 +0000 (11:52 +0200)
author Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Oct 2021 09:50:06 +0000 (11:50 +0200)
committer Sarah Hoffmann <lonvia@denofr.de>
Tue, 19 Oct 2021 09:52:28 +0000 (11:52 +0200)
diff --git a/nominatim/tokenizer/icu_tokenizer.py b/nominatim/tokenizer/icu_tokenizer.py

index 686fbd7939ee70b10a5a7557cec334ba0e324733..2af0bcb257ad214f3e67621a7ac1aaa83b7092d1 100644 (file)
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -2,7 +2,6 @@
  Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
-from collections import Counter
  import itertools
  import json
  import logging
@@ -161,43 +160,6 @@ class LegacyICUTokenizer(AbstractTokenizer):
              sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
              conn.commit()
  
-            LOG.warning("Precomputing word tokens")
-
-            # get partial words and their frequencies
-            words = self._count_partial_terms(conn)
-
-            # copy them back into the word table
-            with CopyBuffer() as copystr:
-                for term, cnt in words.items():
-                    copystr.add('w', term, json.dumps({'count': cnt}))
-
-                with conn.cursor() as cur:
-                    copystr.copy_out(cur, 'word',
-                                     columns=['type', 'word_token', 'info'])
-                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null and type = 'w'""")
-
-            conn.commit()
-
-    def _count_partial_terms(self, conn):
-        """ Count the partial terms from the names in the place table.
-        """
-        words = Counter()
-        analysis = self.loader.make_token_analysis()
-
-        with conn.cursor(name="words") as cur:
-            cur.execute(""" SELECT v, count(*) FROM
-                              (SELECT svals(name) as v FROM place)x
-                            WHERE length(v) < 75 GROUP BY v""")
-
-            for name, cnt in cur:
-                word = analysis.search.transliterate(name)
-                if word and ' ' in word:
-                    for term in set(word.split()):
-                        words[term] += cnt
-
-        return words
-
  
  class LegacyICUNameAnalyzer(AbstractAnalyzer):
      """ The legacy analyzer uses the ICU library for splitting names.
author	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 19 Oct 2021 09:50:06 +0000 (11:50 +0200)
committer	Sarah Hoffmann <lonvia@denofr.de>
	Tue, 19 Oct 2021 09:52:28 +0000 (11:52 +0200)