php: make word list a first-class object
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py
index 6bf409cca3ab3674b41605b06e8dfe49eda40e41..44034f842622f08257878b69d392af1f47b00df7 100644
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -4,19 +4,19 @@ libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 import itertools
+import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 
-import psycopg2.extras
-
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.utils import CopyBuffer
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
+from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
 
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
@@ -29,7 +29,7 @@ def create(dsn, data_dir):
     return LegacyICUTokenizer(dsn, data_dir)
 
 
-class LegacyICUTokenizer:
+class LegacyICUTokenizer(AbstractTokenizer):
     """ This tokenizer uses libICU to covert names and queries to ASCII.
         Otherwise it uses the same algorithms and data structures as the
         normalization routines in Nominatim 3.
@@ -76,13 +76,10 @@ class LegacyICUTokenizer:
             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
 
 
-    def finalize_import(self, config):
+    def finalize_import(self, _):
         """ Do any required postprocessing to make the tokenizer data ready
             for use.
         """
-        with connect(self.dsn) as conn:
-            sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
 
 
     def update_sql_functions(self, config):
@@ -123,18 +120,17 @@ class LegacyICUTokenizer:
         """
         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 
-    # pylint: disable=missing-format-attribute
+
     def _install_php(self, phpdir):
         """ Install the php script for the tokenizer.
         """
         php_file = self.data_dir / "tokenizer.php"
-        php_file.write_text(dedent("""\
+        php_file.write_text(dedent(f"""\
             <?php
-            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
-            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
-            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
-            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
-            """.format(self, phpdir)))
+            @define('CONST_Max_Word_Frequency', {self.max_word_frequency});
+            @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
+            @define('CONST_Transliteration', "{self.naming_rules.search_rules}");
+            require_once('{phpdir}/tokenizer/legacy_icu_tokenizer.php');"""))
 
 
     def _save_config(self, config):
@@ -154,40 +150,50 @@ class LegacyICUTokenizer:
         """
         with connect(self.dsn) as conn:
             sqlp = SQLPreprocessor(conn, config)
-            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
+            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
             conn.commit()
 
             LOG.warning("Precomputing word tokens")
 
             # get partial words and their frequencies
-            words = Counter()
-            name_proc = ICUNameProcessor(self.naming_rules)
-            with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
-
-                for name, cnt in cur:
-                    terms = set()
-                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
-                        if ' ' in word:
-                            terms.update(word.split())
-                    for term in terms:
-                        words[term] += cnt
+            words = self._count_partial_terms(conn)
 
             # copy them back into the word table
             with CopyBuffer() as copystr:
-                for args in words.items():
-                    copystr.add(*args)
+                for term, cnt in words.items():
+                    copystr.add('w', term, json.dumps({'count': cnt}))
 
                 with conn.cursor() as cur:
                     copystr.copy_out(cur, 'word',
-                                     columns=['word_token', 'search_name_count'])
+                                     columns=['type', 'word_token', 'info'])
                     cur.execute("""UPDATE word SET word_id = nextval('seq_word')
-                                   WHERE word_id is null""")
+                                   WHERE word_id is null and type = 'w'""")
 
             conn.commit()
 
+    def _count_partial_terms(self, conn):
+        """ Count the partial terms from the names in the place table.
+        """
+        words = Counter()
+        name_proc = ICUNameProcessor(self.naming_rules)
+
+        with conn.cursor(name="words") as cur:
+            cur.execute(""" SELECT v, count(*) FROM
+                              (SELECT svals(name) as v FROM place)x
+                            WHERE length(v) < 75 GROUP BY v""")
 
-class LegacyICUNameAnalyzer:
+            for name, cnt in cur:
+                terms = set()
+                for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
+                    if ' ' in word:
+                        terms.update(word.split())
+                for term in terms:
+                    words[term] += cnt
+
+        return words
+
+
+class LegacyICUNameAnalyzer(AbstractAnalyzer):
     """ The legacy analyzer uses the ICU library for splitting names.
 
         Each instance opens a connection to the database to request the
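
Partial words are now stored in the combined word table as rows carrying a one-letter
type code and a JSON info payload instead of the old search_name_count column. A minimal
sketch of one such row; the values are invented, the real ones come from
_count_partial_terms() above:

    import json

    term, cnt = 'street', 913                        # a partial word and its frequency
    row = ('w', term, json.dumps({'count': cnt}))    # columns=['type', 'word_token', 'info']
    # word_id is filled in afterwards by the UPDATE ... nextval('seq_word') statement.
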
@@ -202,14 +208,6 @@ class LegacyICUNameAnalyzer:
         self._cache = _TokenCache()
 
 
-    def __enter__(self):
-        return self
-
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self.close()
-
-
     def close(self):
         """ Free all resources used by the analyzer.
         """
@@ -229,22 +227,26 @@ class LegacyICUNameAnalyzer:
             The function is used for testing and debugging only
             and not necessarily efficient.
         """
-        tokens = {}
+        full_tokens = {}
+        partial_tokens = {}
         for word in words:
             if word.startswith('#'):
-                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
+                full_tokens[word] = self.name_processor.get_search_normalized(word[1:])
             else:
-                tokens[word] = self.name_processor.get_search_normalized(word)
+                partial_tokens[word] = self.name_processor.get_search_normalized(word)
 
         with self.conn.cursor() as cur:
             cur.execute("""SELECT word_token, word_id
-                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
-                           WHERE word_token = t.term
-                                 and class is null and country_code is null""",
-                        (list(tokens.values()), ))
-            ids = {r[0]: r[1] for r in cur}
+                            FROM word WHERE word_token = ANY(%s) and type = 'W'
+                        """, (list(full_tokens.values()),))
+            full_ids = {r[0]: r[1] for r in cur}
+            cur.execute("""SELECT word_token, word_id
+                            FROM word WHERE word_token = ANY(%s) and type = 'w'""",
+                        (list(partial_tokens.values()),))
+            part_ids = {r[0]: r[1] for r in cur}
 
-        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
+        return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
+               + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 
 
     @staticmethod
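
get_word_token_info() now looks up full names (passed in with a leading '#') as rows of
type 'W' and everything else as partial words of type 'w'. A hedged usage sketch; the
name_analyzer() factory and the returned ids are assumptions for illustration only:

    # 'tokenizer' is an initialised LegacyICUTokenizer; name_analyzer() is assumed here.
    analyzer = tokenizer.name_analyzer()
    info = analyzer.get_word_token_info(['#Main Street', 'main'])
    # e.g. [('#Main Street', 'main street', 1234),   # full name, matched with type = 'W'
    #       ('main', 'main', 5678)]                   # partial word, matched with type = 'w'
    analyzer.close()
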
@@ -276,8 +278,7 @@ class LegacyICUNameAnalyzer:
                             (SELECT pc, word FROM
                               (SELECT distinct(postcode) as pc FROM location_postcode) p
                               FULL JOIN
-                              (SELECT word FROM word
-                                WHERE class ='place' and type = 'postcode') w
+                              (SELECT word FROM word WHERE type = 'P') w
                               ON pc = word) x
                            WHERE pc is null or word is null""")
 
@@ -286,24 +287,23 @@ class LegacyICUNameAnalyzer:
                     if postcode is None:
                         to_delete.append(word)
                     else:
-                        copystr.add(
-                            postcode,
-                            ' ' + self.name_processor.get_search_normalized(postcode),
-                            'place', 'postcode', 0)
+                        copystr.add(self.name_processor.get_search_normalized(postcode),
+                                    'P', postcode)
 
                 if to_delete:
                     cur.execute("""DELETE FROM WORD
-                                   WHERE class ='place' and type = 'postcode'
-                                         and word = any(%s)
+                                   WHERE type ='P' and word = any(%s)
                                 """, (to_delete, ))
 
                 copystr.copy_out(cur, 'word',
-                                 columns=['word', 'word_token', 'class', 'type',
-                                          'search_name_count'])
+                                 columns=['word_token', 'type', 'word'])
 
 
     def update_special_phrases(self, phrases, should_replace):
         """ Replace the search index for special phrases with the new phrases.
+            If `should_replace` is True, then the previous set of phrases will be
+            completely replaced. Otherwise the phrases are added to the
+            already existing ones.
         """
         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                             for p in phrases))
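
Postcodes follow the same scheme: a single type 'P' row whose token is the
search-normalized postcode and whose word column keeps the original spelling; the old
class/type/search_name_count columns are gone. An invented example row:

    row = ('ec1a 1bb',    # word_token: search-normalized form of the postcode
           'P',           # type code for postcodes
           'EC1A 1BB')    # word: postcode as stored in location_postcode
    # matches columns=['word_token', 'type', 'word'] in the copystr.copy_out() call above
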
@@ -311,11 +311,10 @@ class LegacyICUNameAnalyzer:
         with self.conn.cursor() as cur:
             # Get the old phrases.
             existing_phrases = set()
-            cur.execute("""SELECT word, class, type, operator FROM word
-                           WHERE class != 'place'
-                                 OR (type != 'house' AND type != 'postcode')""")
-            for label, cls, typ, oper in cur:
-                existing_phrases.add((label, cls, typ, oper or '-'))
+            cur.execute("SELECT word, info FROM word WHERE type = 'S'")
+            for word, info in cur:
+                existing_phrases.add((word, info['class'], info['type'],
+                                      info.get('op') or '-'))
 
             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
             if should_replace:
@@ -338,13 +337,13 @@ class LegacyICUNameAnalyzer:
             for word, cls, typ, oper in to_add:
                 term = self.name_processor.get_search_normalized(word)
                 if term:
-                    copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near')  else None, 0)
+                    copystr.add(term, 'S', word,
+                                json.dumps({'class': cls, 'type': typ,
+                                            'op': oper if oper in ('in', 'near') else None}))
                     added += 1
 
             copystr.copy_out(cursor, 'word',
-                             columns=['word', 'word_token', 'class', 'type',
-                                      'operator', 'search_name_count'])
+                             columns=['word_token', 'type', 'word', 'info'])
 
         return added
 
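
Special phrases keep their class, type and operator inside the JSON info column of a
type 'S' row. A sketch of the tuple written for a single phrase, with invented values
('bar' stands in for the search-normalized term):

    import json

    word, cls, typ, oper = 'Bar', 'amenity', 'bar', '-'
    row = ('bar', 'S', word,
           json.dumps({'class': cls, 'type': typ,
                       'op': oper if oper in ('in', 'near') else None}))
    # matches columns=['word_token', 'type', 'word', 'info'] above
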
@@ -357,12 +356,12 @@ class LegacyICUNameAnalyzer:
         to_delete = existing_phrases - new_phrases
 
         if to_delete:
-            psycopg2.extras.execute_values(
-                cursor,
+            cursor.execute_values(
                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
-                    WHERE word = name and class = in_class and type = in_type
-                          and ((op = '-' and operator is null) or op = operator)""",
-                to_delete)
+                    WHERE type = 'S' and word = name
+                          and info->>'class' = in_class and info->>'type' = in_type
+                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
+                """, to_delete)
 
         return len(to_delete)
 
@@ -372,21 +371,27 @@ class LegacyICUNameAnalyzer:
         """
         word_tokens = set()
         for name in self._compute_full_names(names):
-            if name:
-                word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
+            norm_name = self.name_processor.get_search_normalized(name)
+            if norm_name:
+                word_tokens.add(norm_name)
 
         with self.conn.cursor() as cur:
             # Get existing names
-            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
+            cur.execute("""SELECT word_token FROM word
+                            WHERE type = 'C' and word = %s""",
                         (country_code, ))
             word_tokens.difference_update((t[0] for t in cur))
 
+            # Only add those names that are not yet in the list.
             if word_tokens:
-                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
-                                                 search_name_count)
-                               (SELECT nextval('seq_word'), token, '{}', 0
+                cur.execute("""INSERT INTO word (word_token, type, word)
+                               (SELECT token, 'C', %s
                                 FROM unnest(%s) as token)
-                            """.format(country_code), (list(word_tokens),))
+                            """, (country_code, list(word_tokens)))
+
+            # No names are deleted at the moment.
+            # If deletion is made possible, then the static names from the
+            # initial 'country_name' table should be kept.
 
 
     def process_place(self, place):
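
Country names lose their dedicated country_code column: the code now lives in the
generic word column of a type 'C' row. An invented example of what the INSERT above
produces for add_country_names('de', ...):

    row = ('deutschland',  # word_token: a search-normalized country name
           'C',            # type code for country names
           'de')           # word: the country code itself
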
@@ -409,33 +414,36 @@ class LegacyICUNameAnalyzer:
                 self.add_country_names(country_feature.lower(), names)
 
         address = place.get('address')
-
         if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+            self._process_place_address(token_info, address)
 
         return token_info.data
 
 
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
     def _compute_name_tokens(self, names):
         """ Computes the full name and partial name tokens for the given
             dictionary of names.
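
The address handling itself is unchanged, only moved into the _process_place_address()
helper. An illustrative input showing how the keys are dispatched (all values invented):

    address = {'postcode': '12345',        # -> _add_postcode()
               'housenumber': '4a',        # -> housenumber token list
               'street': 'Hauptstraße',    # -> token_info.add_street()
               'city': 'Berlin'}           # any other key -> addr_terms
    # called from process_place() as: self._process_place_address(token_info, address)
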
@@ -495,14 +503,12 @@ class LegacyICUNameAnalyzer:
 
                 with self.conn.cursor() as cur:
                     # no word_id needed for postcodes
-                    cur.execute("""INSERT INTO word (word, word_token, class, type,
-                                                     search_name_count)
-                                   (SELECT pc, %s, 'place', 'postcode', 0
-                                    FROM (VALUES (%s)) as v(pc)
+                    cur.execute("""INSERT INTO word (word_token, type, word)
+                                   (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
                                     WHERE NOT EXISTS
                                      (SELECT * FROM word
-                                      WHERE word = pc and class='place' and type='postcode'))
-                                """, (' ' + term, postcode))
+                                      WHERE type = 'P' and word = pc))
+                                """, (term, postcode))
                 self._cache.postcodes.add(postcode)
 
 
@@ -593,7 +599,8 @@ class _TokenCache:
 
     def get_hnr_tokens(self, conn, terms):
         """ Get token ids for a list of housenumbers, looking them up in the
-            database if necessary.
+            database if necessary. `terms` is an iterable of normalized
+            housenumbers.
         """
         tokens = []
         askdb = []