Merge pull request #2322 from mtmail/type-label-already-lowercased

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 064b395c6a8a88d7d1f22230fbd463852241f3b9..065fdb03a27041eb79bb435db387fdfd316d6801 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -3,6 +3,7 @@ Tokenizer implementing normalisation as used before Nominatim 4 but using
  libICU instead of the PostgreSQL module.
  """
  from collections import Counter
+import functools
  import io
  import itertools
  import json
@@ -34,7 +35,7 @@ def create(dsn, data_dir):
  class LegacyICUTokenizer:
      """ This tokenizer uses libICU to covert names and queries to ASCII.
          Otherwise it uses the same algorithms and data structures as the
-        normalization routines in Nominatm 3.
+        normalization routines in Nominatim 3.
      """
  
      def __init__(self, dsn, data_dir):
@@ -126,7 +127,7 @@ class LegacyICUTokenizer:
              Analyzers are not thread-safe. You need to instantiate one per thread.
          """
          norm = Transliterator.createFromRules("normalizer", self.normalization)
-        trans = Transliterator.createFromRules("normalizer", self.transliteration)
+        trans = Transliterator.createFromRules("trans", self.transliteration)
          return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
  
  
@@ -184,7 +185,9 @@ class LegacyICUTokenizer:
              # copy them back into the word table
              copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
  
+
              with conn.cursor() as cur:
+                copystr.seek(0)
                  cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                  cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                 WHERE word_id is null""")
@@ -205,7 +208,6 @@ class LegacyICUNameAnalyzer:
          self.normalizer = normalizer
          self.transliterator = transliterator
          self.abbreviations = abbreviations
-        #psycopg2.extras.register_hstore(self.conn)
  
          self._cache = _TokenCache()
  
@@ -226,14 +228,44 @@ class LegacyICUNameAnalyzer:
              self.conn = None
  
  
+    def get_word_token_info(self, conn, words):
+        """ Return token information for the given list of words.
+            A word starting with # is treated as a full name, any
+            other word as a partial name.
+
+            Returns a list of tuples (original word, word token, word id).
+            The word id is None for tokens missing from the word table.
+
+            The function is used for testing and debugging only
+            and not necessarily efficient.
+        """
+        tokens = {}
+        for word in words:
+            if word.startswith('#'):
+                tokens[word] = ' ' + self.make_standard_word(word[1:])
+            else:
+                tokens[word] = self.make_standard_word(word)
+
+        with conn.cursor() as cur:
+            cur.execute("""SELECT word_token, word_id
+                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
+                           WHERE word_token = t.term
+                                 and class is null and country_code is null""",
+                        (list(tokens.values()), ))
+            ids = {r[0]: r[1] for r in cur}
+
+        return [(k, v, ids.get(v)) for k, v in tokens.items()]
+
+
      def normalize(self, phrase):
          """ Normalize the given phrase, i.e. remove all properties that
              are irrelevant for search.
          """
          return self.normalizer.transliterate(phrase)
  
+    @functools.lru_cache(maxsize=1024)
      def make_standard_word(self, name):
-        """ Create the normalised version of the name.
+        """ Create the normalised version of the input.
          """
          norm = ' ' + self.transliterator.transliterate(name) + ' '
          for full, abbr in self.abbreviations:
@@ -265,6 +297,7 @@ class LegacyICUNameAnalyzer:
                  copystr.write(self.transliterator.transliterate(postcode))
                  copystr.write('\tplace\tpostcode\t0\n')
  
+            copystr.seek(0)
              cur.copy_from(copystr, 'word',
                            columns=['word', 'word_token', 'class', 'type',
                                     'search_name_count'])
@@ -307,6 +340,7 @@ class LegacyICUNameAnalyzer:
                          copystr.write(oper if oper in ('in', 'near')  else '\\N')
                          copystr.write('\t0\n')
  
+                copystr.seek(0)
                  cur.copy_from(copystr, 'word',
                                columns=['word', 'word_token', 'class', 'type',
                                         'operator', 'search_name_count'])
@@ -328,24 +362,25 @@ class LegacyICUNameAnalyzer:
          """
          full_names = set((self.make_standard_word(n) for n in names))
          full_names.discard('')
-        self._add_normalised_country_names(country_code, full_names)
+        self._add_normalized_country_names(country_code, full_names)
  
  
-    def _add_normalised_country_names(self, country_code, names):
+    def _add_normalized_country_names(self, country_code, names):
          """ Add names for the given country to the search index.
          """
+        word_tokens = set((' ' + name for name in names))
          with self.conn.cursor() as cur:
              # Get existing names
              cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                          (country_code, ))
-            new_names = names.difference((t[0] for t in cur))
+            word_tokens.difference_update((t[0] for t in cur))
  
-            if new_names:
+            if word_tokens:
                  cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                   search_name_count)
                                 (SELECT nextval('seq_word'), token, '{}', 0
                                  FROM unnest(%s) as token)
-                            """.format(country_code), (list(new_names),))
+                            """.format(country_code), (list(word_tokens),))
  
  
      def process_place(self, place):
@@ -366,7 +401,7 @@ class LegacyICUNameAnalyzer:
  
              country_feature = place.get('country_feature')
              if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
-                self._add_normalised_country_names(country_feature.lower(),
+                self._add_normalized_country_names(country_feature.lower(),
                                                     full_names)
  
          address = place.get('address')