more formatting fixes

[nominatim.git] / nominatim / tokenizer / legacy_icu_tokenizer.py
diff --git a/nominatim/tokenizer/legacy_icu_tokenizer.py b/nominatim/tokenizer/legacy_icu_tokenizer.py

index 6bf409cca3ab3674b41605b06e8dfe49eda40e41..31e8c44b70352cbf633f13c230279f3a986cdca1 100644 (file)
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -163,7 +163,9 @@ class LegacyICUTokenizer:
              words = Counter()
              name_proc = ICUNameProcessor(self.naming_rules)
              with conn.cursor(name="words") as cur:
-                cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
+                cur.execute(""" SELECT v, count(*) FROM
+                                  (SELECT svals(name) as v FROM place)x
+                                WHERE length(v) < 75 GROUP BY v""")
  
                  for name, cnt in cur:
                      terms = set()
@@ -339,7 +341,7 @@ class LegacyICUNameAnalyzer:
                  term = self.name_processor.get_search_normalized(word)
                  if term:
                      copystr.add(word, ' ' + term, cls, typ,
-                                oper if oper in ('in', 'near')  else None, 0)
+                                oper if oper in ('in', 'near') else None, 0)
                      added += 1
  
              copystr.copy_out(cursor, 'word',
@@ -409,33 +411,36 @@ class LegacyICUNameAnalyzer:
                  self.add_country_names(country_feature.lower(), names)
  
          address = place.get('address')
-
          if address:
-            hnrs = []
-            addr_terms = []
-            for key, value in address.items():
-                if key == 'postcode':
-                    self._add_postcode(value)
-                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
-                    hnrs.append(value)
-                elif key == 'street':
-                    token_info.add_street(*self._compute_name_tokens({'name': value}))
-                elif key == 'place':
-                    token_info.add_place(*self._compute_name_tokens({'name': value}))
-                elif not key.startswith('_') and \
-                     key not in ('country', 'full'):
-                    addr_terms.append((key, *self._compute_name_tokens({'name': value})))
-
-            if hnrs:
-                hnrs = self._split_housenumbers(hnrs)
-                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
-
-            if addr_terms:
-                token_info.add_address_terms(addr_terms)
+            self._process_place_address(token_info, address)
  
          return token_info.data
  
  
+    def _process_place_address(self, token_info, address):
+        hnrs = []
+        addr_terms = []
+        for key, value in address.items():
+            if key == 'postcode':
+                self._add_postcode(value)
+            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
+                hnrs.append(value)
+            elif key == 'street':
+                token_info.add_street(*self._compute_name_tokens({'name': value}))
+            elif key == 'place':
+                token_info.add_place(*self._compute_name_tokens({'name': value}))
+            elif not key.startswith('_') and \
+                 key not in ('country', 'full'):
+                addr_terms.append((key, *self._compute_name_tokens({'name': value})))
+
+        if hnrs:
+            hnrs = self._split_housenumbers(hnrs)
+            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
+
+        if addr_terms:
+            token_info.add_address_terms(addr_terms)
+
+
      def _compute_name_tokens(self, names):
          """ Computes the full name and partial name tokens for the given
              dictionary of names.