1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 import itertools
12 import json
13 import logging
14 import re
15 from textwrap import dedent
16
17 from nominatim.db.connection import connect
18 from nominatim.db.utils import CopyBuffer
19 from nominatim.db.sql_preprocessor import SQLPreprocessor
20 from nominatim.indexer.place_info import PlaceInfo
21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
23
24 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
25
26 LOG = logging.getLogger()
27
28 def create(dsn, data_dir):
29     """ Create a new instance of the tokenizer provided by this module.
30     """
31     return LegacyICUTokenizer(dsn, data_dir)
32
33
34 class LegacyICUTokenizer(AbstractTokenizer):
35     """ This tokenizer uses libICU to covert names and queries to ASCII.
36         Otherwise it uses the same algorithms and data structures as the
37         normalization routines in Nominatim 3.
38     """
39
40     def __init__(self, dsn, data_dir):
41         self.dsn = dsn
42         self.data_dir = data_dir
43         self.loader = None
44
45
46     def init_new_db(self, config, init_db=True):
47         """ Set up a new tokenizer for the database.
48
49             This copies all necessary data into the project directory to make
50             sure the tokenizer remains stable even over updates.
51         """
52         self.loader = ICURuleLoader(config)
53
54         self._install_php(config.lib_dir.php)
55         self._save_config()
56
57         if init_db:
58             self.update_sql_functions(config)
59             self._init_db_tables(config)
60
61
62     def init_from_project(self, config):
63         """ Initialise the tokenizer from the project directory.
64         """
65         self.loader = ICURuleLoader(config)
66
67         with connect(self.dsn) as conn:
68             self.loader.load_config_from_db(conn)
69
70
71     def finalize_import(self, config):
72         """ Do any required postprocessing to make the tokenizer data ready
73             for use.
74         """
75         with connect(self.dsn) as conn:
76             sqlp = SQLPreprocessor(conn, config)
77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
78
79
80     def update_sql_functions(self, config):
81         """ Reimport the SQL functions for this tokenizer.
82         """
83         with connect(self.dsn) as conn:
84             sqlp = SQLPreprocessor(conn, config)
85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
86
87
88     def check_database(self, config):
89         """ Check that the tokenizer is set up correctly.
90         """
91         # Will throw an error if there is an issue.
92         self.init_from_project(config)
93
94
95     def update_statistics(self):
96         """ Recompute frequencies for all name words.
97         """
98         with connect(self.dsn) as conn:
99             if conn.table_exists('search_name'):
100                 with conn.cursor() as cur:
101                     cur.drop_table("word_frequencies")
102                     LOG.info("Computing word frequencies")
103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
104                                      SELECT unnest(name_vector) as id, count(*)
105                                      FROM search_name GROUP BY id""")
106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
107                     LOG.info("Update word table with recomputed frequencies")
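                    # Merge the recomputed count into each word's info column
                    # (the jsonb || operator overwrites an existing 'count' key).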
108                     cur.execute("""UPDATE word
109                                    SET info = info || jsonb_build_object('count', count)
110                                    FROM word_frequencies WHERE word_id = id""")
111                     cur.drop_table("word_frequencies")
112             conn.commit()
113
114
115     def _cleanup_housenumbers(self):
116         """ Remove unused house numbers.
117         """
118         with connect(self.dsn) as conn:
119             if not conn.table_exists('search_name'):
120                 return
121             with conn.cursor(name="hnr_counter") as cur:
122                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token) FROM word
123                                WHERE type = 'H'
124                                  AND NOT EXISTS(SELECT * FROM search_name
125                                                 WHERE ARRAY[word.word_id] && name_vector)
126                                  AND (char_length(coalesce(word, word_token)) > 6
127                                       OR coalesce(word, word_token) not similar to '\\d+')
128                             """)
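                # Map canonical housenumber -> word_id for all tokens that are
                # no longer referenced from search_name; plain numbers of up to
                # six digits are never considered for deletion.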
129                 candidates = {token: wid for wid, token in cur}
130             with conn.cursor(name="hnr_counter") as cur:
131                 cur.execute("""SELECT housenumber FROM placex
132                                WHERE housenumber is not null
133                                      AND (char_length(housenumber) > 6
134                                           OR housenumber not similar to '\\d+')
135                             """)
136                 for row in cur:
137                     for hnr in row[0].split(';'):
138                         candidates.pop(hnr, None)
139             LOG.info("There are %s outdated housenumbers.", len(candidates))
140             LOG.debug("Outdated housenumbers: %s", candidates.keys())
141             if candidates:
142                 with conn.cursor() as cur:
143                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
144                                 (list(candidates.values()), ))
145                 conn.commit()
146
147
148
149     def update_word_tokens(self):
150         """ Remove unused tokens.
151         """
152         LOG.warning("Cleaning up housenumber tokens.")
153         self._cleanup_housenumbers()
154         LOG.warning("Tokenizer house-keeping done.")
155
156
157     def name_analyzer(self):
158         """ Create a new analyzer for tokenizing names and queries
159             using this tokenizer. Analyzers are context managers and should
160             be used accordingly:
161
162             ```
163             with tokenizer.name_analyzer() as analyzer:
164                 analyzer.tokenize()
165             ```
166
167             When used outside the with construct, the caller must make sure
168             to call close() before the analyzer is destroyed.
169
170             Analyzers are not thread-safe. You need to instantiate one per thread.
171         """
172         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
173                                      self.loader.make_token_analysis())
174
175
176     def _install_php(self, phpdir):
177         """ Install the php script for the tokenizer.
178         """
179         php_file = self.data_dir / "tokenizer.php"
180         php_file.write_text(dedent(f"""\
181             <?php
182             @define('CONST_Max_Word_Frequency', 10000000);
183             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
184             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
185             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
186
187
188     def _save_config(self):
189         """ Save the configuration that needs to remain stable for the given
190             database as database properties.
191         """
192         with connect(self.dsn) as conn:
193             self.loader.save_config_to_db(conn)
194
195
196     def _init_db_tables(self, config):
197         """ Set up the word table and fill it with pre-computed word
198             frequencies.
199         """
200         with connect(self.dsn) as conn:
201             sqlp = SQLPreprocessor(conn, config)
202             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
203             conn.commit()
204
205
206 class LegacyICUNameAnalyzer(AbstractAnalyzer):
207     """ The legacy analyzer uses the ICU library for splitting names.
208
209         Each instance opens a connection to the database to request the
210         normalization.
211     """
212
213     def __init__(self, dsn, sanitizer, token_analysis):
214         self.conn = connect(dsn).connection
215         self.conn.autocommit = True
216         self.sanitizer = sanitizer
217         self.token_analysis = token_analysis
218
219         self._cache = _TokenCache()
220
221
222     def close(self):
223         """ Free all resources used by the analyzer.
224         """
225         if self.conn:
226             self.conn.close()
227             self.conn = None
228
229
230     def _search_normalized(self, name):
231         """ Return the search token transliteration of the given name.
232         """
233         return self.token_analysis.search.transliterate(name).strip()
234
235
236     def _normalized(self, name):
237         """ Return the normalized version of the given name with all
238             non-relevant information removed.
239         """
240         return self.token_analysis.normalizer.transliterate(name).strip()
241
242
243     def get_word_token_info(self, words):
244         """ Return token information for the given list of words.
245             If a word starts with #, it is assumed to be a full name,
246             otherwise a partial name.
247
248             The function returns a list of tuples with
249             (original word, word token, word id).
250
251             The function is used for testing and debugging only
252             and is not necessarily efficient.
253         """
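        # Illustrative: for ['#Main Street', 'main'] the first entry is looked
        # up as a full name (type 'W'), the second as a partial term (type 'w').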
254         full_tokens = {}
255         partial_tokens = {}
256         for word in words:
257             if word.startswith('#'):
258                 full_tokens[word] = self._search_normalized(word[1:])
259             else:
260                 partial_tokens[word] = self._search_normalized(word)
261
262         with self.conn.cursor() as cur:
263             cur.execute("""SELECT word_token, word_id
264                             FROM word WHERE word_token = ANY(%s) and type = 'W'
265                         """, (list(full_tokens.values()),))
266             full_ids = {r[0]: r[1] for r in cur}
267             cur.execute("""SELECT word_token, word_id
268                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
269                         (list(partial_tokens.values()),))
270             part_ids = {r[0]: r[1] for r in cur}
271
272         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
273                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
274
275
276     @staticmethod
277     def normalize_postcode(postcode):
278         """ Convert the postcode to a standardized form.
279
280             This function must yield exactly the same result as the SQL function
281             'token_normalized_postcode()'.
282         """
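        # Illustrative: normalize_postcode(' se1 9ph ') -> 'SE1 9PH'; only
        # surrounding whitespace is stripped and letters are upper-cased.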
283         return postcode.strip().upper()
284
285
286     def update_postcodes_from_db(self):
287         """ Update postcode tokens in the word table from the location_postcode
288             table.
289         """
290         to_delete = []
291         with self.conn.cursor() as cur:
292             # Find entries that exist in only one of location_postcode and
293             # the word table, i.e. are missing from the other.
294             cur.execute("""SELECT * FROM
295                             (SELECT pc, word FROM
296                               (SELECT distinct(postcode) as pc FROM location_postcode) p
297                               FULL JOIN
298                               (SELECT word FROM word WHERE type = 'P') w
299                               ON pc = word) x
300                            WHERE pc is null or word is null""")
301
302             with CopyBuffer() as copystr:
303                 for postcode, word in cur:
304                     if postcode is None:
305                         to_delete.append(word)
306                     else:
307                         copystr.add(self._search_normalized(postcode),
308                                     'P', postcode)
309
310                 if to_delete:
311                     cur.execute("""DELETE FROM WORD
312                                    WHERE type ='P' and word = any(%s)
313                                 """, (to_delete, ))
314
315                 copystr.copy_out(cur, 'word',
316                                  columns=['word_token', 'type', 'word'])
317
318
319     def update_special_phrases(self, phrases, should_replace):
320         """ Replace the search index for special phrases with the new phrases.
321             If `should_replace` is True, then the previous set of phrases will be
322             completely replaced. Otherwise the phrases are added to the
323             already existing ones.
324         """
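        # Each phrase is expected as a (label, class, type, operator) tuple,
        # see the p[0]..p[3] accesses below.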
325         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
326                             for p in phrases))
327
328         with self.conn.cursor() as cur:
329             # Get the old phrases.
330             existing_phrases = set()
331             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
332             for word, info in cur:
333                 existing_phrases.add((word, info['class'], info['type'],
334                                       info.get('op') or '-'))
335
336             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
337             if should_replace:
338                 deleted = self._remove_special_phrases(cur, norm_phrases,
339                                                        existing_phrases)
340             else:
341                 deleted = 0
342
343         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
344                  len(norm_phrases), added, deleted)
345
346
347     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
348         """ Add all phrases to the database that are not yet there.
349         """
350         to_add = new_phrases - existing_phrases
351
352         added = 0
353         with CopyBuffer() as copystr:
354             for word, cls, typ, oper in to_add:
355                 term = self._search_normalized(word)
356                 if term:
357                     copystr.add(term, 'S', word,
358                                 json.dumps({'class': cls, 'type': typ,
359                                             'op': oper if oper in ('in', 'near') else None}))
360                     added += 1
361
362             copystr.copy_out(cursor, 'word',
363                              columns=['word_token', 'type', 'word', 'info'])
364
365         return added
366
367
368     @staticmethod
369     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
370         """ Remove all phrases from the databse that are no longer in the
371             new phrase list.
372         """
373         to_delete = existing_phrases - new_phrases
374
375         if to_delete:
376             cursor.execute_values(
377                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
378                     WHERE type = 'S' and word = name
379                           and info->>'class' = in_class and info->>'type' = in_type
380                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
381                 """, to_delete)
382
383         return len(to_delete)
384
385
386     def add_country_names(self, country_code, names):
387         """ Add default names for the given country to the search index.
388         """
389         # Make sure any name preprocessing for country names applies.
390         info = PlaceInfo({'name': names, 'country_code': country_code,
391                           'rank_address': 4, 'class': 'boundary',
392                           'type': 'administrative'})
393         self._add_country_full_names(country_code,
394                                      self.sanitizer.process_names(info)[0],
395                                      internal=True)
396
397
398     def _add_country_full_names(self, country_code, names, internal=False):
399         """ Add names for the given country from an already sanitized
400             name list.
401         """
402         word_tokens = set()
403         for name in names:
404             norm_name = self._search_normalized(name.name)
405             if norm_name:
406                 word_tokens.add(norm_name)
407
408         with self.conn.cursor() as cur:
409             # Get existing names
410             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
411                              FROM word
412                              WHERE type = 'C' and word = %s""",
413                         (country_code, ))
414             existing_tokens = {True: set(), False: set()} # internal/external names
415             for word in cur:
416                 existing_tokens[word[1]].add(word[0])
417
418             # Delete names that no longer exist.
419             gone_tokens = existing_tokens[internal] - word_tokens
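            # For internal names also drop plain external entries with the same
            # token; they are re-inserted below with the internal flag set.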
420             if internal:
421                 gone_tokens.update(existing_tokens[False] & word_tokens)
422             if gone_tokens:
423                 cur.execute("""DELETE FROM word
424                                USING unnest(%s) as token
425                                WHERE type = 'C' and word = %s
426                                      and word_token = token""",
427                             (list(gone_tokens), country_code))
428
429             # Only add those names that are not yet in the list.
430             new_tokens = word_tokens - existing_tokens[True]
431             if not internal:
432                 new_tokens -= existing_tokens[False]
433             if new_tokens:
434                 if internal:
435                     sql = """INSERT INTO word (word_token, type, word, info)
436                                (SELECT token, 'C', %s, '{"internal": "yes"}'
437                                   FROM unnest(%s) as token)
438                            """
439                 else:
440                     sql = """INSERT INTO word (word_token, type, word)
441                                    (SELECT token, 'C', %s
442                                     FROM unnest(%s) as token)
443                           """
444                 cur.execute(sql, (country_code, list(new_tokens)))
445
446
447     def process_place(self, place):
448         """ Determine tokenizer information about the given place.
449
450             Returns a JSON-serializable structure that will be handed into
451             the database via the token_info field.
452         """
453         token_info = _TokenInfo()
454
455         names, address = self.sanitizer.process_names(place)
456
457         if names:
458             token_info.set_names(*self._compute_name_tokens(names))
459
460             if place.is_country():
461                 self._add_country_full_names(place.country_code, names)
462
463         if address:
464             self._process_place_address(token_info, address)
465
466         return token_info.to_dict()
467
468
469     def _process_place_address(self, token_info, address):
470         for item in address:
471             if item.kind == 'postcode':
472                 self._add_postcode(item.name)
473             elif item.kind == 'housenumber':
474                 token_info.add_housenumber(*self._compute_housenumber_token(item))
475             elif item.kind == 'street':
476                 token_info.add_street(self._retrieve_full_tokens(item.name))
477             elif item.kind == 'place':
478                 if not item.suffix:
479                     token_info.add_place(self._compute_partial_tokens(item.name))
480             elif not item.kind.startswith('_') and not item.suffix and \
481                  item.kind not in ('country', 'full'):
482                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
483
484
485     def _compute_housenumber_token(self, hnr):
486         """ Normalize the housenumber and return the word token and the
487             canonical form.
488         """
489         analyzer = self.token_analysis.analysis.get('@housenumber')
490         result = None, None
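        # result is a (word_id, canonical housenumber) pair; it stays
        # (None, None) when the housenumber cannot be normalised.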
491
492         if analyzer is None:
493             # When no custom analyzer is set, simply normalize and transliterate
494             norm_name = self._search_normalized(hnr.name)
495             if norm_name:
496                 result = self._cache.housenumbers.get(norm_name, result)
497                 if result[0] is None:
498                     with self.conn.cursor() as cur:
499                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
500                         result = cur.fetchone()[0], norm_name
501                         self._cache.housenumbers[norm_name] = result
502         else:
503             # Otherwise use the analyzer to determine the canonical name.
504             # Per convention we use the first variant as the 'lookup name', the
505             # name that gets saved in the housenumber field of the place.
506             norm_name = analyzer.normalize(hnr.name)
507             if norm_name:
508                 result = self._cache.housenumbers.get(norm_name, result)
509                 if result[0] is None:
510                     variants = analyzer.get_variants_ascii(norm_name)
511                     if variants:
512                         with self.conn.cursor() as cur:
513                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
514                                         (norm_name, list(variants)))
515                             result = cur.fetchone()[0], variants[0]
516                             self._cache.housenumbers[norm_name] = result
517
518         return result
519
520
521     def _compute_partial_tokens(self, name):
522         """ Normalize the given term, split it into partial words and return
523             the token list for them.
524         """
525         norm_name = self._search_normalized(name)
526
527         tokens = []
528         need_lookup = []
529         for partial in norm_name.split():
530             token = self._cache.partials.get(partial)
531             if token:
532                 tokens.append(token)
533             else:
534                 need_lookup.append(partial)
535
536         if need_lookup:
537             with self.conn.cursor() as cur:
538                 cur.execute("""SELECT word, getorcreate_partial_word(word)
539                                FROM unnest(%s) word""",
540                             (need_lookup, ))
541
542                 for partial, token in cur:
543                     tokens.append(token)
544                     self._cache.partials[partial] = token
545
546         return tokens
547
548
549     def _retrieve_full_tokens(self, name):
550         """ Get the full name token for the given name, if it exists.
551             The name is only retrieved for the standard analyzer.
552         """
553         norm_name = self._search_normalized(name)
554
555         # return cached if possible
556         if norm_name in self._cache.fulls:
557             return self._cache.fulls[norm_name]
558
559         with self.conn.cursor() as cur:
560             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
561                         (norm_name, ))
562             full = [row[0] for row in cur]
563
564         self._cache.fulls[norm_name] = full
565
566         return full
567
568
569     def _compute_name_tokens(self, names):
570         """ Computes the full name and partial name tokens for the given
571             dictionary of names.
572         """
573         full_tokens = set()
574         partial_tokens = set()
575
576         for name in names:
577             analyzer_id = name.get_attr('analyzer')
578             analyzer = self.token_analysis.get_analyzer(analyzer_id)
579             norm_name = analyzer.normalize(name.name)
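            # Names handled by a non-default analyzer get the analyzer id
            # appended so that cache entries and word table rows stay separate.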
580             if analyzer_id is None:
581                 token_id = norm_name
582             else:
583                 token_id = f'{norm_name}@{analyzer_id}'
584
585             full, part = self._cache.names.get(token_id, (None, None))
586             if full is None:
587                 variants = analyzer.get_variants_ascii(norm_name)
588                 if not variants:
589                     continue
590
591                 with self.conn.cursor() as cur:
592                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
593                                 (token_id, variants))
594                     full, part = cur.fetchone()
595
596                 self._cache.names[token_id] = (full, part)
597
598             full_tokens.add(full)
599             partial_tokens.update(part)
600
601         return full_tokens, partial_tokens
602
603
604     def _add_postcode(self, postcode):
605         """ Make sure the normalized postcode is present in the word table.
606         """
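        # Postcodes containing ',', ';' or ':' are most likely lists of values
        # or malformed and are skipped.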
607         if re.search(r'[:,;]', postcode) is None:
608             postcode = self.normalize_postcode(postcode)
609
610             if postcode not in self._cache.postcodes:
611                 term = self._search_normalized(postcode)
612                 if not term:
613                     return
614
615                 with self.conn.cursor() as cur:
616                     # no word_id needed for postcodes
617                     cur.execute("""INSERT INTO word (word_token, type, word)
618                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
619                                     WHERE NOT EXISTS
620                                      (SELECT * FROM word
621                                       WHERE type = 'P' and word = pc))
622                                 """, (term, postcode))
623                 self._cache.postcodes.add(postcode)
624
625
626 class _TokenInfo:
627     """ Collect token information to be sent back to the database.
628     """
629     def __init__(self):
630         self.names = None
631         self.housenumbers = set()
632         self.housenumber_tokens = set()
633         self.street_tokens = set()
634         self.place_tokens = set()
635         self.address_tokens = {}
636
637
638     @staticmethod
639     def _mk_array(tokens):
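        # Illustrative: _mk_array([23, 42]) -> '{23,42}', a PostgreSQL array
        # literal suitable for the token_info import.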
640         return f"{{{','.join((str(s) for s in tokens))}}}"
641
642
643     def to_dict(self):
644         """ Return the token information in database importable format.
645         """
646         out = {}
647
648         if self.names:
649             out['names'] = self.names
650
651         if self.housenumbers:
652             out['hnr'] = ';'.join(self.housenumbers)
653             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
654
655         if self.street_tokens:
656             out['street'] = self._mk_array(self.street_tokens)
657
658         if self.place_tokens:
659             out['place'] = self._mk_array(self.place_tokens)
660
661         if self.address_tokens:
662             out['addr'] = self.address_tokens
663
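        # Illustrative result: {'names': '{1,2,3}', 'hnr': '12;12a',
        #                       'hnr_tokens': '{456}', 'street': '{78}',
        #                       'place': '{90}', 'addr': {'city': '{11}'}}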
664         return out
665
666
667     def set_names(self, fulls, partials):
668         """ Adds token information for the normalised names.
669         """
670         self.names = self._mk_array(itertools.chain(fulls, partials))
671
672
673     def add_housenumber(self, token, hnr):
674         """ Extract housenumber information from a list of normalised
675             housenumbers.
676         """
677         if token:
678             self.housenumbers.add(hnr)
679             self.housenumber_tokens.add(token)
680
681
682     def add_street(self, tokens):
683         """ Add addr:street match terms.
684         """
685         self.street_tokens.update(tokens)
686
687
688     def add_place(self, tokens):
689         """ Add addr:place search and match terms.
690         """
691         self.place_tokens.update(tokens)
692
693
694     def add_address_term(self, key, partials):
695         """ Add additional address terms.
696         """
697         if partials:
698             self.address_tokens[key] = self._mk_array(partials)
699
700
701 class _TokenCache:
702     """ Cache for token information to avoid repeated database queries.
703
704         This cache is not thread-safe and needs to be instantiated per
705         analyzer.
706     """
707     def __init__(self):
708         self.names = {}
709         self.partials = {}
710         self.fulls = {}
711         self.postcodes = set()
712         self.housenumbers = {}