]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tokenizer/icu_tokenizer.py
do not expand records in select list
[nominatim.git] / nominatim / tokenizer / icu_tokenizer.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Tokenizer implementing normalisation as used before Nominatim 4 but using
9 libICU instead of the PostgreSQL module.
10 """
11 import itertools
12 import json
13 import logging
14 import re
15 from textwrap import dedent
16
17 from nominatim.db.connection import connect
18 from nominatim.db.utils import CopyBuffer
19 from nominatim.db.sql_preprocessor import SQLPreprocessor
20 from nominatim.indexer.place_info import PlaceInfo
21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
22 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
23
24 DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"
25
26 LOG = logging.getLogger()
27
28 def create(dsn, data_dir):
29     """ Create a new instance of the tokenizer provided by this module.
30     """
31     return LegacyICUTokenizer(dsn, data_dir)
32
33
34 class LegacyICUTokenizer(AbstractTokenizer):
35     """ This tokenizer uses libICU to covert names and queries to ASCII.
36         Otherwise it uses the same algorithms and data structures as the
37         normalization routines in Nominatim 3.
38     """
39
40     def __init__(self, dsn, data_dir):
41         self.dsn = dsn
42         self.data_dir = data_dir
43         self.loader = None
44
45
46     def init_new_db(self, config, init_db=True):
47         """ Set up a new tokenizer for the database.
48
49             This copies all necessary data in the project directory to make
50             sure the tokenizer remains stable even over updates.
51         """
52         self.loader = ICURuleLoader(config)
53
54         self._install_php(config.lib_dir.php)
55         self._save_config()
56
57         if init_db:
58             self.update_sql_functions(config)
59             self._init_db_tables(config)
60
61
62     def init_from_project(self, config):
63         """ Initialise the tokenizer from the project directory.
64         """
65         self.loader = ICURuleLoader(config)
66
67         with connect(self.dsn) as conn:
68             self.loader.load_config_from_db(conn)
69
70
71     def finalize_import(self, config):
72         """ Do any required postprocessing to make the tokenizer data ready
73             for use.
74         """
75         with connect(self.dsn) as conn:
76             sqlp = SQLPreprocessor(conn, config)
77             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
78
79
80     def update_sql_functions(self, config):
81         """ Reimport the SQL functions for this tokenizer.
82         """
83         with connect(self.dsn) as conn:
84             sqlp = SQLPreprocessor(conn, config)
85             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
86
87
88     def check_database(self, config):
89         """ Check that the tokenizer is set up correctly.
90         """
91         # Will throw an error if there is an issue.
92         self.init_from_project(config)
93
94
95     def update_statistics(self):
96         """ Recompute frequencies for all name words.
97         """
98         with connect(self.dsn) as conn:
99             if conn.table_exists('search_name'):
100                 with conn.cursor() as cur:
101                     cur.drop_table("word_frequencies")
102                     LOG.info("Computing word frequencies")
103                     cur.execute("""CREATE TEMP TABLE word_frequencies AS
104                                      SELECT unnest(name_vector) as id, count(*)
105                                      FROM search_name GROUP BY id""")
106                     cur.execute("CREATE INDEX ON word_frequencies(id)")
107                     LOG.info("Update word table with recomputed frequencies")
108                     cur.execute("""UPDATE word
109                                    SET info = info || jsonb_build_object('count', count)
110                                    FROM word_frequencies WHERE word_id = id""")
111                     cur.drop_table("word_frequencies")
112             conn.commit()
113
114
115     def _cleanup_housenumbers(self):
116         """ Remove unused house numbers.
117         """
118         with connect(self.dsn) as conn:
119             if not conn.table_exists('search_name'):
120                 return
121             with conn.cursor(name="hnr_counter") as cur:
122                 cur.execute("""SELECT DISTINCT word_id, coalesce(info->>'lookup', word_token)
123                                FROM word
124                                WHERE type = 'H'
125                                  AND NOT EXISTS(SELECT * FROM search_name
126                                                 WHERE ARRAY[word.word_id] && name_vector)
127                                  AND (char_length(coalesce(word, word_token)) > 6
128                                       OR coalesce(word, word_token) not similar to '\\d+')
129                             """)
130                 candidates = {token: wid for wid, token in cur}
131             with conn.cursor(name="hnr_counter") as cur:
132                 cur.execute("""SELECT housenumber FROM placex
133                                WHERE housenumber is not null
134                                      AND (char_length(housenumber) > 6
135                                           OR housenumber not similar to '\\d+')
136                             """)
137                 for row in cur:
138                     for hnr in row[0].split(';'):
139                         candidates.pop(hnr, None)
140             LOG.info("There are %s outdated housenumbers.", len(candidates))
141             LOG.debug("Outdated housenumbers: %s", candidates.keys())
142             if candidates:
143                 with conn.cursor() as cur:
144                     cur.execute("""DELETE FROM word WHERE word_id = any(%s)""",
145                                 (list(candidates.values()), ))
146                 conn.commit()
147
148
149
150     def update_word_tokens(self):
151         """ Remove unused tokens.
152         """
153         LOG.warning("Cleaning up housenumber tokens.")
154         self._cleanup_housenumbers()
155         LOG.warning("Tokenizer house-keeping done.")
156
157
158     def name_analyzer(self):
159         """ Create a new analyzer for tokenizing names and queries
160             using this tokinzer. Analyzers are context managers and should
161             be used accordingly:
162
163             ```
164             with tokenizer.name_analyzer() as analyzer:
165                 analyser.tokenize()
166             ```
167
168             When used outside the with construct, the caller must ensure to
169             call the close() function before destructing the analyzer.
170
171             Analyzers are not thread-safe. You need to instantiate one per thread.
172         """
173         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
174                                      self.loader.make_token_analysis())
175
176
177     def _install_php(self, phpdir):
178         """ Install the php script for the tokenizer.
179         """
180         php_file = self.data_dir / "tokenizer.php"
181         php_file.write_text(dedent(f"""\
182             <?php
183             @define('CONST_Max_Word_Frequency', 10000000);
184             @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
185             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
186             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
187
188
189     def _save_config(self):
190         """ Save the configuration that needs to remain stable for the given
191             database as database properties.
192         """
193         with connect(self.dsn) as conn:
194             self.loader.save_config_to_db(conn)
195
196
197     def _init_db_tables(self, config):
198         """ Set up the word table and fill it with pre-computed word
199             frequencies.
200         """
201         with connect(self.dsn) as conn:
202             sqlp = SQLPreprocessor(conn, config)
203             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
204             conn.commit()
205
206
207 class LegacyICUNameAnalyzer(AbstractAnalyzer):
208     """ The legacy analyzer uses the ICU library for splitting names.
209
210         Each instance opens a connection to the database to request the
211         normalization.
212     """
213
214     def __init__(self, dsn, sanitizer, token_analysis):
215         self.conn = connect(dsn).connection
216         self.conn.autocommit = True
217         self.sanitizer = sanitizer
218         self.token_analysis = token_analysis
219
220         self._cache = _TokenCache()
221
222
223     def close(self):
224         """ Free all resources used by the analyzer.
225         """
226         if self.conn:
227             self.conn.close()
228             self.conn = None
229
230
231     def _search_normalized(self, name):
232         """ Return the search token transliteration of the given name.
233         """
234         return self.token_analysis.search.transliterate(name).strip()
235
236
237     def _normalized(self, name):
238         """ Return the normalized version of the given name with all
239             non-relevant information removed.
240         """
241         return self.token_analysis.normalizer.transliterate(name).strip()
242
243
244     def get_word_token_info(self, words):
245         """ Return token information for the given list of words.
246             If a word starts with # it is assumed to be a full name
247             otherwise is a partial name.
248
249             The function returns a list of tuples with
250             (original word, word token, word id).
251
252             The function is used for testing and debugging only
253             and not necessarily efficient.
254         """
255         full_tokens = {}
256         partial_tokens = {}
257         for word in words:
258             if word.startswith('#'):
259                 full_tokens[word] = self._search_normalized(word[1:])
260             else:
261                 partial_tokens[word] = self._search_normalized(word)
262
263         with self.conn.cursor() as cur:
264             cur.execute("""SELECT word_token, word_id
265                             FROM word WHERE word_token = ANY(%s) and type = 'W'
266                         """, (list(full_tokens.values()),))
267             full_ids = {r[0]: r[1] for r in cur}
268             cur.execute("""SELECT word_token, word_id
269                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
270                         (list(partial_tokens.values()),))
271             part_ids = {r[0]: r[1] for r in cur}
272
273         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
274                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
275
276
277     @staticmethod
278     def normalize_postcode(postcode):
279         """ Convert the postcode to a standardized form.
280
281             This function must yield exactly the same result as the SQL function
282             'token_normalized_postcode()'.
283         """
284         return postcode.strip().upper()
285
286
287     def update_postcodes_from_db(self):
288         """ Update postcode tokens in the word table from the location_postcode
289             table.
290         """
291         to_delete = []
292         with self.conn.cursor() as cur:
293             # This finds us the rows in location_postcode and word that are
294             # missing in the other table.
295             cur.execute("""SELECT * FROM
296                             (SELECT pc, word FROM
297                               (SELECT distinct(postcode) as pc FROM location_postcode) p
298                               FULL JOIN
299                               (SELECT word FROM word WHERE type = 'P') w
300                               ON pc = word) x
301                            WHERE pc is null or word is null""")
302
303             with CopyBuffer() as copystr:
304                 for postcode, word in cur:
305                     if postcode is None:
306                         to_delete.append(word)
307                     else:
308                         copystr.add(self._search_normalized(postcode),
309                                     'P', postcode)
310
311                 if to_delete:
312                     cur.execute("""DELETE FROM WORD
313                                    WHERE type ='P' and word = any(%s)
314                                 """, (to_delete, ))
315
316                 copystr.copy_out(cur, 'word',
317                                  columns=['word_token', 'type', 'word'])
318
319
320     def update_special_phrases(self, phrases, should_replace):
321         """ Replace the search index for special phrases with the new phrases.
322             If `should_replace` is True, then the previous set of will be
323             completely replaced. Otherwise the phrases are added to the
324             already existing ones.
325         """
326         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
327                             for p in phrases))
328
329         with self.conn.cursor() as cur:
330             # Get the old phrases.
331             existing_phrases = set()
332             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
333             for word, info in cur:
334                 existing_phrases.add((word, info['class'], info['type'],
335                                       info.get('op') or '-'))
336
337             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
338             if should_replace:
339                 deleted = self._remove_special_phrases(cur, norm_phrases,
340                                                        existing_phrases)
341             else:
342                 deleted = 0
343
344         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
345                  len(norm_phrases), added, deleted)
346
347
348     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
349         """ Add all phrases to the database that are not yet there.
350         """
351         to_add = new_phrases - existing_phrases
352
353         added = 0
354         with CopyBuffer() as copystr:
355             for word, cls, typ, oper in to_add:
356                 term = self._search_normalized(word)
357                 if term:
358                     copystr.add(term, 'S', word,
359                                 json.dumps({'class': cls, 'type': typ,
360                                             'op': oper if oper in ('in', 'near') else None}))
361                     added += 1
362
363             copystr.copy_out(cursor, 'word',
364                              columns=['word_token', 'type', 'word', 'info'])
365
366         return added
367
368
369     @staticmethod
370     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
371         """ Remove all phrases from the databse that are no longer in the
372             new phrase list.
373         """
374         to_delete = existing_phrases - new_phrases
375
376         if to_delete:
377             cursor.execute_values(
378                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
379                     WHERE type = 'S' and word = name
380                           and info->>'class' = in_class and info->>'type' = in_type
381                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
382                 """, to_delete)
383
384         return len(to_delete)
385
386
387     def add_country_names(self, country_code, names):
388         """ Add default names for the given country to the search index.
389         """
390         # Make sure any name preprocessing for country names applies.
391         info = PlaceInfo({'name': names, 'country_code': country_code,
392                           'rank_address': 4, 'class': 'boundary',
393                           'type': 'administrative'})
394         self._add_country_full_names(country_code,
395                                      self.sanitizer.process_names(info)[0],
396                                      internal=True)
397
398
399     def _add_country_full_names(self, country_code, names, internal=False):
400         """ Add names for the given country from an already sanitized
401             name list.
402         """
403         word_tokens = set()
404         for name in names:
405             norm_name = self._search_normalized(name.name)
406             if norm_name:
407                 word_tokens.add(norm_name)
408
409         with self.conn.cursor() as cur:
410             # Get existing names
411             cur.execute("""SELECT word_token, coalesce(info ? 'internal', false) as is_internal
412                              FROM word
413                              WHERE type = 'C' and word = %s""",
414                         (country_code, ))
415             existing_tokens = {True: set(), False: set()} # internal/external names
416             for word in cur:
417                 existing_tokens[word[1]].add(word[0])
418
419             # Delete names that no longer exist.
420             gone_tokens = existing_tokens[internal] - word_tokens
421             if internal:
422                 gone_tokens.update(existing_tokens[False] & word_tokens)
423             if gone_tokens:
424                 cur.execute("""DELETE FROM word
425                                USING unnest(%s) as token
426                                WHERE type = 'C' and word = %s
427                                      and word_token = token""",
428                             (list(gone_tokens), country_code))
429
430             # Only add those names that are not yet in the list.
431             new_tokens = word_tokens - existing_tokens[True]
432             if not internal:
433                 new_tokens -= existing_tokens[False]
434             if new_tokens:
435                 if internal:
436                     sql = """INSERT INTO word (word_token, type, word, info)
437                                (SELECT token, 'C', %s, '{"internal": "yes"}'
438                                   FROM unnest(%s) as token)
439                            """
440                 else:
441                     sql = """INSERT INTO word (word_token, type, word)
442                                    (SELECT token, 'C', %s
443                                     FROM unnest(%s) as token)
444                           """
445                 cur.execute(sql, (country_code, list(new_tokens)))
446
447
448     def process_place(self, place):
449         """ Determine tokenizer information about the given place.
450
451             Returns a JSON-serializable structure that will be handed into
452             the database via the token_info field.
453         """
454         token_info = _TokenInfo()
455
456         names, address = self.sanitizer.process_names(place)
457
458         if names:
459             token_info.set_names(*self._compute_name_tokens(names))
460
461             if place.is_country():
462                 self._add_country_full_names(place.country_code, names)
463
464         if address:
465             self._process_place_address(token_info, address)
466
467         return token_info.to_dict()
468
469
470     def _process_place_address(self, token_info, address):
471         for item in address:
472             if item.kind == 'postcode':
473                 self._add_postcode(item.name)
474             elif item.kind == 'housenumber':
475                 token_info.add_housenumber(*self._compute_housenumber_token(item))
476             elif item.kind == 'street':
477                 token_info.add_street(self._retrieve_full_tokens(item.name))
478             elif item.kind == 'place':
479                 if not item.suffix:
480                     token_info.add_place(self._compute_partial_tokens(item.name))
481             elif not item.kind.startswith('_') and not item.suffix and \
482                  item.kind not in ('country', 'full'):
483                 token_info.add_address_term(item.kind, self._compute_partial_tokens(item.name))
484
485
486     def _compute_housenumber_token(self, hnr):
487         """ Normalize the housenumber and return the word token and the
488             canonical form.
489         """
490         analyzer = self.token_analysis.analysis.get('@housenumber')
491         result = None, None
492
493         if analyzer is None:
494             # When no custom analyzer is set, simply normalize and transliterate
495             norm_name = self._search_normalized(hnr.name)
496             if norm_name:
497                 result = self._cache.housenumbers.get(norm_name, result)
498                 if result[0] is None:
499                     with self.conn.cursor() as cur:
500                         cur.execute("SELECT getorcreate_hnr_id(%s)", (norm_name, ))
501                         result = cur.fetchone()[0], norm_name
502                         self._cache.housenumbers[norm_name] = result
503         else:
504             # Otherwise use the analyzer to determine the canonical name.
505             # Per convention we use the first variant as the 'lookup name', the
506             # name that gets saved in the housenumber field of the place.
507             norm_name = analyzer.normalize(hnr.name)
508             if norm_name:
509                 result = self._cache.housenumbers.get(norm_name, result)
510                 if result[0] is None:
511                     variants = analyzer.get_variants_ascii(norm_name)
512                     if variants:
513                         with self.conn.cursor() as cur:
514                             cur.execute("SELECT create_analyzed_hnr_id(%s, %s)",
515                                         (norm_name, list(variants)))
516                             result = cur.fetchone()[0], variants[0]
517                             self._cache.housenumbers[norm_name] = result
518
519         return result
520
521
522     def _compute_partial_tokens(self, name):
523         """ Normalize the given term, split it into partial words and return
524             then token list for them.
525         """
526         norm_name = self._search_normalized(name)
527
528         tokens = []
529         need_lookup = []
530         for partial in norm_name.split():
531             token = self._cache.partials.get(partial)
532             if token:
533                 tokens.append(token)
534             else:
535                 need_lookup.append(partial)
536
537         if need_lookup:
538             with self.conn.cursor() as cur:
539                 cur.execute("""SELECT word, getorcreate_partial_word(word)
540                                FROM unnest(%s) word""",
541                             (need_lookup, ))
542
543                 for partial, token in cur:
544                     tokens.append(token)
545                     self._cache.partials[partial] = token
546
547         return tokens
548
549
550     def _retrieve_full_tokens(self, name):
551         """ Get the full name token for the given name, if it exists.
552             The name is only retrived for the standard analyser.
553         """
554         norm_name = self._search_normalized(name)
555
556         # return cached if possible
557         if norm_name in self._cache.fulls:
558             return self._cache.fulls[norm_name]
559
560         with self.conn.cursor() as cur:
561             cur.execute("SELECT word_id FROM word WHERE word_token = %s and type = 'W'",
562                         (norm_name, ))
563             full = [row[0] for row in cur]
564
565         self._cache.fulls[norm_name] = full
566
567         return full
568
569
570     def _compute_name_tokens(self, names):
571         """ Computes the full name and partial name tokens for the given
572             dictionary of names.
573         """
574         full_tokens = set()
575         partial_tokens = set()
576
577         for name in names:
578             analyzer_id = name.get_attr('analyzer')
579             analyzer = self.token_analysis.get_analyzer(analyzer_id)
580             norm_name = analyzer.normalize(name.name)
581             if analyzer_id is None:
582                 token_id = norm_name
583             else:
584                 token_id = f'{norm_name}@{analyzer_id}'
585
586             full, part = self._cache.names.get(token_id, (None, None))
587             if full is None:
588                 variants = analyzer.get_variants_ascii(norm_name)
589                 if not variants:
590                     continue
591
592                 with self.conn.cursor() as cur:
593                     cur.execute("SELECT * FROM getorcreate_full_word(%s, %s)",
594                                 (token_id, variants))
595                     full, part = cur.fetchone()
596
597                 self._cache.names[token_id] = (full, part)
598
599             full_tokens.add(full)
600             partial_tokens.update(part)
601
602         return full_tokens, partial_tokens
603
604
605     def _add_postcode(self, postcode):
606         """ Make sure the normalized postcode is present in the word table.
607         """
608         if re.search(r'[:,;]', postcode) is None:
609             postcode = self.normalize_postcode(postcode)
610
611             if postcode not in self._cache.postcodes:
612                 term = self._search_normalized(postcode)
613                 if not term:
614                     return
615
616                 with self.conn.cursor() as cur:
617                     # no word_id needed for postcodes
618                     cur.execute("""INSERT INTO word (word_token, type, word)
619                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
620                                     WHERE NOT EXISTS
621                                      (SELECT * FROM word
622                                       WHERE type = 'P' and word = pc))
623                                 """, (term, postcode))
624                 self._cache.postcodes.add(postcode)
625
626
627 class _TokenInfo:
628     """ Collect token information to be sent back to the database.
629     """
630     def __init__(self):
631         self.names = None
632         self.housenumbers = set()
633         self.housenumber_tokens = set()
634         self.street_tokens = set()
635         self.place_tokens = set()
636         self.address_tokens = {}
637
638
639     @staticmethod
640     def _mk_array(tokens):
641         return f"{{{','.join((str(s) for s in tokens))}}}"
642
643
644     def to_dict(self):
645         """ Return the token information in database importable format.
646         """
647         out = {}
648
649         if self.names:
650             out['names'] = self.names
651
652         if self.housenumbers:
653             out['hnr'] = ';'.join(self.housenumbers)
654             out['hnr_tokens'] = self._mk_array(self.housenumber_tokens)
655
656         if self.street_tokens:
657             out['street'] = self._mk_array(self.street_tokens)
658
659         if self.place_tokens:
660             out['place'] = self._mk_array(self.place_tokens)
661
662         if self.address_tokens:
663             out['addr'] = self.address_tokens
664
665         return out
666
667
668     def set_names(self, fulls, partials):
669         """ Adds token information for the normalised names.
670         """
671         self.names = self._mk_array(itertools.chain(fulls, partials))
672
673
674     def add_housenumber(self, token, hnr):
675         """ Extract housenumber information from a list of normalised
676             housenumbers.
677         """
678         if token:
679             self.housenumbers.add(hnr)
680             self.housenumber_tokens.add(token)
681
682
683     def add_street(self, tokens):
684         """ Add addr:street match terms.
685         """
686         self.street_tokens.update(tokens)
687
688
689     def add_place(self, tokens):
690         """ Add addr:place search and match terms.
691         """
692         self.place_tokens.update(tokens)
693
694
695     def add_address_term(self, key, partials):
696         """ Add additional address terms.
697         """
698         if partials:
699             self.address_tokens[key] = self._mk_array(partials)
700
701
702 class _TokenCache:
703     """ Cache for token information to avoid repeated database queries.
704
705         This cache is not thread-safe and needs to be instantiated per
706         analyzer.
707     """
708     def __init__(self):
709         self.names = {}
710         self.partials = {}
711         self.fulls = {}
712         self.postcodes = set()
713         self.housenumbers = {}