nominatim/tokenizer/icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 import itertools
   6 import json
   7 import logging
   8 import re
   9 from textwrap import dedent
  10
  11 from nominatim.db.connection import connect
  12 from nominatim.db.properties import set_property, get_property
  13 from nominatim.db.utils import CopyBuffer
  14 from nominatim.db.sql_preprocessor import SQLPreprocessor
  15 from nominatim.indexer.place_info import PlaceInfo
  16 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  17 from nominatim.tokenizer.base import AbstractAnalyzer, AbstractTokenizer
  18
# Key of the database property under which the tokenizer saves (and later
# reloads) its term normalization setting.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() is called without a name, so this is the root logger.
LOG = logging.getLogger()
  22
  23 def create(dsn, data_dir):
  24     """ Create a new instance of the tokenizer provided by this module.
  25     """
  26     return LegacyICUTokenizer(dsn, data_dir)
  27
  28
  29 class LegacyICUTokenizer(AbstractTokenizer):
  30     """ This tokenizer uses libICU to covert names and queries to ASCII.
  31         Otherwise it uses the same algorithms and data structures as the
  32         normalization routines in Nominatim 3.
  33     """
  34
  35     def __init__(self, dsn, data_dir):
  36         self.dsn = dsn
  37         self.data_dir = data_dir
  38         self.loader = None
  39         self.term_normalization = None
  40
  41
  42     def init_new_db(self, config, init_db=True):
  43         """ Set up a new tokenizer for the database.
  44
  45             This copies all necessary data in the project directory to make
  46             sure the tokenizer remains stable even over updates.
  47         """
  48         self.loader = ICURuleLoader(config)
  49
  50         self.term_normalization = config.TERM_NORMALIZATION
  51
  52         self._install_php(config.lib_dir.php)
  53         self._save_config()
  54
  55         if init_db:
  56             self.update_sql_functions(config)
  57             self._init_db_tables(config)
  58
  59
  60     def init_from_project(self, config):
  61         """ Initialise the tokenizer from the project directory.
  62         """
  63         self.loader = ICURuleLoader(config)
  64
  65         with connect(self.dsn) as conn:
  66             self.loader.load_config_from_db(conn)
  67             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  68
  69
  70     def finalize_import(self, config):
  71         """ Do any required postprocessing to make the tokenizer data ready
  72             for use.
  73         """
  74         with connect(self.dsn) as conn:
  75             sqlp = SQLPreprocessor(conn, config)
  76             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  77
  78
  79     def update_sql_functions(self, config):
  80         """ Reimport the SQL functions for this tokenizer.
  81         """
  82         with connect(self.dsn) as conn:
  83             sqlp = SQLPreprocessor(conn, config)
  84             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')
  85
  86
  87     def check_database(self, config):
  88         """ Check that the tokenizer is set up correctly.
  89         """
  90         self.init_from_project(config)
  91
  92         if self.term_normalization is None:
  93             return "Configuration for tokenizer 'icu' are missing."
  94
  95         return None
  96
  97
  98     def update_statistics(self):
  99         """ Recompute frequencies for all name words.
 100         """
 101         with connect(self.dsn) as conn:
 102             with conn.cursor() as cur:
 103                 cur.drop_table("word_frequencies")
 104                 LOG.info("Computing word frequencies")
 105                 cur.execute("""CREATE TEMP TABLE word_frequencies AS
 106                                  SELECT unnest(name_vector) as id, count(*)
 107                                  FROM search_name GROUP BY id""")
 108                 cur.execute("CREATE INDEX ON word_frequencies(id)")
 109                 LOG.info("Update word table with recomputed frequencies")
 110                 cur.execute("""UPDATE word
 111                                SET info = info || jsonb_build_object('count', count)
 112                                FROM word_frequencies WHERE word_id = id""")
 113                 cur.drop_table("word_frequencies")
 114             conn.commit()
 115
 116
 117     def name_analyzer(self):
 118         """ Create a new analyzer for tokenizing names and queries
 119             using this tokinzer. Analyzers are context managers and should
 120             be used accordingly:
 121
 122             ```
 123             with tokenizer.name_analyzer() as analyzer:
 124                 analyser.tokenize()
 125             ```
 126
 127             When used outside the with construct, the caller must ensure to
 128             call the close() function before destructing the analyzer.
 129
 130             Analyzers are not thread-safe. You need to instantiate one per thread.
 131         """
 132         return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
 133                                      self.loader.make_token_analysis())
 134
 135
 136     def _install_php(self, phpdir):
 137         """ Install the php script for the tokenizer.
 138         """
 139         php_file = self.data_dir / "tokenizer.php"
 140         php_file.write_text(dedent(f"""\
 141             <?php
 142             @define('CONST_Max_Word_Frequency', 10000000);
 143             @define('CONST_Term_Normalization_Rules', "{self.term_normalization}");
 144             @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
 145             require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))
 146
 147
 148     def _save_config(self):
 149         """ Save the configuration that needs to remain stable for the given
 150             database as database properties.
 151         """
 152         with connect(self.dsn) as conn:
 153             self.loader.save_config_to_db(conn)
 154             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 155
 156
 157     def _init_db_tables(self, config):
 158         """ Set up the word table and fill it with pre-computed word
 159             frequencies.
 160         """
 161         with connect(self.dsn) as conn:
 162             sqlp = SQLPreprocessor(conn, config)
 163             sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
 164             conn.commit()
 165
 166
 167 class LegacyICUNameAnalyzer(AbstractAnalyzer):
 168     """ The legacy analyzer uses the ICU library for splitting names.
 169
 170         Each instance opens a connection to the database to request the
 171         normalization.
 172     """
 173
 174     def __init__(self, dsn, sanitizer, token_analysis):
 175         self.conn = connect(dsn).connection
 176         self.conn.autocommit = True
 177         self.sanitizer = sanitizer
 178         self.token_analysis = token_analysis
 179
 180         self._cache = _TokenCache()
 181
 182
 183     def close(self):
 184         """ Free all resources used by the analyzer.
 185         """
 186         if self.conn:
 187             self.conn.close()
 188             self.conn = None
 189
 190
 191     def _search_normalized(self, name):
 192         """ Return the search token transliteration of the given name.
 193         """
 194         return self.token_analysis.search.transliterate(name).strip()
 195
 196
 197     def _normalized(self, name):
 198         """ Return the normalized version of the given name with all
 199             non-relevant information removed.
 200         """
 201         return self.token_analysis.normalizer.transliterate(name).strip()
 202
 203
 204     def get_word_token_info(self, words):
 205         """ Return token information for the given list of words.
 206             If a word starts with # it is assumed to be a full name
 207             otherwise is a partial name.
 208
 209             The function returns a list of tuples with
 210             (original word, word token, word id).
 211
 212             The function is used for testing and debugging only
 213             and not necessarily efficient.
 214         """
 215         full_tokens = {}
 216         partial_tokens = {}
 217         for word in words:
 218             if word.startswith('#'):
 219                 full_tokens[word] = self._search_normalized(word[1:])
 220             else:
 221                 partial_tokens[word] = self._search_normalized(word)
 222
 223         with self.conn.cursor() as cur:
 224             cur.execute("""SELECT word_token, word_id
 225                             FROM word WHERE word_token = ANY(%s) and type = 'W'
 226                         """, (list(full_tokens.values()),))
 227             full_ids = {r[0]: r[1] for r in cur}
 228             cur.execute("""SELECT word_token, word_id
 229                             FROM word WHERE word_token = ANY(%s) and type = 'w'""",
 230                         (list(partial_tokens.values()),))
 231             part_ids = {r[0]: r[1] for r in cur}
 232
 233         return [(k, v, full_ids.get(v, None)) for k, v in full_tokens.items()] \
 234                + [(k, v, part_ids.get(v, None)) for k, v in partial_tokens.items()]
 235
 236
 237     @staticmethod
 238     def normalize_postcode(postcode):
 239         """ Convert the postcode to a standardized form.
 240
 241             This function must yield exactly the same result as the SQL function
 242             'token_normalized_postcode()'.
 243         """
 244         return postcode.strip().upper()
 245
 246
 247     def _make_standard_hnr(self, hnr):
 248         """ Create a normalised version of a housenumber.
 249
 250             This function takes minor shortcuts on transliteration.
 251         """
 252         return self._search_normalized(hnr)
 253
 254     def update_postcodes_from_db(self):
 255         """ Update postcode tokens in the word table from the location_postcode
 256             table.
 257         """
 258         to_delete = []
 259         with self.conn.cursor() as cur:
 260             # This finds us the rows in location_postcode and word that are
 261             # missing in the other table.
 262             cur.execute("""SELECT * FROM
 263                             (SELECT pc, word FROM
 264                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 265                               FULL JOIN
 266                               (SELECT word FROM word WHERE type = 'P') w
 267                               ON pc = word) x
 268                            WHERE pc is null or word is null""")
 269
 270             with CopyBuffer() as copystr:
 271                 for postcode, word in cur:
 272                     if postcode is None:
 273                         to_delete.append(word)
 274                     else:
 275                         copystr.add(self._search_normalized(postcode),
 276                                     'P', postcode)
 277
 278                 if to_delete:
 279                     cur.execute("""DELETE FROM WORD
 280                                    WHERE type ='P' and word = any(%s)
 281                                 """, (to_delete, ))
 282
 283                 copystr.copy_out(cur, 'word',
 284                                  columns=['word_token', 'type', 'word'])
 285
 286
 287     def update_special_phrases(self, phrases, should_replace):
 288         """ Replace the search index for special phrases with the new phrases.
 289             If `should_replace` is True, then the previous set of will be
 290             completely replaced. Otherwise the phrases are added to the
 291             already existing ones.
 292         """
 293         norm_phrases = set(((self._normalized(p[0]), p[1], p[2], p[3])
 294                             for p in phrases))
 295
 296         with self.conn.cursor() as cur:
 297             # Get the old phrases.
 298             existing_phrases = set()
 299             cur.execute("SELECT word, info FROM word WHERE type = 'S'")
 300             for word, info in cur:
 301                 existing_phrases.add((word, info['class'], info['type'],
 302                                       info.get('op') or '-'))
 303
 304             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 305             if should_replace:
 306                 deleted = self._remove_special_phrases(cur, norm_phrases,
 307                                                        existing_phrases)
 308             else:
 309                 deleted = 0
 310
 311         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 312                  len(norm_phrases), added, deleted)
 313
 314
 315     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 316         """ Add all phrases to the database that are not yet there.
 317         """
 318         to_add = new_phrases - existing_phrases
 319
 320         added = 0
 321         with CopyBuffer() as copystr:
 322             for word, cls, typ, oper in to_add:
 323                 term = self._search_normalized(word)
 324                 if term:
 325                     copystr.add(term, 'S', word,
 326                                 json.dumps({'class': cls, 'type': typ,
 327                                             'op': oper if oper in ('in', 'near') else None}))
 328                     added += 1
 329
 330             copystr.copy_out(cursor, 'word',
 331                              columns=['word_token', 'type', 'word', 'info'])
 332
 333         return added
 334
 335
 336     @staticmethod
 337     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 338         """ Remove all phrases from the databse that are no longer in the
 339             new phrase list.
 340         """
 341         to_delete = existing_phrases - new_phrases
 342
 343         if to_delete:
 344             cursor.execute_values(
 345                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 346                     WHERE type = 'S' and word = name
 347                           and info->>'class' = in_class and info->>'type' = in_type
 348                           and ((op = '-' and info->>'op' is null) or op = info->>'op')
 349                 """, to_delete)
 350
 351         return len(to_delete)
 352
 353
 354     def add_country_names(self, country_code, names):
 355         """ Add names for the given country to the search index.
 356         """
 357         # Make sure any name preprocessing for country names applies.
 358         info = PlaceInfo({'name': names, 'country_code': country_code,
 359                           'rank_address': 4, 'class': 'boundary',
 360                           'type': 'administrative'})
 361         self._add_country_full_names(country_code,
 362                                      self.sanitizer.process_names(info)[0])
 363
 364
 365     def _add_country_full_names(self, country_code, names):
 366         """ Add names for the given country from an already sanitized
 367             name list.
 368         """
 369         word_tokens = set()
 370         for name in names:
 371             norm_name = self._search_normalized(name.name)
 372             if norm_name:
 373                 word_tokens.add(norm_name)
 374
 375         with self.conn.cursor() as cur:
 376             # Get existing names
 377             cur.execute("""SELECT word_token FROM word
 378                             WHERE type = 'C' and word = %s""",
 379                         (country_code, ))
 380             word_tokens.difference_update((t[0] for t in cur))
 381
 382             # Only add those names that are not yet in the list.
 383             if word_tokens:
 384                 cur.execute("""INSERT INTO word (word_token, type, word)
 385                                (SELECT token, 'C', %s
 386                                 FROM unnest(%s) as token)
 387                             """, (country_code, list(word_tokens)))
 388
 389             # No names are deleted at the moment.
 390             # If deletion is made possible, then the static names from the
 391             # initial 'country_name' table should be kept.
 392
 393
 394     def process_place(self, place):
 395         """ Determine tokenizer information about the given place.
 396
 397             Returns a JSON-serializable structure that will be handed into
 398             the database via the token_info field.
 399         """
 400         token_info = _TokenInfo(self._cache)
 401
 402         names, address = self.sanitizer.process_names(place)
 403
 404         if names:
 405             fulls, partials = self._compute_name_tokens(names)
 406
 407             token_info.add_names(fulls, partials)
 408
 409             if place.is_country():
 410                 self._add_country_full_names(place.country_code, names)
 411
 412         if address:
 413             self._process_place_address(token_info, address)
 414
 415         return token_info.data
 416
 417
 418     def _process_place_address(self, token_info, address):
 419         hnrs = []
 420         addr_terms = []
 421         for item in address:
 422             if item.kind == 'postcode':
 423                 self._add_postcode(item.name)
 424             elif item.kind in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 425                 hnrs.append(item.name)
 426             elif item.kind == 'street':
 427                 token_info.add_street(self._compute_partial_tokens(item.name))
 428             elif item.kind == 'place':
 429                 token_info.add_place(self._compute_partial_tokens(item.name))
 430             elif not item.kind.startswith('_') and \
 431                  item.kind not in ('country', 'full'):
 432                 addr_terms.append((item.kind, self._compute_partial_tokens(item.name)))
 433
 434         if hnrs:
 435             hnrs = self._split_housenumbers(hnrs)
 436             token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 437
 438         if addr_terms:
 439             token_info.add_address_terms(addr_terms)
 440
 441
 442     def _compute_partial_tokens(self, name):
 443         """ Normalize the given term, split it into partial words and return
 444             then token list for them.
 445         """
 446         norm_name = self._search_normalized(name)
 447
 448         tokens = []
 449         need_lookup = []
 450         for partial in norm_name.split():
 451             token = self._cache.partials.get(partial)
 452             if token:
 453                 tokens.append(token)
 454             else:
 455                 need_lookup.append(partial)
 456
 457         if need_lookup:
 458             with self.conn.cursor() as cur:
 459                 cur.execute("""SELECT word, getorcreate_partial_word(word)
 460                                FROM unnest(%s) word""",
 461                             (need_lookup, ))
 462
 463                 for partial, token in cur:
 464                     tokens.append(token)
 465                     self._cache.partials[partial] = token
 466
 467         return tokens
 468
 469
 470     def _compute_name_tokens(self, names):
 471         """ Computes the full name and partial name tokens for the given
 472             dictionary of names.
 473         """
 474         full_tokens = set()
 475         partial_tokens = set()
 476
 477         for name in names:
 478             analyzer_id = name.get_attr('analyzer')
 479             norm_name = self._normalized(name.name)
 480             if analyzer_id is None:
 481                 token_id = norm_name
 482             else:
 483                 token_id = f'{norm_name}@{analyzer_id}'
 484
 485             full, part = self._cache.names.get(token_id, (None, None))
 486             if full is None:
 487                 variants = self.token_analysis.analysis[analyzer_id].get_variants_ascii(norm_name)
 488                 if not variants:
 489                     continue
 490
 491                 with self.conn.cursor() as cur:
 492                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 493                                 (token_id, variants))
 494                     full, part = cur.fetchone()
 495
 496                 self._cache.names[token_id] = (full, part)
 497
 498             full_tokens.add(full)
 499             partial_tokens.update(part)
 500
 501         return full_tokens, partial_tokens
 502
 503
 504     def _add_postcode(self, postcode):
 505         """ Make sure the normalized postcode is present in the word table.
 506         """
 507         if re.search(r'[:,;]', postcode) is None:
 508             postcode = self.normalize_postcode(postcode)
 509
 510             if postcode not in self._cache.postcodes:
 511                 term = self._search_normalized(postcode)
 512                 if not term:
 513                     return
 514
 515                 with self.conn.cursor() as cur:
 516                     # no word_id needed for postcodes
 517                     cur.execute("""INSERT INTO word (word_token, type, word)
 518                                    (SELECT %s, 'P', pc FROM (VALUES (%s)) as v(pc)
 519                                     WHERE NOT EXISTS
 520                                      (SELECT * FROM word
 521                                       WHERE type = 'P' and word = pc))
 522                                 """, (term, postcode))
 523                 self._cache.postcodes.add(postcode)
 524
 525
 526     @staticmethod
 527     def _split_housenumbers(hnrs):
 528         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 529             # split numbers if necessary
 530             simple_list = []
 531             for hnr in hnrs:
 532                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 533
 534             if len(simple_list) > 1:
 535                 hnrs = list(set(simple_list))
 536             else:
 537                 hnrs = simple_list
 538
 539         return hnrs
 540
 541
 542
 543
 544 class _TokenInfo:
 545     """ Collect token information to be sent back to the database.
 546     """
 547     def __init__(self, cache):
 548         self._cache = cache
 549         self.data = {}
 550
 551     @staticmethod
 552     def _mk_array(tokens):
 553         return '{%s}' % ','.join((str(s) for s in tokens))
 554
 555
 556     def add_names(self, fulls, partials):
 557         """ Adds token information for the normalised names.
 558         """
 559         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 560
 561
 562     def add_housenumbers(self, conn, hnrs):
 563         """ Extract housenumber information from a list of normalised
 564             housenumbers.
 565         """
 566         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 567         self.data['hnr'] = ';'.join(hnrs)
 568
 569
 570     def add_street(self, tokens):
 571         """ Add addr:street match terms.
 572         """
 573         if tokens:
 574             self.data['street'] = self._mk_array(tokens)
 575
 576
 577     def add_place(self, tokens):
 578         """ Add addr:place search and match terms.
 579         """
 580         if tokens:
 581             self.data['place'] = self._mk_array(tokens)
 582
 583
 584     def add_address_terms(self, terms):
 585         """ Add additional address terms.
 586         """
 587         tokens = {key: self._mk_array(partials)
 588                   for key, partials in terms if partials}
 589
 590         if tokens:
 591             self.data['addr'] = tokens
 592
 593
 594 class _TokenCache:
 595     """ Cache for token information to avoid repeated database queries.
 596
 597         This cache is not thread-safe and needs to be instantiated per
 598         analyzer.
 599     """
 600     def __init__(self):
 601         self.names = {}
 602         self.partials = {}
 603         self.postcodes = set()
 604         self.housenumbers = {}
 605
 606
 607     def get_hnr_tokens(self, conn, terms):
 608         """ Get token ids for a list of housenumbers, looking them up in the
 609             database if necessary. `terms` is an iterable of normalized
 610             housenumbers.
 611         """
 612         tokens = []
 613         askdb = []
 614
 615         for term in terms:
 616             token = self.housenumbers.get(term)
 617             if token is None:
 618                 askdb.append(term)
 619             else:
 620                 tokens.append(token)
 621
 622         if askdb:
 623             with conn.cursor() as cur:
 624                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 625                             (askdb, ))
 626                 for term, tid in cur:
 627                     self.housenumbers[term] = tid
 628                     tokens.append(tid)
 629
 630         return tokens