"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import itertools
import logging
import re
from textwrap import dedent
from pathlib import Path

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.utils import CopyBuffer
from nominatim.db.sql_preprocessor import SQLPreprocessor
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
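

# Keys under which the tokenizer configuration is persisted as database
# properties (written by _save_config() and read back by init_from_project()).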
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"


LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.naming_rules = None
        self.term_normalization = None
        self.max_word_frequency = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'

        loader = ICURuleLoader(cfgfile)
        self.naming_rules = ICUNameProcessorRules(loader=loader)
        self.term_normalization = config.TERM_NORMALIZATION
        self.max_word_frequency = config.MAX_WORD_FREQUENCY

        self._install_php(config.lib_dir.php)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.naming_rules = ICUNameProcessorRules(conn=conn)
            self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
            self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.naming_rules is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

                with tokenizer.name_analyzer() as analyzer:
                    analyzer.add_country_names(...)

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
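

    # The PHP stub written here only defines the constants needed by the PHP
    # frontend (word frequency limit, normalization and transliteration rules);
    # the actual query-time logic is pulled in from legacy_icu_tokenizer.php.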
    # pylint: disable=missing-format-attribute
    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
            @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
            @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
            require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, phpdir)))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.naming_rules.save_rules(conn)

            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            name_proc = ICUNameProcessor(self.naming_rules)
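            # Use a named (server-side) cursor so that the potentially large
            # result set of the frequency query is streamed instead of being
            # loaded into memory at once.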
            with conn.cursor(name="words") as cur:
                cur.execute(""" SELECT v, count(*) FROM
                                  (SELECT svals(name) as v FROM place)x
                                WHERE length(v) < 75 GROUP BY v""")

                for name, cnt in cur:
                    terms = set()
                    for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
                        if ' ' in word:
                            terms.update(word.split())
                    for term in terms:
                        words[term] += cnt

            # copy them back into the word table
            with CopyBuffer() as copystr:
                for args in words.items():
                    copystr.add(*args)

                with conn.cursor() as cur:
                    copystr.copy_out(cur, 'word',
                                     columns=['word_token', 'search_name_count'])
                    cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                                   WHERE word_id is null""")

            conn.commit()


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, name_proc):
        self.conn = connect(dsn).connection
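        # Autocommit mode: every statement issued by the analyzer is committed
        # immediately, so no explicit transaction handling is needed here.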
        self.conn.autocommit = True
        self.name_processor = name_proc

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def get_word_token_info(self, words):
        """ Return token information for the given list of words.
            If a word starts with # it is assumed to be a full name,
            otherwise it is taken to be a partial name.

            The function returns a list of tuples with
            (original word, word token, word id).

            The function is used for testing and debugging only
            and not necessarily efficient.
        """
        tokens = {}
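        # The lookup assumes that full names carry a leading blank in their
        # word_token, while partial words are looked up as-is.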
        for word in words:
            if word.startswith('#'):
                tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
            else:
                tokens[word] = self.name_processor.get_search_normalized(word)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token, word_id
                           FROM word, (SELECT unnest(%s::TEXT[]) as term) t
                           WHERE word_token = t.term
                                 and class is null and country_code is null""",
                        (list(tokens.values()), ))
            ids = {r[0]: r[1] for r in cur}

        return [(k, v, ids.get(v, None)) for k, v in tokens.items()]


    @staticmethod
    def normalize_postcode(postcode):
        """ Convert the postcode to a standardized form.

            This function must yield exactly the same result as the SQL function
            'token_normalized_postcode()'.
        """
        return postcode.strip().upper()


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        return self.name_processor.get_search_normalized(hnr)


    def update_postcodes_from_db(self):
        """ Update postcode tokens in the word table from the location_postcode
            table.
        """
        to_delete = []
        with self.conn.cursor() as cur:
            # This finds us the rows in location_postcode and word that are
            # missing in the other table.
            cur.execute("""SELECT * FROM
                            (SELECT pc, word FROM
                              (SELECT distinct(postcode) as pc FROM location_postcode) p
                              FULL JOIN
                              (SELECT info->>'postcode' as word FROM word
                                WHERE type = 'P') w
                              ON pc = word) x
                           WHERE pc is null or word is null""")
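
            # Postcodes missing from the word table are added via the copy
            # buffer below; entries no longer present in location_postcode are
            # collected in to_delete and removed.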
            with CopyBuffer() as copystr:
                for postcode, word in cur:
                    if postcode is None:
                        to_delete.append(word)
                    else:
                        copystr.add(self.name_processor.get_search_normalized(postcode),
                                    'P', {'postcode': postcode})

                if to_delete:
                    cur.execute("""DELETE FROM WORD
                                   WHERE type = 'P' and info->>'postcode' = any(%s)
                                """, (to_delete, ))

                copystr.copy_out(cur, 'word',
                                 columns=['word_token', 'type', 'info'])


    def update_special_phrases(self, phrases, should_replace):
        """ Replace the search index for special phrases with the new phrases.
            If `should_replace` is True, then the previous set of phrases is
            completely replaced. Otherwise the phrases are added to the
            already existing ones.
        """
        norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("SELECT info FROM word WHERE type = 'S'")
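            # A missing operator is normalised to '-' so that the tuples can
            # be compared as sets against the incoming phrases.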
            for (info, ) in cur:
                existing_phrases.add((info['word'], info['class'], info['type'],
                                      info.get('op') or '-'))

            added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
            if should_replace:
                deleted = self._remove_special_phrases(cur, norm_phrases,
                                                       existing_phrases)
            else:
                deleted = 0

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), added, deleted)


    def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
        """ Add all phrases to the database that are not yet there.
        """
        to_add = new_phrases - existing_phrases

        added = 0
        with CopyBuffer() as copystr:
            for word, cls, typ, oper in to_add:
                term = self.name_processor.get_search_normalized(word)
                if term:
                    copystr.add(term, 'S',
                                {'word': word, 'class': cls, 'type': typ,
                                 'op': oper if oper in ('in', 'near') else None})
                    added += 1

            copystr.copy_out(cursor, 'word',
                             columns=['word_token', 'type', 'info'])

        return added


    @staticmethod
    def _remove_special_phrases(cursor, new_phrases, existing_phrases):
        """ Remove all phrases from the database that are no longer in the
            new phrase set.
        """
        to_delete = existing_phrases - new_phrases

        if to_delete:
            cursor.execute_values(
                """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                    WHERE info->>'word' = name
                          and info->>'class' = in_class and info->>'type' = in_type
                          and ((op = '-' and info->>'op' is null) or op = info->>'op')
                """, to_delete)

        return len(to_delete)


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        word_tokens = set()
        for name in self._compute_full_names(names):
            norm_name = self.name_processor.get_search_normalized(name)
            if norm_name:
                word_tokens.add(norm_name)

        with self.conn.cursor() as cur:
            cur.execute("""SELECT word_token FROM word
                            WHERE type = 'C' and info->>'cc'= %s""",
                        (country_code, ))
            word_tokens.difference_update((t[0] for t in cur))

            # Only add those names that are not yet in the list.
            if word_tokens:
                cur.execute("""INSERT INTO word (word_token, type, info)
                               (SELECT token, 'C', json_build_object('cc', %s)
                                FROM unnest(%s) as token)
                            """, (country_code, list(word_tokens)))

            # No names are deleted at the moment.
            # If deletion is made possible, then the static names from the
            # initial 'country_name' table should be kept.


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            fulls, partials = self._compute_name_tokens(names)

            token_info.add_names(fulls, partials)
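
            # Country features additionally register their names as country
            # names, provided they carry a valid two-letter country code.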
            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self.add_country_names(country_feature.lower(), names)

        address = place.get('address')
        if address:
            self._process_place_address(token_info, address)

        return token_info.data


    def _process_place_address(self, token_info, address):
        hnrs = []
        addr_terms = []
        for key, value in address.items():
            if key == 'postcode':
                self._add_postcode(value)
            elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                hnrs.append(value)
            elif key == 'street':
                token_info.add_street(*self._compute_name_tokens({'name': value}))
            elif key == 'place':
                token_info.add_place(*self._compute_name_tokens({'name': value}))
            elif not key.startswith('_') and key not in ('country', 'full'):
                addr_terms.append((key, *self._compute_name_tokens({'name': value})))

        if hnrs:
            hnrs = self._split_housenumbers(hnrs)
            token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

        if addr_terms:
            token_info.add_address_terms(addr_terms)


    def _compute_name_tokens(self, names):
        """ Computes the full name and partial name tokens for the given
            dictionary of names.
        """
        full_names = self._compute_full_names(names)
        full_tokens = set()
        partial_tokens = set()

        for name in full_names:
            norm_name = self.name_processor.get_normalized(name)
            full, part = self._cache.names.get(norm_name, (None, None))
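            # Tokens are cached per normalised name so that the
            # getorcreate_full_word() database function is only called once
            # for each distinct name.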
            if full is None:
                variants = self.name_processor.get_variants_ascii(norm_name)

                with self.conn.cursor() as cur:
                    cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
                                (norm_name, variants))
                    full, part = cur.fetchone()

                self._cache.names[norm_name] = (full, part)

            full_tokens.add(full)
            partial_tokens.update(part)

        return full_tokens, partial_tokens


    @staticmethod
    def _compute_full_names(names):
        """ Return the set of all full names to be used with the
            given dictionary of names.
        """
        full_names = set()
        for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
            if name:
                full_names.add(name)

                brace_idx = name.find('(')
                if brace_idx >= 0:
                    full_names.add(name[:brace_idx].strip())

        return full_names


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None:
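            # Values containing list separators cannot be a single valid
            # postcode and are silently ignored.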
            postcode = self.normalize_postcode(postcode)

            if postcode not in self._cache.postcodes:
                term = self.name_processor.get_search_normalized(postcode)
                if not term:
                    return

                with self.conn.cursor() as cur:
                    # no word_id needed for postcodes
                    cur.execute("""INSERT INTO word (word_token, type, info)
                                   (SELECT %s, 'P', json_build_object('postcode', pc)
                                    FROM (VALUES (%s)) as v(pc)
                                    WHERE NOT EXISTS
                                     (SELECT * FROM word
                                      WHERE type = 'P' and info->>'postcode' = pc))
                                """, (term, postcode))
                self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs


class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        self._cache = cache
        self.data = {}

    @staticmethod
    def _mk_array(tokens):
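        # Build a PostgreSQL array literal (e.g. '{1,2,3}') from the tokens.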
        return '{%s}' % ','.join((str(s) for s in tokens))


    def add_names(self, fulls, partials):
        """ Adds token information for the normalised names.
        """
        self.data['names'] = self._mk_array(itertools.chain(fulls, partials))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, fulls, _):
        """ Add addr:street match terms.
        """
        if fulls:
            self.data['street'] = self._mk_array(fulls)


    def add_place(self, fulls, partials):
        """ Add addr:place search and match terms.
        """
        if fulls:
            self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
            self.data['place_match'] = self._mk_array(fulls)


    def add_address_terms(self, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, fulls, partials in terms:
            if fulls:
                tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
                               self._mk_array(fulls)]

        if tokens:
            self.data['addr'] = tokens


class _TokenCache:
    """ Cache for token information to avoid repeated database queries.

        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        self.names = {}
        self.postcodes = set()
        self.housenumbers = {}


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary. `terms` is an iterable of normalized
            housenumbers.
        """
        tokens = []
        askdb = []
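
        # Serve tokens from the cache first and collect the housenumbers that
        # still need to be resolved in a single database round trip.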
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens