nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import itertools
   7 import logging
   8 import re
   9 from textwrap import dedent
  10 from pathlib import Path
  11
  12 import psycopg2.extras
  13
  14 from nominatim.db.connection import connect
  15 from nominatim.db.properties import set_property, get_property
  16 from nominatim.db.utils import CopyBuffer
  17 from nominatim.db.sql_preprocessor import SQLPreprocessor
  18 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  19 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  20
# Name of the database property under which the maximum word frequency
# is saved and from which it is read back.
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
# Name of the database property under which the term normalization rules
# are saved and from which they are read back.
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  25
  26 def create(dsn, data_dir):
  27     """ Create a new instance of the tokenizer provided by this module.
  28     """
  29     return LegacyICUTokenizer(dsn, data_dir)
  30
  31
  32 class LegacyICUTokenizer:
  33     """ This tokenizer uses libICU to covert names and queries to ASCII.
  34         Otherwise it uses the same algorithms and data structures as the
  35         normalization routines in Nominatim 3.
  36     """
  37
  38     def __init__(self, dsn, data_dir):
  39         self.dsn = dsn
  40         self.data_dir = data_dir
  41         self.naming_rules = None
  42         self.term_normalization = None
  43         self.max_word_frequency = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         if config.TOKENIZER_CONFIG:
  53             cfgfile = Path(config.TOKENIZER_CONFIG)
  54         else:
  55             cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
  56
  57         loader = ICURuleLoader(cfgfile)
  58         self.naming_rules = ICUNameProcessorRules(loader=loader)
  59         self.term_normalization = config.TERM_NORMALIZATION
  60         self.max_word_frequency = config.MAX_WORD_FREQUENCY
  61
  62         self._install_php(config.lib_dir.php)
  63         self._save_config(config)
  64
  65         if init_db:
  66             self.update_sql_functions(config)
  67             self._init_db_tables(config)
  68
  69
  70     def init_from_project(self):
  71         """ Initialise the tokenizer from the project directory.
  72         """
  73         with connect(self.dsn) as conn:
  74             self.naming_rules = ICUNameProcessorRules(conn=conn)
  75             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  76             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  77
  78
  79     def finalize_import(self, config):
  80         """ Do any required postprocessing to make the tokenizer data ready
  81             for use.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  86
  87
  88     def update_sql_functions(self, config):
  89         """ Reimport the SQL functions for this tokenizer.
  90         """
  91         with connect(self.dsn) as conn:
  92             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  93             sqlp = SQLPreprocessor(conn, config)
  94             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  95                               max_word_freq=max_word_freq)
  96
  97
  98     def check_database(self):
  99         """ Check that the tokenizer is set up correctly.
 100         """
 101         self.init_from_project()
 102
 103         if self.naming_rules is None:
 104             return "Configuration for tokenizer 'legacy_icu' are missing."
 105
 106         return None
 107
 108
 109     def name_analyzer(self):
 110         """ Create a new analyzer for tokenizing names and queries
 111             using this tokinzer. Analyzers are context managers and should
 112             be used accordingly:
 113
 114             ```
 115             with tokenizer.name_analyzer() as analyzer:
 116                 analyser.tokenize()
 117             ```
 118
 119             When used outside the with construct, the caller must ensure to
 120             call the close() function before destructing the analyzer.
 121
 122             Analyzers are not thread-safe. You need to instantiate one per thread.
 123         """
 124         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 125
 126     # pylint: disable=missing-format-attribute
 127     def _install_php(self, phpdir):
 128         """ Install the php script for the tokenizer.
 129         """
 130         php_file = self.data_dir / "tokenizer.php"
 131         php_file.write_text(dedent("""\
 132             <?php
 133             @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
 134             @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
 135             @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
 136             require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
 137             """.format(self, phpdir)))
 138
 139
 140     def _save_config(self, config):
 141         """ Save the configuration that needs to remain stable for the given
 142             database as database properties.
 143         """
 144         with connect(self.dsn) as conn:
 145             self.naming_rules.save_rules(conn)
 146
 147             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 148             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 149
 150
 151     def _init_db_tables(self, config):
 152         """ Set up the word table and fill it with pre-computed word
 153             frequencies.
 154         """
 155         with connect(self.dsn) as conn:
 156             sqlp = SQLPreprocessor(conn, config)
 157             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 158             conn.commit()
 159
 160             LOG.warning("Precomputing word tokens")
 161
 162             # get partial words and their frequencies
 163             words = Counter()
 164             name_proc = ICUNameProcessor(self.naming_rules)
 165             with conn.cursor(name="words") as cur:
 166                 cur.execute(""" SELECT v, count(*) FROM
 167                                   (SELECT svals(name) as v FROM place)x
 168                                 WHERE length(v) < 75 GROUP BY v""")
 169
 170                 for name, cnt in cur:
 171                     terms = set()
 172                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
 173                         if ' ' in word:
 174                             terms.update(word.split())
 175                     for term in terms:
 176                         words[term] += cnt
 177
 178             # copy them back into the word table
 179             with CopyBuffer() as copystr:
 180                 for args in words.items():
 181                     copystr.add(*args)
 182
 183                 with conn.cursor() as cur:
 184                     copystr.copy_out(cur, 'word',
 185                                      columns=['word_token', 'search_name_count'])
 186                     cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 187                                    WHERE word_id is null""")
 188
 189             conn.commit()
 190
 191
 192 class LegacyICUNameAnalyzer:
 193     """ The legacy analyzer uses the ICU library for splitting names.
 194
 195         Each instance opens a connection to the database to request the
 196         normalization.
 197     """
 198
 199     def __init__(self, dsn, name_proc):
 200         self.conn = connect(dsn).connection
 201         self.conn.autocommit = True
 202         self.name_processor = name_proc
 203
 204         self._cache = _TokenCache()
 205
 206
 207     def __enter__(self):
 208         return self
 209
 210
 211     def __exit__(self, exc_type, exc_value, traceback):
 212         self.close()
 213
 214
 215     def close(self):
 216         """ Free all resources used by the analyzer.
 217         """
 218         if self.conn:
 219             self.conn.close()
 220             self.conn = None
 221
 222
 223     def get_word_token_info(self, words):
 224         """ Return token information for the given list of words.
 225             If a word starts with # it is assumed to be a full name
 226             otherwise is a partial name.
 227
 228             The function returns a list of tuples with
 229             (original word, word token, word id).
 230
 231             The function is used for testing and debugging only
 232             and not necessarily efficient.
 233         """
 234         tokens = {}
 235         for word in words:
 236             if word.startswith('#'):
 237                 tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
 238             else:
 239                 tokens[word] = self.name_processor.get_search_normalized(word)
 240
 241         with self.conn.cursor() as cur:
 242             cur.execute("""SELECT word_token, word_id
 243                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 244                            WHERE word_token = t.term
 245                                  and class is null and country_code is null""",
 246                         (list(tokens.values()), ))
 247             ids = {r[0]: r[1] for r in cur}
 248
 249         return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 250
 251
 252     @staticmethod
 253     def normalize_postcode(postcode):
 254         """ Convert the postcode to a standardized form.
 255
 256             This function must yield exactly the same result as the SQL function
 257             'token_normalized_postcode()'.
 258         """
 259         return postcode.strip().upper()
 260
 261
 262     def _make_standard_hnr(self, hnr):
 263         """ Create a normalised version of a housenumber.
 264
 265             This function takes minor shortcuts on transliteration.
 266         """
 267         return self.name_processor.get_search_normalized(hnr)
 268
 269     def update_postcodes_from_db(self):
 270         """ Update postcode tokens in the word table from the location_postcode
 271             table.
 272         """
 273         to_delete = []
 274         with self.conn.cursor() as cur:
 275             # This finds us the rows in location_postcode and word that are
 276             # missing in the other table.
 277             cur.execute("""SELECT * FROM
 278                             (SELECT pc, word FROM
 279                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 280                               FULL JOIN
 281                               (SELECT word FROM word
 282                                 WHERE class ='place' and type = 'postcode') w
 283                               ON pc = word) x
 284                            WHERE pc is null or word is null""")
 285
 286             with CopyBuffer() as copystr:
 287                 for postcode, word in cur:
 288                     if postcode is None:
 289                         to_delete.append(word)
 290                     else:
 291                         copystr.add(
 292                             postcode,
 293                             ' ' + self.name_processor.get_search_normalized(postcode),
 294                             'place', 'postcode', 0)
 295
 296                 if to_delete:
 297                     cur.execute("""DELETE FROM WORD
 298                                    WHERE class ='place' and type = 'postcode'
 299                                          and word = any(%s)
 300                                 """, (to_delete, ))
 301
 302                 copystr.copy_out(cur, 'word',
 303                                  columns=['word', 'word_token', 'class', 'type',
 304                                           'search_name_count'])
 305
 306
 307     def update_special_phrases(self, phrases, should_replace):
 308         """ Replace the search index for special phrases with the new phrases.
 309         """
 310         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
 311                             for p in phrases))
 312
 313         with self.conn.cursor() as cur:
 314             # Get the old phrases.
 315             existing_phrases = set()
 316             cur.execute("""SELECT word, class, type, operator FROM word
 317                            WHERE class != 'place'
 318                                  OR (type != 'house' AND type != 'postcode')""")
 319             for label, cls, typ, oper in cur:
 320                 existing_phrases.add((label, cls, typ, oper or '-'))
 321
 322             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 323             if should_replace:
 324                 deleted = self._remove_special_phrases(cur, norm_phrases,
 325                                                        existing_phrases)
 326             else:
 327                 deleted = 0
 328
 329         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 330                  len(norm_phrases), added, deleted)
 331
 332
 333     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 334         """ Add all phrases to the database that are not yet there.
 335         """
 336         to_add = new_phrases - existing_phrases
 337
 338         added = 0
 339         with CopyBuffer() as copystr:
 340             for word, cls, typ, oper in to_add:
 341                 term = self.name_processor.get_search_normalized(word)
 342                 if term:
 343                     copystr.add(word, ' ' + term, cls, typ,
 344                                 oper if oper in ('in', 'near')  else None, 0)
 345                     added += 1
 346
 347             copystr.copy_out(cursor, 'word',
 348                              columns=['word', 'word_token', 'class', 'type',
 349                                       'operator', 'search_name_count'])
 350
 351         return added
 352
 353
 354     @staticmethod
 355     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 356         """ Remove all phrases from the databse that are no longer in the
 357             new phrase list.
 358         """
 359         to_delete = existing_phrases - new_phrases
 360
 361         if to_delete:
 362             psycopg2.extras.execute_values(
 363                 cursor,
 364                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 365                     WHERE word = name and class = in_class and type = in_type
 366                           and ((op = '-' and operator is null) or op = operator)""",
 367                 to_delete)
 368
 369         return len(to_delete)
 370
 371
 372     def add_country_names(self, country_code, names):
 373         """ Add names for the given country to the search index.
 374         """
 375         word_tokens = set()
 376         for name in self._compute_full_names(names):
 377             if name:
 378                 word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
 379
 380         with self.conn.cursor() as cur:
 381             # Get existing names
 382             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 383                         (country_code, ))
 384             word_tokens.difference_update((t[0] for t in cur))
 385
 386             if word_tokens:
 387                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 388                                                  search_name_count)
 389                                (SELECT nextval('seq_word'), token, '{}', 0
 390                                 FROM unnest(%s) as token)
 391                             """.format(country_code), (list(word_tokens),))
 392
 393
 394     def process_place(self, place):
 395         """ Determine tokenizer information about the given place.
 396
 397             Returns a JSON-serialisable structure that will be handed into
 398             the database via the token_info field.
 399         """
 400         token_info = _TokenInfo(self._cache)
 401
 402         names = place.get('name')
 403
 404         if names:
 405             fulls, partials = self._compute_name_tokens(names)
 406
 407             token_info.add_names(fulls, partials)
 408
 409             country_feature = place.get('country_feature')
 410             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 411                 self.add_country_names(country_feature.lower(), names)
 412
 413         address = place.get('address')
 414
 415         if address:
 416             hnrs = []
 417             addr_terms = []
 418             for key, value in address.items():
 419                 if key == 'postcode':
 420                     self._add_postcode(value)
 421                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 422                     hnrs.append(value)
 423                 elif key == 'street':
 424                     token_info.add_street(*self._compute_name_tokens({'name': value}))
 425                 elif key == 'place':
 426                     token_info.add_place(*self._compute_name_tokens({'name': value}))
 427                 elif not key.startswith('_') and \
 428                      key not in ('country', 'full'):
 429                     addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 430
 431             if hnrs:
 432                 hnrs = self._split_housenumbers(hnrs)
 433                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 434
 435             if addr_terms:
 436                 token_info.add_address_terms(addr_terms)
 437
 438         return token_info.data
 439
 440
 441     def _compute_name_tokens(self, names):
 442         """ Computes the full name and partial name tokens for the given
 443             dictionary of names.
 444         """
 445         full_names = self._compute_full_names(names)
 446         full_tokens = set()
 447         partial_tokens = set()
 448
 449         for name in full_names:
 450             norm_name = self.name_processor.get_normalized(name)
 451             full, part = self._cache.names.get(norm_name, (None, None))
 452             if full is None:
 453                 variants = self.name_processor.get_variants_ascii(norm_name)
 454                 if not variants:
 455                     continue
 456
 457                 with self.conn.cursor() as cur:
 458                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 459                                 (norm_name, variants))
 460                     full, part = cur.fetchone()
 461
 462                 self._cache.names[norm_name] = (full, part)
 463
 464             full_tokens.add(full)
 465             partial_tokens.update(part)
 466
 467         return full_tokens, partial_tokens
 468
 469
 470     @staticmethod
 471     def _compute_full_names(names):
 472         """ Return the set of all full name word ids to be used with the
 473             given dictionary of names.
 474         """
 475         full_names = set()
 476         for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
 477             if name:
 478                 full_names.add(name)
 479
 480                 brace_idx = name.find('(')
 481                 if brace_idx >= 0:
 482                     full_names.add(name[:brace_idx].strip())
 483
 484         return full_names
 485
 486
 487     def _add_postcode(self, postcode):
 488         """ Make sure the normalized postcode is present in the word table.
 489         """
 490         if re.search(r'[:,;]', postcode) is None:
 491             postcode = self.normalize_postcode(postcode)
 492
 493             if postcode not in self._cache.postcodes:
 494                 term = self.name_processor.get_search_normalized(postcode)
 495                 if not term:
 496                     return
 497
 498                 with self.conn.cursor() as cur:
 499                     # no word_id needed for postcodes
 500                     cur.execute("""INSERT INTO word (word, word_token, class, type,
 501                                                      search_name_count)
 502                                    (SELECT pc, %s, 'place', 'postcode', 0
 503                                     FROM (VALUES (%s)) as v(pc)
 504                                     WHERE NOT EXISTS
 505                                      (SELECT * FROM word
 506                                       WHERE word = pc and class='place' and type='postcode'))
 507                                 """, (' ' + term, postcode))
 508                 self._cache.postcodes.add(postcode)
 509
 510
 511     @staticmethod
 512     def _split_housenumbers(hnrs):
 513         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 514             # split numbers if necessary
 515             simple_list = []
 516             for hnr in hnrs:
 517                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 518
 519             if len(simple_list) > 1:
 520                 hnrs = list(set(simple_list))
 521             else:
 522                 hnrs = simple_list
 523
 524         return hnrs
 525
 526
 527
 528
 529 class _TokenInfo:
 530     """ Collect token information to be sent back to the database.
 531     """
 532     def __init__(self, cache):
 533         self._cache = cache
 534         self.data = {}
 535
 536     @staticmethod
 537     def _mk_array(tokens):
 538         return '{%s}' % ','.join((str(s) for s in tokens))
 539
 540
 541     def add_names(self, fulls, partials):
 542         """ Adds token information for the normalised names.
 543         """
 544         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 545
 546
 547     def add_housenumbers(self, conn, hnrs):
 548         """ Extract housenumber information from a list of normalised
 549             housenumbers.
 550         """
 551         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 552         self.data['hnr'] = ';'.join(hnrs)
 553
 554
 555     def add_street(self, fulls, _):
 556         """ Add addr:street match terms.
 557         """
 558         if fulls:
 559             self.data['street'] = self._mk_array(fulls)
 560
 561
 562     def add_place(self, fulls, partials):
 563         """ Add addr:place search and match terms.
 564         """
 565         if fulls:
 566             self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
 567             self.data['place_match'] = self._mk_array(fulls)
 568
 569
 570     def add_address_terms(self, terms):
 571         """ Add additional address terms.
 572         """
 573         tokens = {}
 574
 575         for key, fulls, partials in terms:
 576             if fulls:
 577                 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
 578                                self._mk_array(fulls)]
 579
 580         if tokens:
 581             self.data['addr'] = tokens
 582
 583
 584 class _TokenCache:
 585     """ Cache for token information to avoid repeated database queries.
 586
 587         This cache is not thread-safe and needs to be instantiated per
 588         analyzer.
 589     """
 590     def __init__(self):
 591         self.names = {}
 592         self.postcodes = set()
 593         self.housenumbers = {}
 594
 595
 596     def get_hnr_tokens(self, conn, terms):
 597         """ Get token ids for a list of housenumbers, looking them up in the
 598             database if necessary.
 599         """
 600         tokens = []
 601         askdb = []
 602
 603         for term in terms:
 604             token = self.housenumbers.get(term)
 605             if token is None:
 606                 askdb.append(term)
 607             else:
 608                 tokens.append(token)
 609
 610         if askdb:
 611             with conn.cursor() as cur:
 612                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 613                             (askdb, ))
 614                 for term, tid in cur:
 615                     self.housenumbers[term] = tid
 616                     tokens.append(tid)
 617
 618         return tokens