nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import itertools
   7 import logging
   8 import re
   9 from textwrap import dedent
  10 from pathlib import Path
  11
  12 import psycopg2.extras
  13
  14 from nominatim.db.connection import connect
  15 from nominatim.db.properties import set_property, get_property
  16 from nominatim.db.utils import CopyBuffer
  17 from nominatim.db.sql_preprocessor import SQLPreprocessor
  18 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  19 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  20
# Keys of the database properties under which this tokenizer stores its
# configuration (see _save_config() and init_from_project()).
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# No logger name is given, so messages go to the root logger.
LOG = logging.getLogger()
  25
  26 def create(dsn, data_dir):
  27     """ Create a new instance of the tokenizer provided by this module.
  28     """
  29     return LegacyICUTokenizer(dsn, data_dir)
  30
  31
  32 class LegacyICUTokenizer:
  33     """ This tokenizer uses libICU to covert names and queries to ASCII.
  34         Otherwise it uses the same algorithms and data structures as the
  35         normalization routines in Nominatim 3.
  36     """
  37
  38     def __init__(self, dsn, data_dir):
  39         self.dsn = dsn
  40         self.data_dir = data_dir
  41         self.naming_rules = None
  42         self.term_normalization = None
  43         self.max_word_frequency = None
  44
  45
  46     def init_new_db(self, config, init_db=True):
  47         """ Set up a new tokenizer for the database.
  48
  49             This copies all necessary data in the project directory to make
  50             sure the tokenizer remains stable even over updates.
  51         """
  52         if config.TOKENIZER_CONFIG:
  53             cfgfile = Path(config.TOKENIZER_CONFIG)
  54         else:
  55             cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
  56
  57         loader = ICURuleLoader(cfgfile)
  58         self.naming_rules = ICUNameProcessorRules(loader=loader)
  59         self.term_normalization = config.TERM_NORMALIZATION
  60         self.max_word_frequency = config.MAX_WORD_FREQUENCY
  61
  62         self._install_php(config.lib_dir.php)
  63         self._save_config(config)
  64
  65         if init_db:
  66             self.update_sql_functions(config)
  67             self._init_db_tables(config)
  68
  69
  70     def init_from_project(self):
  71         """ Initialise the tokenizer from the project directory.
  72         """
  73         with connect(self.dsn) as conn:
  74             self.naming_rules = ICUNameProcessorRules(conn=conn)
  75             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  76             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  77
  78
  79     def finalize_import(self, config):
  80         """ Do any required postprocessing to make the tokenizer data ready
  81             for use.
  82         """
  83         with connect(self.dsn) as conn:
  84             sqlp = SQLPreprocessor(conn, config)
  85             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  86
  87
  88     def update_sql_functions(self, config):
  89         """ Reimport the SQL functions for this tokenizer.
  90         """
  91         with connect(self.dsn) as conn:
  92             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  93             sqlp = SQLPreprocessor(conn, config)
  94             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  95                               max_word_freq=max_word_freq)
  96
  97
  98     def check_database(self):
  99         """ Check that the tokenizer is set up correctly.
 100         """
 101         self.init_from_project()
 102
 103         if self.naming_rules is None:
 104             return "Configuration for tokenizer 'legacy_icu' are missing."
 105
 106         return None
 107
 108
 109     def name_analyzer(self):
 110         """ Create a new analyzer for tokenizing names and queries
 111             using this tokinzer. Analyzers are context managers and should
 112             be used accordingly:
 113
 114             ```
 115             with tokenizer.name_analyzer() as analyzer:
 116                 analyser.tokenize()
 117             ```
 118
 119             When used outside the with construct, the caller must ensure to
 120             call the close() function before destructing the analyzer.
 121
 122             Analyzers are not thread-safe. You need to instantiate one per thread.
 123         """
 124         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 125
 126     # pylint: disable=missing-format-attribute
 127     def _install_php(self, phpdir):
 128         """ Install the php script for the tokenizer.
 129         """
 130         php_file = self.data_dir / "tokenizer.php"
 131         php_file.write_text(dedent("""\
 132             <?php
 133             @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
 134             @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
 135             @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
 136             require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
 137             """.format(self, phpdir)))
 138
 139
 140     def _save_config(self, config):
 141         """ Save the configuration that needs to remain stable for the given
 142             database as database properties.
 143         """
 144         with connect(self.dsn) as conn:
 145             self.naming_rules.save_rules(conn)
 146
 147             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 148             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 149
 150
 151     def _init_db_tables(self, config):
 152         """ Set up the word table and fill it with pre-computed word
 153             frequencies.
 154         """
 155         with connect(self.dsn) as conn:
 156             sqlp = SQLPreprocessor(conn, config)
 157             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 158             conn.commit()
 159
 160             LOG.warning("Precomputing word tokens")
 161
 162             # get partial words and their frequencies
 163             words = Counter()
 164             name_proc = ICUNameProcessor(self.naming_rules)
 165             with conn.cursor(name="words") as cur:
 166                 cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 167
 168                 for name, cnt in cur:
 169                     terms = set()
 170                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
 171                         terms.update(word.split())
 172                     for term in terms:
 173                         words[term] += cnt
 174
 175             # copy them back into the word table
 176             with CopyBuffer() as copystr:
 177                 for args in words.items():
 178                     copystr.add(*args)
 179
 180                 with conn.cursor() as cur:
 181                     copystr.copy_out(cur, 'word',
 182                                      columns=['word_token', 'search_name_count'])
 183                     cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 184                                    WHERE word_id is null""")
 185
 186             conn.commit()
 187
 188
 189 class LegacyICUNameAnalyzer:
 190     """ The legacy analyzer uses the ICU library for splitting names.
 191
 192         Each instance opens a connection to the database to request the
 193         normalization.
 194     """
 195
 196     def __init__(self, dsn, name_proc):
 197         self.conn = connect(dsn).connection
 198         self.conn.autocommit = True
 199         self.name_processor = name_proc
 200
 201         self._cache = _TokenCache()
 202
 203
 204     def __enter__(self):
 205         return self
 206
 207
 208     def __exit__(self, exc_type, exc_value, traceback):
 209         self.close()
 210
 211
 212     def close(self):
 213         """ Free all resources used by the analyzer.
 214         """
 215         if self.conn:
 216             self.conn.close()
 217             self.conn = None
 218
 219
 220     def get_word_token_info(self, words):
 221         """ Return token information for the given list of words.
 222             If a word starts with # it is assumed to be a full name
 223             otherwise is a partial name.
 224
 225             The function returns a list of tuples with
 226             (original word, word token, word id).
 227
 228             The function is used for testing and debugging only
 229             and not necessarily efficient.
 230         """
 231         tokens = {}
 232         for word in words:
 233             if word.startswith('#'):
 234                 tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
 235             else:
 236                 tokens[word] = self.name_processor.get_search_normalized(word)
 237
 238         with self.conn.cursor() as cur:
 239             cur.execute("""SELECT word_token, word_id
 240                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 241                            WHERE word_token = t.term
 242                                  and class is null and country_code is null""",
 243                         (list(tokens.values()), ))
 244             ids = {r[0]: r[1] for r in cur}
 245
 246         return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 247
 248
 249     @staticmethod
 250     def normalize_postcode(postcode):
 251         """ Convert the postcode to a standardized form.
 252
 253             This function must yield exactly the same result as the SQL function
 254             'token_normalized_postcode()'.
 255         """
 256         return postcode.strip().upper()
 257
 258
 259     def _make_standard_hnr(self, hnr):
 260         """ Create a normalised version of a housenumber.
 261
 262             This function takes minor shortcuts on transliteration.
 263         """
 264         return self.name_processor.get_search_normalized(hnr)
 265
 266     def update_postcodes_from_db(self):
 267         """ Update postcode tokens in the word table from the location_postcode
 268             table.
 269         """
 270         to_delete = []
 271         with self.conn.cursor() as cur:
 272             # This finds us the rows in location_postcode and word that are
 273             # missing in the other table.
 274             cur.execute("""SELECT * FROM
 275                             (SELECT pc, word FROM
 276                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 277                               FULL JOIN
 278                               (SELECT word FROM word
 279                                 WHERE class ='place' and type = 'postcode') w
 280                               ON pc = word) x
 281                            WHERE pc is null or word is null""")
 282
 283             with CopyBuffer() as copystr:
 284                 for postcode, word in cur:
 285                     if postcode is None:
 286                         to_delete.append(word)
 287                     else:
 288                         copystr.add(
 289                             postcode,
 290                             ' ' + self.name_processor.get_search_normalized(postcode),
 291                             'place', 'postcode', 0)
 292
 293                 if to_delete:
 294                     cur.execute("""DELETE FROM WORD
 295                                    WHERE class ='place' and type = 'postcode'
 296                                          and word = any(%s)
 297                                 """, (to_delete, ))
 298
 299                 copystr.copy_out(cur, 'word',
 300                                  columns=['word', 'word_token', 'class', 'type',
 301                                           'search_name_count'])
 302
 303
 304     def update_special_phrases(self, phrases, should_replace):
 305         """ Replace the search index for special phrases with the new phrases.
 306         """
 307         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
 308                             for p in phrases))
 309
 310         with self.conn.cursor() as cur:
 311             # Get the old phrases.
 312             existing_phrases = set()
 313             cur.execute("""SELECT word, class, type, operator FROM word
 314                            WHERE class != 'place'
 315                                  OR (type != 'house' AND type != 'postcode')""")
 316             for label, cls, typ, oper in cur:
 317                 existing_phrases.add((label, cls, typ, oper or '-'))
 318
 319             added = self._add_special_phrases(cur, norm_phrases, existing_phrases)
 320             if should_replace:
 321                 deleted = self._remove_special_phrases(cur, norm_phrases,
 322                                                        existing_phrases)
 323             else:
 324                 deleted = 0
 325
 326         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 327                  len(norm_phrases), added, deleted)
 328
 329
 330     def _add_special_phrases(self, cursor, new_phrases, existing_phrases):
 331         """ Add all phrases to the database that are not yet there.
 332         """
 333         to_add = new_phrases - existing_phrases
 334
 335         added = 0
 336         with CopyBuffer() as copystr:
 337             for word, cls, typ, oper in to_add:
 338                 term = self.name_processor.get_search_normalized(word)
 339                 if term:
 340                     copystr.add(word, ' ' + term, cls, typ,
 341                                 oper if oper in ('in', 'near')  else None, 0)
 342                     added += 1
 343
 344             copystr.copy_out(cursor, 'word',
 345                              columns=['word', 'word_token', 'class', 'type',
 346                                       'operator', 'search_name_count'])
 347
 348         return added
 349
 350
 351     @staticmethod
 352     def _remove_special_phrases(cursor, new_phrases, existing_phrases):
 353         """ Remove all phrases from the databse that are no longer in the
 354             new phrase list.
 355         """
 356         to_delete = existing_phrases - new_phrases
 357
 358         if to_delete:
 359             psycopg2.extras.execute_values(
 360                 cursor,
 361                 """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 362                     WHERE word = name and class = in_class and type = in_type
 363                           and ((op = '-' and operator is null) or op = operator)""",
 364                 to_delete)
 365
 366         return len(to_delete)
 367
 368
 369     def add_country_names(self, country_code, names):
 370         """ Add names for the given country to the search index.
 371         """
 372         word_tokens = set()
 373         for name in self._compute_full_names(names):
 374             if name:
 375                 word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
 376
 377         with self.conn.cursor() as cur:
 378             # Get existing names
 379             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 380                         (country_code, ))
 381             word_tokens.difference_update((t[0] for t in cur))
 382
 383             if word_tokens:
 384                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 385                                                  search_name_count)
 386                                (SELECT nextval('seq_word'), token, '{}', 0
 387                                 FROM unnest(%s) as token)
 388                             """.format(country_code), (list(word_tokens),))
 389
 390
 391     def process_place(self, place):
 392         """ Determine tokenizer information about the given place.
 393
 394             Returns a JSON-serialisable structure that will be handed into
 395             the database via the token_info field.
 396         """
 397         token_info = _TokenInfo(self._cache)
 398
 399         names = place.get('name')
 400
 401         if names:
 402             fulls, partials = self._compute_name_tokens(names)
 403
 404             token_info.add_names(fulls, partials)
 405
 406             country_feature = place.get('country_feature')
 407             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 408                 self.add_country_names(country_feature.lower(), names)
 409
 410         address = place.get('address')
 411
 412         if address:
 413             hnrs = []
 414             addr_terms = []
 415             for key, value in address.items():
 416                 if key == 'postcode':
 417                     self._add_postcode(value)
 418                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 419                     hnrs.append(value)
 420                 elif key == 'street':
 421                     token_info.add_street(*self._compute_name_tokens({'name': value}))
 422                 elif key == 'place':
 423                     token_info.add_place(*self._compute_name_tokens({'name': value}))
 424                 elif not key.startswith('_') and \
 425                      key not in ('country', 'full'):
 426                     addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 427
 428             if hnrs:
 429                 hnrs = self._split_housenumbers(hnrs)
 430                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 431
 432             if addr_terms:
 433                 token_info.add_address_terms(addr_terms)
 434
 435         return token_info.data
 436
 437
 438     def _compute_name_tokens(self, names):
 439         """ Computes the full name and partial name tokens for the given
 440             dictionary of names.
 441         """
 442         full_names = self._compute_full_names(names)
 443         full_tokens = set()
 444         partial_tokens = set()
 445
 446         for name in full_names:
 447             norm_name = self.name_processor.get_normalized(name)
 448             full, part = self._cache.names.get(norm_name, (None, None))
 449             if full is None:
 450                 variants = self.name_processor.get_variants_ascii(norm_name)
 451                 if not variants:
 452                     continue
 453
 454                 with self.conn.cursor() as cur:
 455                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 456                                 (norm_name, variants))
 457                     full, part = cur.fetchone()
 458
 459                 self._cache.names[norm_name] = (full, part)
 460
 461             full_tokens.add(full)
 462             partial_tokens.update(part)
 463
 464         return full_tokens, partial_tokens
 465
 466
 467     @staticmethod
 468     def _compute_full_names(names):
 469         """ Return the set of all full name word ids to be used with the
 470             given dictionary of names.
 471         """
 472         full_names = set()
 473         for name in (n.strip() for ns in names.values() for n in re.split('[;,]', ns)):
 474             if name:
 475                 full_names.add(name)
 476
 477                 brace_idx = name.find('(')
 478                 if brace_idx >= 0:
 479                     full_names.add(name[:brace_idx].strip())
 480
 481         return full_names
 482
 483
 484     def _add_postcode(self, postcode):
 485         """ Make sure the normalized postcode is present in the word table.
 486         """
 487         if re.search(r'[:,;]', postcode) is None:
 488             postcode = self.normalize_postcode(postcode)
 489
 490             if postcode not in self._cache.postcodes:
 491                 term = self.name_processor.get_search_normalized(postcode)
 492                 if not term:
 493                     return
 494
 495                 with self.conn.cursor() as cur:
 496                     # no word_id needed for postcodes
 497                     cur.execute("""INSERT INTO word (word, word_token, class, type,
 498                                                      search_name_count)
 499                                    (SELECT pc, %s, 'place', 'postcode', 0
 500                                     FROM (VALUES (%s)) as v(pc)
 501                                     WHERE NOT EXISTS
 502                                      (SELECT * FROM word
 503                                       WHERE word = pc and class='place' and type='postcode'))
 504                                 """, (' ' + term, postcode))
 505                 self._cache.postcodes.add(postcode)
 506
 507
 508     @staticmethod
 509     def _split_housenumbers(hnrs):
 510         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 511             # split numbers if necessary
 512             simple_list = []
 513             for hnr in hnrs:
 514                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 515
 516             if len(simple_list) > 1:
 517                 hnrs = list(set(simple_list))
 518             else:
 519                 hnrs = simple_list
 520
 521         return hnrs
 522
 523
 524
 525
 526 class _TokenInfo:
 527     """ Collect token information to be sent back to the database.
 528     """
 529     def __init__(self, cache):
 530         self._cache = cache
 531         self.data = {}
 532
 533     @staticmethod
 534     def _mk_array(tokens):
 535         return '{%s}' % ','.join((str(s) for s in tokens))
 536
 537
 538     def add_names(self, fulls, partials):
 539         """ Adds token information for the normalised names.
 540         """
 541         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 542
 543
 544     def add_housenumbers(self, conn, hnrs):
 545         """ Extract housenumber information from a list of normalised
 546             housenumbers.
 547         """
 548         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 549         self.data['hnr'] = ';'.join(hnrs)
 550
 551
 552     def add_street(self, fulls, _):
 553         """ Add addr:street match terms.
 554         """
 555         if fulls:
 556             self.data['street'] = self._mk_array(fulls)
 557
 558
 559     def add_place(self, fulls, partials):
 560         """ Add addr:place search and match terms.
 561         """
 562         if fulls:
 563             self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
 564             self.data['place_match'] = self._mk_array(fulls)
 565
 566
 567     def add_address_terms(self, terms):
 568         """ Add additional address terms.
 569         """
 570         tokens = {}
 571
 572         for key, fulls, partials in terms:
 573             if fulls:
 574                 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
 575                                self._mk_array(fulls)]
 576
 577         if tokens:
 578             self.data['addr'] = tokens
 579
 580
 581 class _TokenCache:
 582     """ Cache for token information to avoid repeated database queries.
 583
 584         This cache is not thread-safe and needs to be instantiated per
 585         analyzer.
 586     """
 587     def __init__(self):
 588         self.names = {}
 589         self.postcodes = set()
 590         self.housenumbers = {}
 591
 592
 593     def get_hnr_tokens(self, conn, terms):
 594         """ Get token ids for a list of housenumbers, looking them up in the
 595             database if necessary.
 596         """
 597         tokens = []
 598         askdb = []
 599
 600         for term in terms:
 601             token = self.housenumbers.get(term)
 602             if token is None:
 603                 askdb.append(term)
 604             else:
 605                 tokens.append(token)
 606
 607         if askdb:
 608             with conn.cursor() as cur:
 609                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 610                             (askdb, ))
 611                 for term, tid in cur:
 612                     self.housenumbers[term] = tid
 613                     tokens.append(tid)
 614
 615         return tokens