nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import functools
   7 import io
   8 import itertools
   9 import json
  10 import logging
  11 import re
  12 from textwrap import dedent
  13 from pathlib import Path
  14
  15 from icu import Transliterator
  16 import psycopg2.extras
  17
  18 from nominatim.db.connection import connect
  19 from nominatim.db.properties import set_property, get_property
  20 from nominatim.db.sql_preprocessor import SQLPreprocessor
  21 from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
  22 from nominatim.tokenizer.icu_name_processor import ICUNameProcessor, ICUNameProcessorRules
  23
# Names of the database properties under which the tokenizer settings
# are saved (see _save_config) and read back (see init_from_project).
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TERM_NORMALIZATION = "tokenizer_term_normalization"

# getLogger() without a name returns the root logger.
LOG = logging.getLogger()
  28
  29 def create(dsn, data_dir):
  30     """ Create a new instance of the tokenizer provided by this module.
  31     """
  32     return LegacyICUTokenizer(dsn, data_dir)
  33
  34
  35 class LegacyICUTokenizer:
  36     """ This tokenizer uses libICU to convert names and queries to ASCII.
  37         Otherwise it uses the same algorithms and data structures as the
  38         normalization routines in Nominatim 3.
  39     """
  40
  41     def __init__(self, dsn, data_dir):
  42         self.dsn = dsn
  43         self.data_dir = data_dir
  44         self.naming_rules = None
  45         self.term_normalization = None
  46         self.max_word_frequency = None
  47
  48
  49     def init_new_db(self, config, init_db=True):
  50         """ Set up a new tokenizer for the database.
  51
  52             This copies all necessary data in the project directory to make
  53             sure the tokenizer remains stable even over updates.
  54         """
  55         if config.TOKENIZER_CONFIG:
  56             cfgfile = Path(config.TOKENIZER_CONFIG)
  57         else:
  58             cfgfile = config.config_dir / 'legacy_icu_tokenizer.yaml'
  59
  60         loader = ICURuleLoader(cfgfile)
  61         self.naming_rules = ICUNameProcessorRules(loader=loader)
  62         self.term_normalization = config.TERM_NORMALIZATION
  63         self.max_word_frequency = config.MAX_WORD_FREQUENCY
  64
  65         self._install_php(config.lib_dir.php)
  66         self._save_config(config)
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._init_db_tables(config)
  71
  72
  73     def init_from_project(self):
  74         """ Initialise the tokenizer from the project directory.
  75         """
  76         with connect(self.dsn) as conn:
  77             self.naming_rules = ICUNameProcessorRules(conn=conn)
  78             self.term_normalization = get_property(conn, DBCFG_TERM_NORMALIZATION)
  79             self.max_word_frequency = get_property(conn, DBCFG_MAXWORDFREQ)
  80
  81
  82     def finalize_import(self, config):
  83         """ Do any required postprocessing to make the tokenizer data ready
  84             for use.
  85         """
  86         with connect(self.dsn) as conn:
  87             sqlp = SQLPreprocessor(conn, config)
  88             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  89
  90
  91     def update_sql_functions(self, config):
  92         """ Reimport the SQL functions for this tokenizer.
  93         """
  94         with connect(self.dsn) as conn:
  95             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  98                               max_word_freq=max_word_freq)
  99
 100
 101     def check_database(self):
 102         """ Check that the tokenizer is set up correctly.
 103         """
 104         self.init_from_project()
 105
 106         if self.normalization is None\
 107            or self.transliteration is None\
 108            or self.abbreviations is None:
 109             return "Configuration for tokenizer 'legacy_icu' are missing."
 110
 111         return None
 112
 113
 114     def name_analyzer(self):
 115         """ Create a new analyzer for tokenizing names and queries
 116             using this tokinzer. Analyzers are context managers and should
 117             be used accordingly:
 118
 119             ```
 120             with tokenizer.name_analyzer() as analyzer:
 121                 analyser.tokenize()
 122             ```
 123
 124             When used outside the with construct, the caller must ensure to
 125             call the close() function before destructing the analyzer.
 126
 127             Analyzers are not thread-safe. You need to instantiate one per thread.
 128         """
 129         return LegacyICUNameAnalyzer(self.dsn, ICUNameProcessor(self.naming_rules))
 130
 131
 132     def _install_php(self, phpdir):
 133         """ Install the php script for the tokenizer.
 134         """
 135         php_file = self.data_dir / "tokenizer.php"
 136         php_file.write_text(dedent("""\
 137             <?php
 138             @define('CONST_Max_Word_Frequency', {0.max_word_frequency});
 139             @define('CONST_Term_Normalization_Rules', "{0.term_normalization}");
 140             @define('CONST_Transliteration', "{0.naming_rules.search_rules}");
 141             require_once('{1}/tokenizer/legacy_icu_tokenizer.php');
 142             """.format(self, phpdir)))
 143
 144
 145     def _save_config(self, config):
 146         """ Save the configuration that needs to remain stable for the given
 147             database as database properties.
 148         """
 149         with connect(self.dsn) as conn:
 150             self.naming_rules.save_rules(conn)
 151
 152             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 153             set_property(conn, DBCFG_TERM_NORMALIZATION, self.term_normalization)
 154
 155
 156     def _init_db_tables(self, config):
 157         """ Set up the word table and fill it with pre-computed word
 158             frequencies.
 159         """
 160         with connect(self.dsn) as conn:
 161             sqlp = SQLPreprocessor(conn, config)
 162             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 163             conn.commit()
 164
 165             LOG.warning("Precomputing word tokens")
 166
 167             # get partial words and their frequencies
 168             words = Counter()
 169             name_proc = ICUNameProcessor(self.naming_rules)
 170             with conn.cursor(name="words") as cur:
 171                 cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 172
 173                 for name, cnt in cur:
 174                     for word in name_proc.get_variants_ascii(name_proc.get_normalized(name)):
 175                         for term in word.split():
 176                             words[term] += cnt
 177
 178             # copy them back into the word table
 179             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 180
 181
 182             with conn.cursor() as cur:
 183                 copystr.seek(0)
 184                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 185                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 186                                WHERE word_id is null""")
 187
 188             conn.commit()
 189
 190
 191 class LegacyICUNameAnalyzer:
 192     """ The legacy analyzer uses the ICU library for splitting names.
 193
 194         Each instance opens a connection to the database to request the
 195         normalization.
 196     """
 197
 198     def __init__(self, dsn, name_proc):
 199         self.conn = connect(dsn).connection
 200         self.conn.autocommit = True
 201         self.name_processor = name_proc
 202
 203         self._cache = _TokenCache()
 204
 205
    def __enter__(self):
        """ Enter the context; the analyzer itself is the context object.
        """
        return self
 208
 209
    def __exit__(self, exc_type, exc_value, traceback):
        """ Leave the context and release the analyzer's resources via close().

            Returns None, so an exception raised inside the with block
            is not suppressed.
        """
        self.close()
 212
 213
 214     def close(self):
 215         """ Free all resources used by the analyzer.
 216         """
 217         if self.conn:
 218             self.conn.close()
 219             self.conn = None
 220
 221
 222     def get_word_token_info(self, words):
 223         """ Return token information for the given list of words.
 224             If a word starts with # it is assumed to be a full name
 225             otherwise is a partial name.
 226
 227             The function returns a list of tuples with
 228             (original word, word token, word id).
 229
 230             The function is used for testing and debugging only
 231             and not necessarily efficient.
 232         """
 233         tokens = {}
 234         for word in words:
 235             if word.startswith('#'):
 236                 tokens[word] = ' ' + self.name_processor.get_search_normalized(word[1:])
 237             else:
 238                 tokens[word] = self.name_processor.get_search_normalized(word)
 239
 240         with self.conn.cursor() as cur:
 241             cur.execute("""SELECT word_token, word_id
 242                            FROM word, (SELECT unnest(%s::TEXT[]) as term) t
 243                            WHERE word_token = t.term
 244                                  and class is null and country_code is null""",
 245                         (list(tokens.values()), ))
 246             ids = {r[0]: r[1] for r in cur}
 247
 248         return [(k, v, ids.get(v, None)) for k, v in tokens.items()]
 249
 250
 251     @staticmethod
 252     def normalize_postcode(postcode):
 253         """ Convert the postcode to a standardized form.
 254
 255             This function must yield exactly the same result as the SQL function
 256             'token_normalized_postcode()'.
 257         """
 258         return postcode.strip().upper()
 259
 260
 261     def _make_standard_hnr(self, hnr):
 262         """ Create a normalised version of a housenumber.
 263
 264             This function takes minor shortcuts on transliteration.
 265         """
 266         return self.name_processor.get_search_normalized(hnr)
 267
 268     def update_postcodes_from_db(self):
 269         """ Update postcode tokens in the word table from the location_postcode
 270             table.
 271         """
 272         to_delete = []
 273         copystr = io.StringIO()
 274         with self.conn.cursor() as cur:
 275             # This finds us the rows in location_postcode and word that are
 276             # missing in the other table.
 277             cur.execute("""SELECT * FROM
 278                             (SELECT pc, word FROM
 279                               (SELECT distinct(postcode) as pc FROM location_postcode) p
 280                               FULL JOIN
 281                               (SELECT word FROM word
 282                                 WHERE class ='place' and type = 'postcode') w
 283                               ON pc = word) x
 284                            WHERE pc is null or word is null""")
 285
 286             for postcode, word in cur:
 287                 if postcode is None:
 288                     to_delete.append(word)
 289                 else:
 290                     copystr.write(postcode)
 291                     copystr.write('\t ')
 292                     copystr.write(self.name_processor.get_search_normalized(postcode))
 293                     copystr.write('\tplace\tpostcode\t0\n')
 294
 295             if to_delete:
 296                 cur.execute("""DELETE FROM WORD
 297                                WHERE class ='place' and type = 'postcode'
 298                                      and word = any(%s)
 299                             """, (to_delete, ))
 300
 301             if copystr.getvalue():
 302                 copystr.seek(0)
 303                 cur.copy_from(copystr, 'word',
 304                               columns=['word', 'word_token', 'class', 'type',
 305                                        'search_name_count'])
 306
 307
 308     def update_special_phrases(self, phrases, should_replace):
 309         """ Replace the search index for special phrases with the new phrases.
 310         """
 311         norm_phrases = set(((self.name_processor.get_normalized(p[0]), p[1], p[2], p[3])
 312                             for p in phrases))
 313
 314         with self.conn.cursor() as cur:
 315             # Get the old phrases.
 316             existing_phrases = set()
 317             cur.execute("""SELECT word, class, type, operator FROM word
 318                            WHERE class != 'place'
 319                                  OR (type != 'house' AND type != 'postcode')""")
 320             for label, cls, typ, oper in cur:
 321                 existing_phrases.add((label, cls, typ, oper or '-'))
 322
 323             to_add = norm_phrases - existing_phrases
 324             to_delete = existing_phrases - norm_phrases
 325
 326             if to_add:
 327                 copystr = io.StringIO()
 328                 for word, cls, typ, oper in to_add:
 329                     term = self.name_processor.get_search_normalized(word)
 330                     if term:
 331                         copystr.write(word)
 332                         copystr.write('\t ')
 333                         copystr.write(term)
 334                         copystr.write('\t')
 335                         copystr.write(cls)
 336                         copystr.write('\t')
 337                         copystr.write(typ)
 338                         copystr.write('\t')
 339                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 340                         copystr.write('\t0\n')
 341
 342                 copystr.seek(0)
 343                 cur.copy_from(copystr, 'word',
 344                               columns=['word', 'word_token', 'class', 'type',
 345                                        'operator', 'search_name_count'])
 346
 347             if to_delete and should_replace:
 348                 psycopg2.extras.execute_values(
 349                     cur,
 350                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 351                         WHERE word = name and class = in_class and type = in_type
 352                               and ((op = '-' and operator is null) or op = operator)""",
 353                     to_delete)
 354
 355         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 356                  len(norm_phrases), len(to_add), len(to_delete))
 357
 358
 359     def add_country_names(self, country_code, names):
 360         """ Add names for the given country to the search index.
 361         """
 362         word_tokens = set()
 363         for name in self._compute_full_names(names):
 364             if name:
 365                 word_tokens.add(' ' + self.name_processor.get_search_normalized(name))
 366
 367         with self.conn.cursor() as cur:
 368             # Get existing names
 369             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 370                         (country_code, ))
 371             word_tokens.difference_update((t[0] for t in cur))
 372
 373             if word_tokens:
 374                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 375                                                  search_name_count)
 376                                (SELECT nextval('seq_word'), token, '{}', 0
 377                                 FROM unnest(%s) as token)
 378                             """.format(country_code), (list(word_tokens),))
 379
 380
 381     def process_place(self, place):
 382         """ Determine tokenizer information about the given place.
 383
 384             Returns a JSON-serialisable structure that will be handed into
 385             the database via the token_info field.
 386         """
 387         token_info = _TokenInfo(self._cache)
 388
 389         names = place.get('name')
 390
 391         if names:
 392             fulls, partials = self._compute_name_tokens(names)
 393
 394             token_info.add_names(fulls, partials)
 395
 396             country_feature = place.get('country_feature')
 397             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 398                 self.add_country_names(country_feature.lower(), names)
 399
 400         address = place.get('address')
 401
 402         if address:
 403             hnrs = []
 404             addr_terms = []
 405             for key, value in address.items():
 406                 if key == 'postcode':
 407                     self._add_postcode(value)
 408                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 409                     hnrs.append(value)
 410                 elif key == 'street':
 411                     token_info.add_street(*self._compute_name_tokens({'name': value}))
 412                 elif key == 'place':
 413                     token_info.add_place(*self._compute_name_tokens({'name': value}))
 414                 elif not key.startswith('_') and \
 415                      key not in ('country', 'full'):
 416                     addr_terms.append((key, *self._compute_name_tokens({'name': value})))
 417
 418             if hnrs:
 419                 hnrs = self._split_housenumbers(hnrs)
 420                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 421
 422             if addr_terms:
 423                 token_info.add_address_terms(addr_terms)
 424
 425         return token_info.data
 426
 427
 428     def _compute_name_tokens(self, names):
 429         """ Computes the full name and partial name tokens for the given
 430             dictionary of names.
 431         """
 432         full_names = self._compute_full_names(names)
 433         full_tokens = set()
 434         partial_tokens = set()
 435
 436         for name in full_names:
 437             norm_name = self.name_processor.get_normalized(name)
 438             full, part = self._cache.names.get(norm_name, (None, None))
 439             if full is None:
 440                 variants = self.name_processor.get_variants_ascii(norm_name)
 441                 with self.conn.cursor() as cur:
 442                     cur.execute("SELECT (getorcreate_full_word(%s, %s)).*",
 443                                 (norm_name, variants))
 444                     full, part = cur.fetchone()
 445
 446                 self._cache.names[norm_name] = (full, part)
 447
 448             full_tokens.add(full)
 449             partial_tokens.update(part)
 450
 451         return full_tokens, partial_tokens
 452
 453
 454     def _compute_full_names(self, names):
 455         """ Return the set of all full name word ids to be used with the
 456             given dictionary of names.
 457         """
 458         full_names = set()
 459         for name in (n for ns in names.values() for n in re.split('[;,]', ns)):
 460             full_names.add(name.strip())
 461
 462             brace_idx = name.find('(')
 463             if brace_idx >= 0:
 464                 full_names.add(name[:brace_idx].strip())
 465
 466         return full_names
 467
 468
 469     def _add_postcode(self, postcode):
 470         """ Make sure the normalized postcode is present in the word table.
 471         """
 472         if re.search(r'[:,;]', postcode) is None:
 473             postcode = self.normalize_postcode(postcode)
 474
 475             if postcode not in self._cache.postcodes:
 476                 term = self.name_processor.get_search_normalized(postcode)
 477                 if not term:
 478                     return
 479
 480                 with self.conn.cursor() as cur:
 481                     # no word_id needed for postcodes
 482                     cur.execute("""INSERT INTO word (word, word_token, class, type,
 483                                                      search_name_count)
 484                                    (SELECT pc, %s, 'place', 'postcode', 0
 485                                     FROM (VALUES (%s)) as v(pc)
 486                                     WHERE NOT EXISTS
 487                                      (SELECT * FROM word
 488                                       WHERE word = pc and class='place' and type='postcode'))
 489                                 """, (' ' + term, postcode))
 490                 self._cache.postcodes.add(postcode)
 491
 492
 493     @staticmethod
 494     def _split_housenumbers(hnrs):
 495         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 496             # split numbers if necessary
 497             simple_list = []
 498             for hnr in hnrs:
 499                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 500
 501             if len(simple_list) > 1:
 502                 hnrs = list(set(simple_list))
 503             else:
 504                 hnrs = simple_list
 505
 506         return hnrs
 507
 508
 509
 510
 511 class _TokenInfo:
 512     """ Collect token information to be sent back to the database.
 513     """
 514     def __init__(self, cache):
 515         self._cache = cache
 516         self.data = {}
 517
 518     @staticmethod
 519     def _mk_array(tokens):
 520         return '{%s}' % ','.join((str(s) for s in tokens))
 521
 522
 523     def add_names(self, fulls, partials):
 524         """ Adds token information for the normalised names.
 525         """
 526         self.data['names'] = self._mk_array(itertools.chain(fulls, partials))
 527
 528
 529     def add_housenumbers(self, conn, hnrs):
 530         """ Extract housenumber information from a list of normalised
 531             housenumbers.
 532         """
 533         self.data['hnr_tokens'] = self._mk_array(self._cache.get_hnr_tokens(conn, hnrs))
 534         self.data['hnr'] = ';'.join(hnrs)
 535
 536
 537     def add_street(self, fulls, partials):
 538         """ Add addr:street match terms.
 539         """
 540         if fulls:
 541             self.data['street'] = self._mk_array(fulls)
 542
 543
 544     def add_place(self, fulls, partials):
 545         """ Add addr:place search and match terms.
 546         """
 547         if fulls:
 548             self.data['place_search'] = self._mk_array(itertools.chain(fulls, partials))
 549             self.data['place_match'] = self._mk_array(fulls)
 550
 551
 552     def add_address_terms(self, terms):
 553         """ Add additional address terms.
 554         """
 555         tokens = {}
 556
 557         for key, fulls, partials in terms:
 558             if fulls:
 559                 tokens[key] = [self._mk_array(itertools.chain(fulls, partials)),
 560                                self._mk_array(fulls)]
 561
 562         if tokens:
 563             self.data['addr'] = tokens
 564
 565
 566 class _TokenCache:
 567     """ Cache for token information to avoid repeated database queries.
 568
 569         This cache is not thread-safe and needs to be instantiated per
 570         analyzer.
 571     """
 572     def __init__(self):
 573         self.names = {}
 574         self.postcodes = set()
 575         self.housenumbers = {}
 576
 577
 578     def get_hnr_tokens(self, conn, terms):
 579         """ Get token ids for a list of housenumbers, looking them up in the
 580             database if necessary.
 581         """
 582         tokens = []
 583         askdb = []
 584
 585         for term in terms:
 586             token = self.housenumbers.get(term)
 587             if token is None:
 588                 askdb.append(term)
 589             else:
 590                 tokens.append(token)
 591
 592         if askdb:
 593             with conn.cursor() as cur:
 594                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 595                             (askdb, ))
 596                 for term, tid in cur:
 597                     self.housenumbers[term] = tid
 598                     tokens.append(tid)
 599
 600         return tokens