nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import functools
   7 import io
   8 import itertools
   9 import json
  10 import logging
  11 import re
  12 from textwrap import dedent
  13 from pathlib import Path
  14
  15 from icu import Transliterator
  16 import psycopg2.extras
  17
  18 from nominatim.db.connection import connect
  19 from nominatim.db.properties import set_property, get_property
  20 from nominatim.db.sql_preprocessor import SQLPreprocessor
  21
# Names of the database properties under which the tokenizer configuration
# is stored (written by _save_config(), read back by init_from_project()
# and update_sql_functions()).
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# Logger used by this module; getLogger() without a name is the root logger.
LOG = logging.getLogger()
  28
  29 def create(dsn, data_dir):
  30     """ Create a new instance of the tokenizer provided by this module.
  31     """
  32     return LegacyICUTokenizer(dsn, data_dir)
  33
  34
  35 class LegacyICUTokenizer:
  36     """ This tokenizer uses libICU to covert names and queries to ASCII.
  37         Otherwise it uses the same algorithms and data structures as the
  38         normalization routines in Nominatm 3.
  39     """
  40
  41     def __init__(self, dsn, data_dir):
  42         self.dsn = dsn
  43         self.data_dir = data_dir
  44         self.normalization = None
  45         self.transliteration = None
  46         self.abbreviations = None
  47
  48
  49     def init_new_db(self, config, init_db=True):
  50         """ Set up a new tokenizer for the database.
  51
  52             This copies all necessary data in the project directory to make
  53             sure the tokenizer remains stable even over updates.
  54         """
  55         if config.TOKENIZER_CONFIG:
  56             cfgfile = Path(config.TOKENIZER_CONFIG)
  57         else:
  58             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  59
  60         rules = json.loads(cfgfile.read_text())
  61         self.transliteration = ';'.join(rules['normalization']) + ';'
  62         self.abbreviations = rules["abbreviations"]
  63         self.normalization = config.TERM_NORMALIZATION
  64
  65         self._install_php(config)
  66         self._save_config(config)
  67
  68         if init_db:
  69             self.update_sql_functions(config)
  70             self._init_db_tables(config)
  71
  72
  73     def init_from_project(self):
  74         """ Initialise the tokenizer from the project directory.
  75         """
  76         with connect(self.dsn) as conn:
  77             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  78             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  79             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  80
  81
  82     def finalize_import(self, config):
  83         """ Do any required postprocessing to make the tokenizer data ready
  84             for use.
  85         """
  86         with connect(self.dsn) as conn:
  87             sqlp = SQLPreprocessor(conn, config)
  88             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  89
  90
  91     def update_sql_functions(self, config):
  92         """ Reimport the SQL functions for this tokenizer.
  93         """
  94         with connect(self.dsn) as conn:
  95             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  96             sqlp = SQLPreprocessor(conn, config)
  97             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  98                               max_word_freq=max_word_freq)
  99
 100
 101     def check_database(self):
 102         """ Check that the tokenizer is set up correctly.
 103         """
 104         self.init_from_project()
 105
 106         if self.normalization is None\
 107            or self.transliteration is None\
 108            or self.abbreviations is None:
 109             return "Configuration for tokenizer 'legacy_icu' are missing."
 110
 111         return None
 112
 113
 114     def name_analyzer(self):
 115         """ Create a new analyzer for tokenizing names and queries
 116             using this tokinzer. Analyzers are context managers and should
 117             be used accordingly:
 118
 119             ```
 120             with tokenizer.name_analyzer() as analyzer:
 121                 analyser.tokenize()
 122             ```
 123
 124             When used outside the with construct, the caller must ensure to
 125             call the close() function before destructing the analyzer.
 126
 127             Analyzers are not thread-safe. You need to instantiate one per thread.
 128         """
 129         norm = Transliterator.createFromRules("normalizer", self.normalization)
 130         trans = Transliterator.createFromRules("normalizer", self.transliteration)
 131         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 132
 133
 134     def _install_php(self, config):
 135         """ Install the php script for the tokenizer.
 136         """
 137         abbr_inverse = list(zip(*self.abbreviations))
 138         php_file = self.data_dir / "tokenizer.php"
 139         php_file.write_text(dedent("""\
 140             <?php
 141             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 142             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 143             @define('CONST_Transliteration', "{0.transliteration}");
 144             @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
 145             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 146             """.format(self, config,
 147                        "','".join(abbr_inverse[0]),
 148                        "','".join(abbr_inverse[1]))))
 149
 150
 151     def _save_config(self, config):
 152         """ Save the configuration that needs to remain stable for the given
 153             database as database properties.
 154         """
 155         with connect(self.dsn) as conn:
 156             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 157             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 158             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 159             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 160
 161
 162     def _init_db_tables(self, config):
 163         """ Set up the word table and fill it with pre-computed word
 164             frequencies.
 165         """
 166         with connect(self.dsn) as conn:
 167             sqlp = SQLPreprocessor(conn, config)
 168             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 169             conn.commit()
 170
 171             LOG.warning("Precomputing word tokens")
 172
 173             # get partial words and their frequencies
 174             words = Counter()
 175             with self.name_analyzer() as analyzer:
 176                 with conn.cursor(name="words") as cur:
 177                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 178
 179                     for name, cnt in cur:
 180                         term = analyzer.make_standard_word(name)
 181                         if term:
 182                             for word in term.split():
 183                                 words[word] += cnt
 184
 185             # copy them back into the word table
 186             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 187
 188
 189             with conn.cursor() as cur:
 190                 copystr.seek(0)
 191                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 192                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 193                                WHERE word_id is null""")
 194
 195             conn.commit()
 196
 197
 198 class LegacyICUNameAnalyzer:
 199     """ The legacy analyzer uses the ICU library for splitting names.
 200
 201         Each instance opens a connection to the database to request the
 202         normalization.
 203     """
 204
 205     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 206         self.conn = connect(dsn).connection
 207         self.conn.autocommit = True
 208         self.normalizer = normalizer
 209         self.transliterator = transliterator
 210         self.abbreviations = abbreviations
 211
 212         self._cache = _TokenCache()
 213
 214
 215     def __enter__(self):
 216         return self
 217
 218
 219     def __exit__(self, exc_type, exc_value, traceback):
 220         self.close()
 221
 222
 223     def close(self):
 224         """ Free all resources used by the analyzer.
 225         """
 226         if self.conn:
 227             self.conn.close()
 228             self.conn = None
 229
 230
 231     def normalize(self, phrase):
 232         """ Normalize the given phrase, i.e. remove all properties that
 233             are irrelevant for search.
 234         """
 235         return self.normalizer.transliterate(phrase)
 236
 237     @functools.lru_cache(maxsize=1024)
 238     def make_standard_word(self, name):
 239         """ Create the normalised version of the name.
 240         """
 241         norm = ' ' + self.transliterator.transliterate(name) + ' '
 242         for full, abbr in self.abbreviations:
 243             if full in norm:
 244                 norm = norm.replace(full, abbr)
 245
 246         return norm.strip()
 247
 248
 249     def _make_standard_hnr(self, hnr):
 250         """ Create a normalised version of a housenumber.
 251
 252             This function takes minor shortcuts on transliteration.
 253         """
 254         if hnr.isdigit():
 255             return hnr
 256
 257         return self.transliterator.transliterate(hnr)
 258
 259     def add_postcodes_from_db(self):
 260         """ Add postcodes from the location_postcode table to the word table.
 261         """
 262         copystr = io.StringIO()
 263         with self.conn.cursor() as cur:
 264             cur.execute("SELECT distinct(postcode) FROM location_postcode")
 265             for (postcode, ) in cur:
 266                 copystr.write(postcode)
 267                 copystr.write('\t ')
 268                 copystr.write(self.transliterator.transliterate(postcode))
 269                 copystr.write('\tplace\tpostcode\t0\n')
 270
 271             copystr.seek(0)
 272             cur.copy_from(copystr, 'word',
 273                           columns=['word', 'word_token', 'class', 'type',
 274                                    'search_name_count'])
 275             # Don't really need an ID for postcodes....
 276             # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 277             #                WHERE word_id is null and type = 'postcode'""")
 278
 279
 280     def update_special_phrases(self, phrases):
 281         """ Replace the search index for special phrases with the new phrases.
 282         """
 283         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 284                             for p in phrases))
 285
 286         with self.conn.cursor() as cur:
 287             # Get the old phrases.
 288             existing_phrases = set()
 289             cur.execute("""SELECT word, class, type, operator FROM word
 290                            WHERE class != 'place'
 291                                  OR (type != 'house' AND type != 'postcode')""")
 292             for label, cls, typ, oper in cur:
 293                 existing_phrases.add((label, cls, typ, oper or '-'))
 294
 295             to_add = norm_phrases - existing_phrases
 296             to_delete = existing_phrases - norm_phrases
 297
 298             if to_add:
 299                 copystr = io.StringIO()
 300                 for word, cls, typ, oper in to_add:
 301                     term = self.make_standard_word(word)
 302                     if term:
 303                         copystr.write(word)
 304                         copystr.write('\t ')
 305                         copystr.write(term)
 306                         copystr.write('\t')
 307                         copystr.write(cls)
 308                         copystr.write('\t')
 309                         copystr.write(typ)
 310                         copystr.write('\t')
 311                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 312                         copystr.write('\t0\n')
 313
 314                 copystr.seek(0)
 315                 cur.copy_from(copystr, 'word',
 316                               columns=['word', 'word_token', 'class', 'type',
 317                                        'operator', 'search_name_count'])
 318
 319             if to_delete:
 320                 psycopg2.extras.execute_values(
 321                     cur,
 322                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 323                         WHERE word = name and class = in_class and type = in_type
 324                               and ((op = '-' and operator is null) or op = operator)""",
 325                     to_delete)
 326
 327         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 328                  len(norm_phrases), len(to_add), len(to_delete))
 329
 330
 331     def add_country_names(self, country_code, names):
 332         """ Add names for the given country to the search index.
 333         """
 334         full_names = set((self.make_standard_word(n) for n in names))
 335         full_names.discard('')
 336         self._add_normalised_country_names(country_code, full_names)
 337
 338
 339     def _add_normalised_country_names(self, country_code, names):
 340         """ Add names for the given country to the search index.
 341         """
 342         with self.conn.cursor() as cur:
 343             # Get existing names
 344             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 345                         (country_code, ))
 346             new_names = names.difference((t[0] for t in cur))
 347
 348             if new_names:
 349                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 350                                                  search_name_count)
 351                                (SELECT nextval('seq_word'), token, '{}', 0
 352                                 FROM unnest(%s) as token)
 353                             """.format(country_code), (list(new_names),))
 354
 355
 356     def process_place(self, place):
 357         """ Determine tokenizer information about the given place.
 358
 359             Returns a JSON-serialisable structure that will be handed into
 360             the database via the token_info field.
 361         """
 362         token_info = _TokenInfo(self._cache)
 363
 364         names = place.get('name')
 365
 366         if names:
 367             full_names = set((self.make_standard_word(name) for name in names.values()))
 368             full_names.discard('')
 369
 370             token_info.add_names(self.conn, full_names)
 371
 372             country_feature = place.get('country_feature')
 373             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 374                 self._add_normalised_country_names(country_feature.lower(),
 375                                                    full_names)
 376
 377         address = place.get('address')
 378
 379         if address:
 380             hnrs = []
 381             addr_terms = []
 382             for key, value in address.items():
 383                 if key == 'postcode':
 384                     self._add_postcode(value)
 385                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 386                     hnrs.append(value)
 387                 elif key == 'street':
 388                     token_info.add_street(self.conn, self.make_standard_word(value))
 389                 elif key == 'place':
 390                     token_info.add_place(self.conn, self.make_standard_word(value))
 391                 elif not key.startswith('_') and \
 392                      key not in ('country', 'full'):
 393                     addr_terms.append((key, self.make_standard_word(value)))
 394
 395             if hnrs:
 396                 hnrs = self._split_housenumbers(hnrs)
 397                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 398
 399             if addr_terms:
 400                 token_info.add_address_terms(self.conn, addr_terms)
 401
 402         return token_info.data
 403
 404
 405     def _add_postcode(self, postcode):
 406         """ Make sure the normalized postcode is present in the word table.
 407         """
 408         if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
 409             term = self.make_standard_word(postcode)
 410             if not term:
 411                 return
 412
 413             with self.conn.cursor() as cur:
 414                 # no word_id needed for postcodes
 415                 cur.execute("""INSERT INTO word (word, word_token, class, type,
 416                                                  search_name_count)
 417                                (SELECT pc, %s, 'place', 'postcode', 0
 418                                 FROM (VALUES (%s)) as v(pc)
 419                                 WHERE NOT EXISTS
 420                                  (SELECT * FROM word
 421                                   WHERE word = pc and class='place' and type='postcode'))
 422                             """, (' ' + term, postcode))
 423             self._cache.postcodes.add(postcode)
 424
 425     @staticmethod
 426     def _split_housenumbers(hnrs):
 427         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 428             # split numbers if necessary
 429             simple_list = []
 430             for hnr in hnrs:
 431                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 432
 433             if len(simple_list) > 1:
 434                 hnrs = list(set(simple_list))
 435             else:
 436                 hnrs = simple_list
 437
 438         return hnrs
 439
 440
 441
 442
 443 class _TokenInfo:
 444     """ Collect token information to be sent back to the database.
 445     """
 446     def __init__(self, cache):
 447         self.cache = cache
 448         self.data = {}
 449
 450     @staticmethod
 451     def _mk_array(tokens):
 452         return '{%s}' % ','.join((str(s) for s in tokens))
 453
 454
 455     def add_names(self, conn, names):
 456         """ Adds token information for the normalised names.
 457         """
 458         # Start with all partial names
 459         terms = set((part for ns in names for part in ns.split()))
 460         # Add partials for the full terms (TO BE REMOVED)
 461         terms.update((n for n in names))
 462         # Add the full names
 463         terms.update((' ' + n for n in names))
 464
 465         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 466
 467
 468     def add_housenumbers(self, conn, hnrs):
 469         """ Extract housenumber information from a list of normalised
 470             housenumbers.
 471         """
 472         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 473         self.data['hnr'] = ';'.join(hnrs)
 474
 475
 476     def add_street(self, conn, street):
 477         """ Add addr:street match terms.
 478         """
 479         if not street:
 480             return
 481
 482         term = ' ' + street
 483
 484         tid = self.cache.names.get(term)
 485
 486         if tid is None:
 487             with conn.cursor() as cur:
 488                 cur.execute("""SELECT word_id FROM word
 489                                 WHERE word_token = %s
 490                                       and class is null and type is null""",
 491                             (term, ))
 492                 if cur.rowcount > 0:
 493                     tid = cur.fetchone()[0]
 494                     self.cache.names[term] = tid
 495
 496         if tid is not None:
 497             self.data['street'] = '{%d}' % tid
 498
 499
 500     def add_place(self, conn, place):
 501         """ Add addr:place search and match terms.
 502         """
 503         if not place:
 504             return
 505
 506         partial_ids = self.cache.get_term_tokens(conn, place.split())
 507         tid = self.cache.get_term_tokens(conn, [' ' + place])
 508
 509         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 510         self.data['place_match'] = '{%s}' % tid[0]
 511
 512
 513     def add_address_terms(self, conn, terms):
 514         """ Add additional address terms.
 515         """
 516         tokens = {}
 517
 518         for key, value in terms:
 519             if not value:
 520                 continue
 521             partial_ids = self.cache.get_term_tokens(conn, value.split())
 522             term = ' ' + value
 523             tid = self.cache.names.get(term)
 524
 525             if tid is None:
 526                 with conn.cursor() as cur:
 527                     cur.execute("""SELECT word_id FROM word
 528                                     WHERE word_token = %s
 529                                           and class is null and type is null""",
 530                                 (term, ))
 531                     if cur.rowcount > 0:
 532                         tid = cur.fetchone()[0]
 533                         self.cache.names[term] = tid
 534
 535             tokens[key] = [self._mk_array(partial_ids),
 536                            '{%s}' % ('' if tid is None else str(tid))]
 537
 538         if tokens:
 539             self.data['addr'] = tokens
 540
 541
 542 class _TokenCache:
 543     """ Cache for token information to avoid repeated database queries.
 544
 545         This cache is not thread-safe and needs to be instantiated per
 546         analyzer.
 547     """
 548     def __init__(self):
 549         self.names = {}
 550         self.postcodes = set()
 551         self.housenumbers = {}
 552
 553
 554     def get_term_tokens(self, conn, terms):
 555         """ Get token ids for a list of terms, looking them up in the database
 556             if necessary.
 557         """
 558         tokens = []
 559         askdb = []
 560
 561         for term in terms:
 562             token = self.names.get(term)
 563             if token is None:
 564                 askdb.append(term)
 565             elif token != 0:
 566                 tokens.append(token)
 567
 568         if askdb:
 569             with conn.cursor() as cur:
 570                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 571                             (askdb, ))
 572                 for term, tid in cur:
 573                     self.names[term] = tid
 574                     if tid != 0:
 575                         tokens.append(tid)
 576
 577         return tokens
 578
 579
 580     def get_hnr_tokens(self, conn, terms):
 581         """ Get token ids for a list of housenumbers, looking them up in the
 582             database if necessary.
 583         """
 584         tokens = []
 585         askdb = []
 586
 587         for term in terms:
 588             token = self.housenumbers.get(term)
 589             if token is None:
 590                 askdb.append(term)
 591             else:
 592                 tokens.append(token)
 593
 594         if askdb:
 595             with conn.cursor() as cur:
 596                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 597                             (askdb, ))
 598                 for term, tid in cur:
 599                     self.housenumbers[term] = tid
 600                     tokens.append(tid)
 601
 602         return tokens