"""
Tokenizer implementing normalisation as used before Nominatim 4 but using
libICU instead of the PostgreSQL module.
"""
from collections import Counter
import functools
import io
import itertools
import json
import logging
import re
from textwrap import dedent
from pathlib import Path

from icu import Transliterator
import psycopg2.extras

from nominatim.db.connection import connect
from nominatim.db.properties import set_property, get_property
from nominatim.db.sql_preprocessor import SQLPreprocessor

DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

LOG = logging.getLogger()


def create(dsn, data_dir):
    """ Create a new instance of the tokenizer provided by this module.
    """
    return LegacyICUTokenizer(dsn, data_dir)


class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None


    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'

        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION

        self._install_php(config)
        self._save_config(config)

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
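
    # The configuration file read above is expected to look roughly like
    # this (illustrative sketch only; the actual rules are whatever ships
    # with the project):
    #
    #   {
    #       "normalization": [":: lower ()", ":: Latin ()"],
    #       "abbreviations": [[" street ", " st "], [" road ", " rd "]]
    #   }
    #
    # 'normalization' is a list of ICU transliteration rules that are
    # joined with ';' into one rule set; 'abbreviations' is a list of
    # (full term, abbreviated term) pairs applied by make_standard_word().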


    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))


    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')


    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
                              max_word_freq=max_word_freq)


    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()

        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' is missing."

        return None


    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.add_country_names(...)
            ```

            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        norm = Transliterator.createFromRules("normalizer", self.normalization)
        trans = Transliterator.createFromRules("trans", self.transliteration)

        return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)


    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        abbr_inverse = list(zip(*self.abbreviations))
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration', "{0.transliteration}");
            @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config,
                       "','".join(abbr_inverse[0]),
                       "','".join(abbr_inverse[1]))))


    def _save_config(self, config):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            set_property(conn, DBCFG_NORMALIZATION, self.normalization)
            set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
            set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
            set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))


    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

            LOG.warning("Precomputing word tokens")

            # get partial words and their frequencies
            words = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as cur:
                    cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")

                    for name, cnt in cur:
                        term = analyzer.make_standard_word(name)
                        if term:
                            for word in term.split():
                                words[word] += cnt

            # copy them back into the word table
            copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))

            with conn.cursor() as cur:
                cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")

            conn.commit()
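
    # Each line handed to COPY above is a TSV row of word_token and
    # search_name_count, e.g. "station\t421" (illustrative count).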


class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.

        Each instance opens a connection to the database to request the
        normalization.
    """

    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations
        #psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache()


    def __enter__(self):
        return self


    def __exit__(self, exc_type, exc_value, traceback):
        self.close()


    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None


    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)


    @functools.lru_cache(maxsize=1024)
    def make_standard_word(self, name):
        """ Create the normalised version of the name.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            if full in norm:
                norm = norm.replace(full, abbr)

        return norm.strip()
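
    # Illustrative example (assuming the configured rules lower-case and
    # ASCII-fold, and the abbreviation list contains (' street ', ' st ')):
    #
    #   analyzer.make_standard_word('Baker Street')  # -> 'baker st'
    #
    # The name is padded with spaces before replacement so that the
    # space-delimited abbreviations only match complete words.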


    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.

            This function takes minor shortcuts on transliteration.
        """
        if hnr.isdigit():
            return hnr

        return self.transliterator.transliterate(hnr)


    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")

            for (postcode, ) in cur:
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')

            copystr.seek(0)
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")
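
    # A row in the COPY payload above looks roughly like (tabs written as
    # \t, postcode made up): "SW1A 1AA\t sw1a 1aa\tplace\tpostcode\t0".
    # The leading space on the word_token marks it as a full word.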


    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))

        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))

            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases

            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near') else '\\N')
                        copystr.write('\t0\n')

                copystr.seek(0)
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])

            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)

        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
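
    # 'phrases' is expected to be an iterable of 4-tuples of
    # (label, class, type, operator), for example
    # ('swimming pool', 'leisure', 'swimming_pool', 'near') - an
    # illustrative entry, not part of any fixed list.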


    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalised_country_names(country_code, full_names)


    def _add_normalised_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            new_names = names.difference((t[0] for t in cur))

            if new_names:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(new_names),))


    def process_place(self, place):
        """ Determine tokenizer information about the given place.

            Returns a JSON-serialisable structure that will be handed into
            the database via the token_info field.
        """
        token_info = _TokenInfo(self._cache)

        names = place.get('name')

        if names:
            full_names = set((self.make_standard_word(name) for name in names.values()))
            full_names.discard('')

            token_info.add_names(self.conn, full_names)

            country_feature = place.get('country_feature')
            if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
                self._add_normalised_country_names(country_feature.lower(),
                                                   full_names)

        address = place.get('address')

        if address:
            hnrs = []
            addr_terms = []
            for key, value in address.items():
                if key == 'postcode':
                    self._add_postcode(value)
                elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    hnrs.append(value)
                elif key == 'street':
                    token_info.add_street(self.conn, self.make_standard_word(value))
                elif key == 'place':
                    token_info.add_place(self.conn, self.make_standard_word(value))
                elif not key.startswith('_') and \
                     key not in ('country', 'full'):
                    addr_terms.append((key, self.make_standard_word(value)))

            if hnrs:
                hnrs = self._split_housenumbers(hnrs)
                token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])

            if addr_terms:
                token_info.add_address_terms(self.conn, addr_terms)

        return token_info.data
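
    # The returned structure is assembled by _TokenInfo below. For a place
    # with names, a housenumber and an addr:city entry it looks roughly
    # like this (token ids are illustrative):
    #
    #   {'names': '{1,2,3}', 'hnr_tokens': '{4}', 'hnr': '12a',
    #    'addr': {'city': ['{5,6}', '{7}']}}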


    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None and postcode not in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return

            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)


    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))

            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list

        return hnrs
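
    # Illustrative behaviour: ['2;4', '6'] becomes ['2', '4', '6'] (order
    # not guaranteed, since duplicates are removed via a set).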
441 """ Collect token information to be sent back to the database.
443 def __init__(self, cache):
448 def _mk_array(tokens):
449 return '{%s}' % ','.join((str(s) for s in tokens))
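
    # Example (illustrative): _mk_array([32, 8]) returns '{32,8}', the
    # textual form of a PostgreSQL integer array.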


    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add partials for the full terms (TO BE REMOVED)
        terms.update((n for n in names))
        # Add the full names
        terms.update((' ' + n for n in names))

        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))


    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)


    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return

        term = ' ' + street

        tid = self.cache.names.get(term)

        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid

        if tid is not None:
            self.data['street'] = '{%d}' % tid


    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return

        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])

        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]


    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}

        for key, value in terms:
            if not value:
                continue

            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)

            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid

            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]

        if tokens:
            self.data['addr'] = tokens
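
    # Illustrative result (token ids made up): terms of [('city', 'berlin')]
    # yields self.data['addr'] == {'city': ['{12}', '{34}']} - partial-word
    # tokens first, then the full term's word_id (or '{}' if the full term
    # is not in the word table).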
540 """ Cache for token information to avoid repeated database queries.
542 This cache is not thread-safe and needs to be instantiated per
547 self.postcodes = set()
548 self.housenumbers = {}
551 def get_term_tokens(self, conn, terms):
552 """ Get token ids for a list of terms, looking them up in the database
559 token = self.names.get(term)
566 with conn.cursor() as cur:
567 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
569 for term, tid in cur:
570 self.names[term] = tid


    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []

        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)

        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)

        return tokens
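
# Illustrative use of the cache (sketch; 'conn' must be an open psycopg2
# connection to a database that provides the getorcreate_term_id() SQL
# function used above):
#
#   cache = _TokenCache()
#   ids = cache.get_term_tokens(conn, ['baker', ' baker street'])
#   # A second call with the same terms is answered from cache.names
#   # without hitting the database.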