nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import io
   7 import itertools
   8 import json
   9 import logging
  10 import re
  11 from textwrap import dedent
  12 from pathlib import Path
  13
  14 from icu import Transliterator
  15 import psycopg2.extras
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.properties import set_property, get_property
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20
# Names of the database properties under which the tokenizer
# configuration is saved and from which it is loaded again.
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# Logger used throughout this module; getLogger() without a name
# returns the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer:
  35     """ This tokenizer uses libICU to covert names and queries to ASCII.
  36         Otherwise it uses the same algorithms and data structures as the
  37         normalization routines in Nominatm 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.normalization = None
  44         self.transliteration = None
  45         self.abbreviations = None
  46
  47
  48     def init_new_db(self, config, init_db=True):
  49         """ Set up a new tokenizer for the database.
  50
  51             This copies all necessary data in the project directory to make
  52             sure the tokenizer remains stable even over updates.
  53         """
  54         if config.TOKENIZER_CONFIG:
  55             cfgfile = Path(config.TOKENIZER_CONFIG)
  56         else:
  57             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  58
  59         rules = json.loads(cfgfile.read_text())
  60         self.transliteration = ';'.join(rules['normalization']) + ';'
  61         self.abbreviations = rules["abbreviations"]
  62         self.normalization = config.TERM_NORMALIZATION
  63
  64         self._install_php(config)
  65         self._save_config(config)
  66
  67         if init_db:
  68             self.update_sql_functions(config)
  69             self._init_db_tables(config)
  70
  71
  72     def init_from_project(self):
  73         """ Initialise the tokenizer from the project directory.
  74         """
  75         with connect(self.dsn) as conn:
  76             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  77             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  78             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  79
  80
  81     def finalize_import(self, config):
  82         """ Do any required postprocessing to make the tokenizer data ready
  83             for use.
  84         """
  85         with connect(self.dsn) as conn:
  86             sqlp = SQLPreprocessor(conn, config)
  87             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  88
  89
  90     def update_sql_functions(self, config):
  91         """ Reimport the SQL functions for this tokenizer.
  92         """
  93         with connect(self.dsn) as conn:
  94             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  95             sqlp = SQLPreprocessor(conn, config)
  96             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  97                               max_word_freq=max_word_freq)
  98
  99
 100     def check_database(self):
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         self.init_from_project()
 104
 105         if self.normalization is None\
 106            or self.transliteration is None\
 107            or self.abbreviations is None:
 108             return "Configuration for tokenizer 'legacy_icu' are missing."
 109
 110         return None
 111
 112
 113     def name_analyzer(self):
 114         """ Create a new analyzer for tokenizing names and queries
 115             using this tokinzer. Analyzers are context managers and should
 116             be used accordingly:
 117
 118             ```
 119             with tokenizer.name_analyzer() as analyzer:
 120                 analyser.tokenize()
 121             ```
 122
 123             When used outside the with construct, the caller must ensure to
 124             call the close() function before destructing the analyzer.
 125
 126             Analyzers are not thread-safe. You need to instantiate one per thread.
 127         """
 128         norm = Transliterator.createFromRules("normalizer", self.normalization)
 129         trans = Transliterator.createFromRules("normalizer", self.transliteration)
 130         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 131
 132
 133     def _install_php(self, config):
 134         """ Install the php script for the tokenizer.
 135         """
 136         abbr_inverse = list(zip(*self.abbreviations))
 137         php_file = self.data_dir / "tokenizer.php"
 138         php_file.write_text(dedent("""\
 139             <?php
 140             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 141             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 142             @define('CONST_Transliteration', "{0.transliteration}");
 143             @define('CONST_Abbreviations', array(array('{2}'), array('{3}')));
 144             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 145             """.format(self, config,
 146                        "','".join(abbr_inverse[0]),
 147                        "','".join(abbr_inverse[1]))))
 148
 149
 150     def _save_config(self, config):
 151         """ Save the configuration that needs to remain stable for the given
 152             database as database properties.
 153         """
 154         with connect(self.dsn) as conn:
 155             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 156             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 157             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 158             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 159
 160
 161     def _init_db_tables(self, config):
 162         """ Set up the word table and fill it with pre-computed word
 163             frequencies.
 164         """
 165         with connect(self.dsn) as conn:
 166             sqlp = SQLPreprocessor(conn, config)
 167             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 168             conn.commit()
 169
 170             LOG.warning("Precomputing word tokens")
 171
 172             # get partial words and their frequencies
 173             words = Counter()
 174             with self.name_analyzer() as analyzer:
 175                 with conn.cursor(name="words") as cur:
 176                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 177
 178                     for name, cnt in cur:
 179                         term = analyzer.make_standard_word(name)
 180                         if term:
 181                             for word in term.split():
 182                                 words[word] += cnt
 183
 184             # copy them back into the word table
 185             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 186
 187             with conn.cursor() as cur:
 188                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 189                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 190                                WHERE word_id is null""")
 191
 192             conn.commit()
 193
 194
 195 class LegacyICUNameAnalyzer:
 196     """ The legacy analyzer uses the ICU library for splitting names.
 197
 198         Each instance opens a connection to the database to request the
 199         normalization.
 200     """
 201
 202     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 203         self.conn = connect(dsn).connection
 204         self.conn.autocommit = True
 205         self.normalizer = normalizer
 206         self.transliterator = transliterator
 207         self.abbreviations = abbreviations
 208         #psycopg2.extras.register_hstore(self.conn)
 209
 210         self._cache = _TokenCache()
 211
 212
 213     def __enter__(self):
 214         return self
 215
 216
 217     def __exit__(self, exc_type, exc_value, traceback):
 218         self.close()
 219
 220
 221     def close(self):
 222         """ Free all resources used by the analyzer.
 223         """
 224         if self.conn:
 225             self.conn.close()
 226             self.conn = None
 227
 228
 229     def normalize(self, phrase):
 230         """ Normalize the given phrase, i.e. remove all properties that
 231             are irrelevant for search.
 232         """
 233         return self.normalizer.transliterate(phrase)
 234
 235     def make_standard_word(self, name):
 236         """ Create the normalised version of the name.
 237         """
 238         norm = ' ' + self.transliterator.transliterate(name) + ' '
 239         for full, abbr in self.abbreviations:
 240             if full in norm:
 241                 norm = norm.replace(full, abbr)
 242
 243         return norm.strip()
 244
 245
 246     def _make_standard_hnr(self, hnr):
 247         """ Create a normalised version of a housenumber.
 248
 249             This function takes minor shortcuts on transliteration.
 250         """
 251         if hnr.isdigit():
 252             return hnr
 253
 254         return self.transliterator.transliterate(hnr)
 255
 256     def add_postcodes_from_db(self):
 257         """ Add postcodes from the location_postcode table to the word table.
 258         """
 259         copystr = io.StringIO()
 260         with self.conn.cursor() as cur:
 261             cur.execute("SELECT distinct(postcode) FROM location_postcode")
 262             for (postcode, ) in cur:
 263                 copystr.write(postcode)
 264                 copystr.write('\t ')
 265                 copystr.write(self.transliterator.transliterate(postcode))
 266                 copystr.write('\tplace\tpostcode\t0\n')
 267
 268             cur.copy_from(copystr, 'word',
 269                           columns=['word', 'word_token', 'class', 'type',
 270                                    'search_name_count'])
 271             # Don't really need an ID for postcodes....
 272             # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 273             #                WHERE word_id is null and type = 'postcode'""")
 274
 275
 276     def update_special_phrases(self, phrases):
 277         """ Replace the search index for special phrases with the new phrases.
 278         """
 279         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 280                             for p in phrases))
 281
 282         with self.conn.cursor() as cur:
 283             # Get the old phrases.
 284             existing_phrases = set()
 285             cur.execute("""SELECT word, class, type, operator FROM word
 286                            WHERE class != 'place'
 287                                  OR (type != 'house' AND type != 'postcode')""")
 288             for label, cls, typ, oper in cur:
 289                 existing_phrases.add((label, cls, typ, oper or '-'))
 290
 291             to_add = norm_phrases - existing_phrases
 292             to_delete = existing_phrases - norm_phrases
 293
 294             if to_add:
 295                 copystr = io.StringIO()
 296                 for word, cls, typ, oper in to_add:
 297                     term = self.make_standard_word(word)
 298                     if term:
 299                         copystr.write(word)
 300                         copystr.write('\t ')
 301                         copystr.write(term)
 302                         copystr.write('\t')
 303                         copystr.write(cls)
 304                         copystr.write('\t')
 305                         copystr.write(typ)
 306                         copystr.write('\t')
 307                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 308                         copystr.write('\t0\n')
 309
 310                 cur.copy_from(copystr, 'word',
 311                               columns=['word', 'word_token', 'class', 'type',
 312                                        'operator', 'search_name_count'])
 313
 314             if to_delete:
 315                 psycopg2.extras.execute_values(
 316                     cur,
 317                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 318                         WHERE word = name and class = in_class and type = in_type
 319                               and ((op = '-' and operator is null) or op = operator)""",
 320                     to_delete)
 321
 322         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 323                  len(norm_phrases), len(to_add), len(to_delete))
 324
 325
 326     def add_country_names(self, country_code, names):
 327         """ Add names for the given country to the search index.
 328         """
 329         full_names = set((self.make_standard_word(n) for n in names))
 330         full_names.discard('')
 331         self._add_normalised_country_names(country_code, full_names)
 332
 333
 334     def _add_normalised_country_names(self, country_code, names):
 335         """ Add names for the given country to the search index.
 336         """
 337         with self.conn.cursor() as cur:
 338             # Get existing names
 339             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 340                         (country_code, ))
 341             new_names = names.difference((t[0] for t in cur))
 342
 343             if new_names:
 344                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 345                                                  search_name_count)
 346                                (SELECT nextval('seq_word'), token, '{}', 0
 347                                 FROM unnest(%s) as token)
 348                             """.format(country_code), (list(new_names),))
 349
 350
 351     def process_place(self, place):
 352         """ Determine tokenizer information about the given place.
 353
 354             Returns a JSON-serialisable structure that will be handed into
 355             the database via the token_info field.
 356         """
 357         token_info = _TokenInfo(self._cache)
 358
 359         names = place.get('name')
 360
 361         if names:
 362             full_names = set((self.make_standard_word(name) for name in names.values()))
 363             full_names.discard('')
 364
 365             token_info.add_names(self.conn, full_names)
 366
 367             country_feature = place.get('country_feature')
 368             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 369                 self._add_normalised_country_names(country_feature.lower(),
 370                                                    full_names)
 371
 372         address = place.get('address')
 373
 374         if address:
 375             hnrs = []
 376             addr_terms = []
 377             for key, value in address.items():
 378                 if key == 'postcode':
 379                     self._add_postcode(value)
 380                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 381                     hnrs.append(value)
 382                 elif key == 'street':
 383                     token_info.add_street(self.conn, self.make_standard_word(value))
 384                 elif key == 'place':
 385                     token_info.add_place(self.conn, self.make_standard_word(value))
 386                 elif not key.startswith('_') and \
 387                      key not in ('country', 'full'):
 388                     addr_terms.append((key, self.make_standard_word(value)))
 389
 390             if hnrs:
 391                 hnrs = self._split_housenumbers(hnrs)
 392                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 393
 394             if addr_terms:
 395                 token_info.add_address_terms(self.conn, addr_terms)
 396
 397         return token_info.data
 398
 399
 400     def _add_postcode(self, postcode):
 401         """ Make sure the normalized postcode is present in the word table.
 402         """
 403         if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
 404             term = self.make_standard_word(postcode)
 405             if not term:
 406                 return
 407
 408             with self.conn.cursor() as cur:
 409                 # no word_id needed for postcodes
 410                 cur.execute("""INSERT INTO word (word, word_token, class, type,
 411                                                  search_name_count)
 412                                (SELECT pc, %s, 'place', 'postcode', 0
 413                                 FROM (VALUES (%s)) as v(pc)
 414                                 WHERE NOT EXISTS
 415                                  (SELECT * FROM word
 416                                   WHERE word = pc and class='place' and type='postcode'))
 417                             """, (' ' + term, postcode))
 418             self._cache.postcodes.add(postcode)
 419
 420     @staticmethod
 421     def _split_housenumbers(hnrs):
 422         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 423             # split numbers if necessary
 424             simple_list = []
 425             for hnr in hnrs:
 426                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 427
 428             if len(simple_list) > 1:
 429                 hnrs = list(set(simple_list))
 430             else:
 431                 hnrs = simple_list
 432
 433         return hnrs
 434
 435
 436
 437
 438 class _TokenInfo:
 439     """ Collect token information to be sent back to the database.
 440     """
 441     def __init__(self, cache):
 442         self.cache = cache
 443         self.data = {}
 444
 445     @staticmethod
 446     def _mk_array(tokens):
 447         return '{%s}' % ','.join((str(s) for s in tokens))
 448
 449
 450     def add_names(self, conn, names):
 451         """ Adds token information for the normalised names.
 452         """
 453         # Start with all partial names
 454         terms = set((part for ns in names for part in ns.split()))
 455         # Add partials for the full terms (TO BE REMOVED)
 456         terms.update((n for n in names))
 457         # Add the full names
 458         terms.update((' ' + n for n in names))
 459
 460         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 461
 462
 463     def add_housenumbers(self, conn, hnrs):
 464         """ Extract housenumber information from a list of normalised
 465             housenumbers.
 466         """
 467         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 468         self.data['hnr'] = ';'.join(hnrs)
 469
 470
 471     def add_street(self, conn, street):
 472         """ Add addr:street match terms.
 473         """
 474         if not street:
 475             return
 476
 477         term = ' ' + street
 478
 479         tid = self.cache.names.get(term)
 480
 481         if tid is None:
 482             with conn.cursor() as cur:
 483                 cur.execute("""SELECT word_id FROM word
 484                                 WHERE word_token = %s
 485                                       and class is null and type is null""",
 486                             (term, ))
 487                 if cur.rowcount > 0:
 488                     tid = cur.fetchone()[0]
 489                     self.cache.names[term] = tid
 490
 491         if tid is not None:
 492             self.data['street'] = '{%d}' % tid
 493
 494
 495     def add_place(self, conn, place):
 496         """ Add addr:place search and match terms.
 497         """
 498         if not place:
 499             return
 500
 501         partial_ids = self.cache.get_term_tokens(conn, place.split())
 502         tid = self.cache.get_term_tokens(conn, [' ' + place])
 503
 504         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 505         self.data['place_match'] = '{%s}' % tid[0]
 506
 507
 508     def add_address_terms(self, conn, terms):
 509         """ Add additional address terms.
 510         """
 511         tokens = {}
 512
 513         for key, value in terms:
 514             if not value:
 515                 continue
 516             partial_ids = self.cache.get_term_tokens(conn, value.split())
 517             term = ' ' + value
 518             tid = self.cache.names.get(term)
 519
 520             if tid is None:
 521                 with conn.cursor() as cur:
 522                     cur.execute("""SELECT word_id FROM word
 523                                     WHERE word_token = %s
 524                                           and class is null and type is null""",
 525                                 (term, ))
 526                     if cur.rowcount > 0:
 527                         tid = cur.fetchone()[0]
 528                         self.cache.names[term] = tid
 529
 530             tokens[key] = [self._mk_array(partial_ids),
 531                            '{%s}' % ('' if tid is None else str(tid))]
 532
 533         if tokens:
 534             self.data['addr'] = tokens
 535
 536
 537 class _TokenCache:
 538     """ Cache for token information to avoid repeated database queries.
 539
 540         This cache is not thread-safe and needs to be instantiated per
 541         analyzer.
 542     """
 543     def __init__(self):
 544         self.names = {}
 545         self.postcodes = set()
 546         self.housenumbers = {}
 547
 548
 549     def get_term_tokens(self, conn, terms):
 550         """ Get token ids for a list of terms, looking them up in the database
 551             if necessary.
 552         """
 553         tokens = []
 554         askdb = []
 555
 556         for term in terms:
 557             token = self.names.get(term)
 558             if token is None:
 559                 askdb.append(term)
 560             elif token != 0:
 561                 tokens.append(token)
 562
 563         if askdb:
 564             with conn.cursor() as cur:
 565                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 566                             (askdb, ))
 567                 for term, tid in cur:
 568                     self.names[term] = tid
 569                     if tid != 0:
 570                         tokens.append(tid)
 571
 572         return tokens
 573
 574
 575     def get_hnr_tokens(self, conn, terms):
 576         """ Get token ids for a list of housenumbers, looking them up in the
 577             database if necessary.
 578         """
 579         tokens = []
 580         askdb = []
 581
 582         for term in terms:
 583             token = self.housenumbers.get(term)
 584             if token is None:
 585                 askdb.append(term)
 586             else:
 587                 tokens.append(token)
 588
 589         if askdb:
 590             with conn.cursor() as cur:
 591                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 592                             (askdb, ))
 593                 for term, tid in cur:
 594                     self.housenumbers[term] = tid
 595                     tokens.append(tid)
 596
 597         return tokens