nominatim/tokenizer/legacy_icu_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4 but using
   3 libICU instead of the PostgreSQL module.
   4 """
   5 from collections import Counter
   6 import io
   7 import itertools
   8 import json
   9 import logging
  10 import re
  11 from textwrap import dedent
  12 from pathlib import Path
  13
  14 from icu import Transliterator
  15 import psycopg2.extras
  16
  17 from nominatim.db.connection import connect
  18 from nominatim.db.properties import set_property, get_property
  19 from nominatim.db.sql_preprocessor import SQLPreprocessor
  20
# Keys of the database properties under which the tokenizer keeps its
# settings (written by _save_config(), read back by init_from_project()
# and update_sql_functions()).
DBCFG_NORMALIZATION = "tokenizer_normalization"
DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
DBCFG_TRANSLITERATION = "tokenizer_transliteration"
DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"

# getLogger() is called without a name, so this is the root logger.
LOG = logging.getLogger()
  27
  28 def create(dsn, data_dir):
  29     """ Create a new instance of the tokenizer provided by this module.
  30     """
  31     return LegacyICUTokenizer(dsn, data_dir)
  32
  33
  34 class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
  38     """
  39
  40     def __init__(self, dsn, data_dir):
  41         self.dsn = dsn
  42         self.data_dir = data_dir
  43         self.normalization = None
  44         self.transliteration = None
  45         self.abbreviations = None
  46
  47
  48     def init_new_db(self, config, init_db=True):
  49         """ Set up a new tokenizer for the database.
  50
  51             This copies all necessary data in the project directory to make
  52             sure the tokenizer remains stable even over updates.
  53         """
  54         if config.TOKENIZER_CONFIG:
  55             cfgfile = Path(config.TOKENIZER_CONFIG)
  56         else:
  57             cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
  58
  59         rules = json.loads(cfgfile.read_text())
  60         self.transliteration = ';'.join(rules['normalization']) + ';'
  61         self.abbreviations = rules["abbreviations"]
  62         self.normalization = config.TERM_NORMALIZATION
  63
  64         self._install_php(config)
  65         self._save_config(config)
  66
  67         if init_db:
  68             self.update_sql_functions(config)
  69             self._init_db_tables(config)
  70
  71
  72     def init_from_project(self):
  73         """ Initialise the tokenizer from the project directory.
  74         """
  75         with connect(self.dsn) as conn:
  76             self.normalization = get_property(conn, DBCFG_NORMALIZATION)
  77             self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
  78             self.abbreviations = json.loads(get_property(conn, DBCFG_ABBREVIATIONS))
  79
  80
  81     def finalize_import(self, config):
  82         """ Do any required postprocessing to make the tokenizer data ready
  83             for use.
  84         """
  85         with connect(self.dsn) as conn:
  86             sqlp = SQLPreprocessor(conn, config)
  87             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
  88
  89
  90     def update_sql_functions(self, config):
  91         """ Reimport the SQL functions for this tokenizer.
  92         """
  93         with connect(self.dsn) as conn:
  94             max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
  95             sqlp = SQLPreprocessor(conn, config)
  96             sqlp.run_sql_file(conn, 'tokenizer/legacy_icu_tokenizer.sql',
  97                               max_word_freq=max_word_freq)
  98
  99
 100     def check_database(self):
 101         """ Check that the tokenizer is set up correctly.
 102         """
 103         self.init_from_project()
 104
 105         if self.normalization is None\
 106            or self.transliteration is None\
 107            or self.abbreviations is None:
 108             return "Configuration for tokenizer 'legacy_icu' are missing."
 109
 110         return None
 111
 112
 113     def name_analyzer(self):
 114         """ Create a new analyzer for tokenizing names and queries
 115             using this tokinzer. Analyzers are context managers and should
 116             be used accordingly:
 117
 118             ```
 119             with tokenizer.name_analyzer() as analyzer:
 120                 analyser.tokenize()
 121             ```
 122
 123             When used outside the with construct, the caller must ensure to
 124             call the close() function before destructing the analyzer.
 125
 126             Analyzers are not thread-safe. You need to instantiate one per thread.
 127         """
 128         norm = Transliterator.createFromRules("normalizer", self.normalization)
 129         trans = Transliterator.createFromRules("normalizer", self.transliteration)
 130         return LegacyICUNameAnalyzer(self.dsn, norm, trans, self.abbreviations)
 131
 132
 133     def _install_php(self, config):
 134         """ Install the php script for the tokenizer.
 135         """
 136         php_file = self.data_dir / "tokenizer.php"
 137         php_file.write_text(dedent("""\
 138             <?php
 139             @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
 140             @define('CONST_Term_Normalization_Rules', "{0.normalization}");
 141             @define('CONST_Transliteration'. "{0.transliteration}");
 142             # XXX abreviations
 143             require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
 144             """.format(self, config)))
 145
 146
 147     def _save_config(self, config):
 148         """ Save the configuration that needs to remain stable for the given
 149             database as database properties.
 150         """
 151         with connect(self.dsn) as conn:
 152             set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 153             set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 154             set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
 155             set_property(conn, DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations))
 156
 157
 158     def _init_db_tables(self, config):
 159         """ Set up the word table and fill it with pre-computed word
 160             frequencies.
 161         """
 162         with connect(self.dsn) as conn:
 163             sqlp = SQLPreprocessor(conn, config)
 164             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 165             conn.commit()
 166
 167             LOG.warning("Precomputing word tokens")
 168
 169             # get partial words and their frequencies
 170             words = Counter()
 171             with self.name_analyzer() as analyzer:
 172                 with conn.cursor(name="words") as cur:
 173                     cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
 174
 175                     for name, cnt in cur:
 176                         term = analyzer.make_standard_word(name)
 177                         if term:
 178                             for word in term.split():
 179                                 words[word] += cnt
 180
 181             # copy them back into the word table
 182             copystr = io.StringIO(''.join(('{}\t{}\n'.format(*args) for args in words.items())))
 183
 184             with conn.cursor() as cur:
 185                 cur.copy_from(copystr, 'word', columns=['word_token', 'search_name_count'])
 186                 cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 187                                WHERE word_id is null""")
 188
 189             conn.commit()
 190
 191
 192 class LegacyICUNameAnalyzer:
 193     """ The legacy analyzer uses the ICU library for splitting names.
 194
 195         Each instance opens a connection to the database to request the
 196         normalization.
 197     """
 198
 199     def __init__(self, dsn, normalizer, transliterator, abbreviations):
 200         self.conn = connect(dsn).connection
 201         self.conn.autocommit = True
 202         self.normalizer = normalizer
 203         self.transliterator = transliterator
 204         self.abbreviations = abbreviations
 205         #psycopg2.extras.register_hstore(self.conn)
 206
 207         self._cache = _TokenCache()
 208
 209
 210     def __enter__(self):
 211         return self
 212
 213
 214     def __exit__(self, exc_type, exc_value, traceback):
 215         self.close()
 216
 217
 218     def close(self):
 219         """ Free all resources used by the analyzer.
 220         """
 221         if self.conn:
 222             self.conn.close()
 223             self.conn = None
 224
 225
 226     def normalize(self, phrase):
 227         """ Normalize the given phrase, i.e. remove all properties that
 228             are irrelevant for search.
 229         """
 230         return self.normalizer.transliterate(phrase)
 231
 232     def make_standard_word(self, name):
 233         """ Create the normalised version of the name.
 234         """
 235         norm = ' ' + self.transliterator.transliterate(name) + ' '
 236         for full, abbr in self.abbreviations:
 237             if full in norm:
 238                 norm = norm.replace(full, abbr)
 239
 240         return norm.strip()
 241
 242
 243     def _make_standard_hnr(self, hnr):
 244         """ Create a normalised version of a housenumber.
 245
 246             This function takes minor shortcuts on transliteration.
 247         """
 248         if hnr.isdigit():
 249             return hnr
 250
 251         return self.transliterator.transliterate(hnr)
 252
 253     def add_postcodes_from_db(self):
 254         """ Add postcodes from the location_postcode table to the word table.
 255         """
 256         copystr = io.StringIO()
 257         with self.conn.cursor() as cur:
 258             cur.execute("SELECT distinct(postcode) FROM location_postcode")
 259             for (postcode, ) in cur:
 260                 copystr.write(postcode)
 261                 copystr.write('\t ')
 262                 copystr.write(self.transliterator.transliterate(postcode))
 263                 copystr.write('\tplace\tpostcode\t0\n')
 264
 265             cur.copy_from(copystr, 'word',
 266                           columns=['word', 'word_token', 'class', 'type',
 267                                    'search_name_count'])
 268             # Don't really need an ID for postcodes....
 269             # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
 270             #                WHERE word_id is null and type = 'postcode'""")
 271
 272
 273     def update_special_phrases(self, phrases):
 274         """ Replace the search index for special phrases with the new phrases.
 275         """
 276         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 277                             for p in phrases))
 278
 279         with self.conn.cursor() as cur:
 280             # Get the old phrases.
 281             existing_phrases = set()
 282             cur.execute("""SELECT word, class, type, operator FROM word
 283                            WHERE class != 'place'
 284                                  OR (type != 'house' AND type != 'postcode')""")
 285             for label, cls, typ, oper in cur:
 286                 existing_phrases.add((label, cls, typ, oper or '-'))
 287
 288             to_add = norm_phrases - existing_phrases
 289             to_delete = existing_phrases - norm_phrases
 290
 291             if to_add:
 292                 copystr = io.StringIO()
 293                 for word, cls, typ, oper in to_add:
 294                     term = self.make_standard_word(word)
 295                     if term:
 296                         copystr.write(word)
 297                         copystr.write('\t ')
 298                         copystr.write(term)
 299                         copystr.write('\t')
 300                         copystr.write(cls)
 301                         copystr.write('\t')
 302                         copystr.write(typ)
 303                         copystr.write('\t')
 304                         copystr.write(oper if oper in ('in', 'near')  else '\\N')
 305                         copystr.write('\t0\n')
 306
 307                 cur.copy_from(copystr, 'word',
 308                               columns=['word', 'word_token', 'class', 'type',
 309                                        'operator', 'search_name_count'])
 310
 311             if to_delete:
 312                 psycopg2.extras.execute_values(
 313                     cur,
 314                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 315                         WHERE word = name and class = in_class and type = in_type
 316                               and ((op = '-' and operator is null) or op = operator)""",
 317                     to_delete)
 318
 319         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 320                  len(norm_phrases), len(to_add), len(to_delete))
 321
 322
 323     def add_country_names(self, country_code, names):
 324         """ Add names for the given country to the search index.
 325         """
 326         full_names = set((self.make_standard_word(n) for n in names))
 327         full_names.discard('')
 328         self._add_normalised_country_names(country_code, full_names)
 329
 330
 331     def _add_normalised_country_names(self, country_code, names):
 332         """ Add names for the given country to the search index.
 333         """
 334         with self.conn.cursor() as cur:
 335             # Get existing names
 336             cur.execute("SELECT word_token FROM word WHERE country_code = %s",
 337                         (country_code, ))
 338             new_names = names.difference((t[0] for t in cur))
 339
 340             if new_names:
 341                 cur.execute("""INSERT INTO word (word_id, word_token, country_code,
 342                                                  search_name_count)
 343                                (SELECT nextval('seq_word'), token, '{}', 0
 344                                 FROM unnest(%s) as token)
 345                             """.format(country_code), (list(new_names),))
 346
 347
 348     def process_place(self, place):
 349         """ Determine tokenizer information about the given place.
 350
 351             Returns a JSON-serialisable structure that will be handed into
 352             the database via the token_info field.
 353         """
 354         token_info = _TokenInfo(self._cache)
 355
 356         names = place.get('name')
 357
 358         if names:
 359             full_names = set((self.make_standard_word(name) for name in names.values()))
 360             full_names.discard('')
 361
 362             token_info.add_names(self.conn, full_names)
 363
 364             country_feature = place.get('country_feature')
 365             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 366                 self._add_normalised_country_names(country_feature.lower(),
 367                                                    full_names)
 368
 369         address = place.get('address')
 370
 371         if address:
 372             hnrs = []
 373             addr_terms = []
 374             for key, value in address.items():
 375                 if key == 'postcode':
 376                     self._add_postcode(value)
 377                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 378                     hnrs.append(value)
 379                 elif key == 'street':
 380                     token_info.add_street(self.conn, self.make_standard_word(value))
 381                 elif key == 'place':
 382                     token_info.add_place(self.conn, self.make_standard_word(value))
 383                 elif not key.startswith('_') and \
 384                      key not in ('country', 'full'):
 385                     addr_terms.append((key, self.make_standard_word(value)))
 386
 387             if hnrs:
 388                 hnrs = self._split_housenumbers(hnrs)
 389                 token_info.add_housenumbers(self.conn, [self._make_standard_hnr(n) for n in hnrs])
 390
 391             if addr_terms:
 392                 token_info.add_address_terms(self.conn, addr_terms)
 393
 394         return token_info.data
 395
 396
 397     def _add_postcode(self, postcode):
 398         """ Make sure the normalized postcode is present in the word table.
 399         """
 400         if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
 401             term = self.make_standard_word(postcode)
 402             if not term:
 403                 return
 404
 405             with self.conn.cursor() as cur:
 406                 # no word_id needed for postcodes
 407                 cur.execute("""INSERT INTO word (word, word_token, class, type,
 408                                                  search_name_count)
 409                                (SELECT pc, %s, 'place', 'postcode', 0
 410                                 FROM (VALUES (%s)) as v(pc)
 411                                 WHERE NOT EXISTS
 412                                  (SELECT * FROM word
 413                                   WHERE word = pc and class='place' and type='postcode'))
 414                             """, (' ' + term, postcode))
 415             self._cache.postcodes.add(postcode)
 416
 417     @staticmethod
 418     def _split_housenumbers(hnrs):
 419         if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
 420             # split numbers if necessary
 421             simple_list = []
 422             for hnr in hnrs:
 423                 simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 424
 425             if len(simple_list) > 1:
 426                 hnrs = list(set(simple_list))
 427             else:
 428                 hnrs = simple_list
 429
 430         return hnrs
 431
 432
 433
 434
 435 class _TokenInfo:
 436     """ Collect token information to be sent back to the database.
 437     """
 438     def __init__(self, cache):
 439         self.cache = cache
 440         self.data = {}
 441
 442     @staticmethod
 443     def _mk_array(tokens):
 444         return '{%s}' % ','.join((str(s) for s in tokens))
 445
 446
 447     def add_names(self, conn, names):
 448         """ Adds token information for the normalised names.
 449         """
 450         # Start with all partial names
 451         terms = set((part for ns in names for part in ns.split()))
 452         # Add partials for the full terms (TO BE REMOVED)
 453         terms.update((n for n in names))
 454         # Add the full names
 455         terms.update((' ' + n for n in names))
 456
 457         self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
 458
 459
 460     def add_housenumbers(self, conn, hnrs):
 461         """ Extract housenumber information from a list of normalised
 462             housenumbers.
 463         """
 464         self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
 465         self.data['hnr'] = ';'.join(hnrs)
 466
 467
 468     def add_street(self, conn, street):
 469         """ Add addr:street match terms.
 470         """
 471         if not street:
 472             return
 473
 474         term = ' ' + street
 475
 476         tid = self.cache.names.get(term)
 477
 478         if tid is None:
 479             with conn.cursor() as cur:
 480                 cur.execute("""SELECT word_id FROM word
 481                                 WHERE word_token = %s
 482                                       and class is null and type is null""",
 483                             (term, ))
 484                 if cur.rowcount > 0:
 485                     tid = cur.fetchone()[0]
 486                     self.cache.names[term] = tid
 487
 488         if tid is not None:
 489             self.data['street'] = '{%d}' % tid
 490
 491
 492     def add_place(self, conn, place):
 493         """ Add addr:place search and match terms.
 494         """
 495         if not place:
 496             return
 497
 498         partial_ids = self.cache.get_term_tokens(conn, place.split())
 499         tid = self.cache.get_term_tokens(conn, [' ' + place])
 500
 501         self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
 502         self.data['place_match'] = '{%s}' % tid[0]
 503
 504
 505     def add_address_terms(self, conn, terms):
 506         """ Add additional address terms.
 507         """
 508         tokens = {}
 509
 510         for key, value in terms:
 511             if not value:
 512                 continue
 513             partial_ids = self.cache.get_term_tokens(conn, value.split())
 514             term = ' ' + value
 515             tid = self.cache.names.get(term)
 516
 517             if tid is None:
 518                 with conn.cursor() as cur:
 519                     cur.execute("""SELECT word_id FROM word
 520                                     WHERE word_token = %s
 521                                           and class is null and type is null""",
 522                                 (term, ))
 523                     if cur.rowcount > 0:
 524                         tid = cur.fetchone()[0]
 525                         self.cache.names[term] = tid
 526
 527             tokens[key] = [self._mk_array(partial_ids),
 528                            '{%s}' % ('' if tid is None else str(tid))]
 529
 530         if tokens:
 531             self.data['addr'] = tokens
 532
 533
 534 class _TokenCache:
 535     """ Cache for token information to avoid repeated database queries.
 536
 537         This cache is not thread-safe and needs to be instantiated per
 538         analyzer.
 539     """
 540     def __init__(self):
 541         self.names = {}
 542         self.postcodes = set()
 543         self.housenumbers = {}
 544
 545
 546     def get_term_tokens(self, conn, terms):
 547         """ Get token ids for a list of terms, looking them up in the database
 548             if necessary.
 549         """
 550         tokens = []
 551         askdb = []
 552
 553         for term in terms:
 554             token = self.names.get(term)
 555             if token is None:
 556                 askdb.append(term)
 557             elif token != 0:
 558                 tokens.append(token)
 559
 560         if askdb:
 561             with conn.cursor() as cur:
 562                 cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
 563                             (askdb, ))
 564                 for term, tid in cur:
 565                     self.names[term] = tid
 566                     if tid != 0:
 567                         tokens.append(tid)
 568
 569         return tokens
 570
 571
 572     def get_hnr_tokens(self, conn, terms):
 573         """ Get token ids for a list of housenumbers, looking them up in the
 574             database if necessary.
 575         """
 576         tokens = []
 577         askdb = []
 578
 579         for term in terms:
 580             token = self.housenumbers.get(term)
 581             if token is None:
 582                 askdb.append(term)
 583             else:
 584                 tokens.append(token)
 585
 586         if askdb:
 587             with conn.cursor() as cur:
 588                 cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
 589                             (askdb, ))
 590                 for term, tid in cur:
 591                     self.housenumbers[term] = tid
 592                     tokens.append(tid)
 593
 594         return tokens