nominatim/tokenizer/legacy_tokenizer.py

   1 """
   2 Tokenizer implementing normalisation as used before Nominatim 4.
   3 """
   4 from collections import OrderedDict
   5 import logging
   6 import re
   7 import shutil
   8
   9 from icu import Transliterator
  10 import psycopg2
  11 import psycopg2.extras
  12
  13 from nominatim.db.connection import connect
  14 from nominatim.db import properties
  15 from nominatim.db import utils as db_utils
  16 from nominatim.db.sql_preprocessor import SQLPreprocessor
  17 from nominatim.errors import UsageError
  18
  19 DBCFG_NORMALIZATION = "tokenizer_normalization"
  20 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
  21
  22 LOG = logging.getLogger()
  23
  24 def create(dsn, data_dir):
  25     """ Create a new instance of the tokenizer provided by this module.
  26     """
  27     return LegacyTokenizer(dsn, data_dir)
  28
  29
  30 def _install_module(config_module_path, src_dir, module_dir):
  31     """ Copies the PostgreSQL normalisation module into the project
  32         directory if necessary. For historical reasons the module is
  33         saved in the '/module' subdirectory and not with the other tokenizer
  34         data.
  35
  36         The function detects when the installation is run from the
  37         build directory. It doesn't touch the module in that case.
  38     """
  39     # Custom module locations are simply used as is.
  40     if config_module_path:
  41         LOG.info("Using custom path for database module at '%s'", config_module_path)
  42         return config_module_path
  43
  44     # Compatibility mode for builddir installations.
  45     if module_dir.exists() and src_dir.samefile(module_dir):
  46         LOG.info('Running from build directory. Leaving database module as is.')
  47         return module_dir
  48
  49     # In any other case install the module in the project directory.
  50     if not module_dir.exists():
  51         module_dir.mkdir()
  52
  53     destfile = module_dir / 'nominatim.so'
  54     shutil.copy(str(src_dir / 'nominatim.so'), str(destfile))
  55     destfile.chmod(0o755)
  56
  57     LOG.info('Database module installed at %s', str(destfile))
  58
  59     return module_dir
  60
  61
  62 def _check_module(module_dir, conn):
  63     """ Try to use the PostgreSQL module to confirm that it is correctly
  64         installed and accessible from PostgreSQL.
  65     """
  66     with conn.cursor() as cur:
  67         try:
  68             cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
  69                            RETURNS text AS '{}/nominatim.so', 'transliteration'
  70                            LANGUAGE c IMMUTABLE STRICT;
  71                            DROP FUNCTION nominatim_test_import_func(text)
  72                         """.format(module_dir))
  73         except psycopg2.DatabaseError as err:
  74             LOG.fatal("Error accessing database module: %s", err)
  75             raise UsageError("Database module cannot be accessed.") from err
  76
  77
  78 class LegacyTokenizer:
  79     """ The legacy tokenizer uses a special PostgreSQL module to normalize
  80         names and queries. The tokenizer thus implements normalization through
  81         calls to the database.
  82     """
  83
  84     def __init__(self, dsn, data_dir):
  85         self.dsn = dsn
  86         self.data_dir = data_dir
  87         self.normalization = None
  88
  89
  90     def init_new_db(self, config):
  91         """ Set up a new tokenizer for the database.
  92
  93             This copies all necessary data in the project directory to make
  94             sure the tokenizer remains stable even over updates.
  95         """
  96         module_dir = _install_module(config.DATABASE_MODULE_PATH,
  97                                      config.lib_dir.module,
  98                                      config.project_dir / 'module')
  99
 100         self.normalization = config.TERM_NORMALIZATION
 101
 102         with connect(self.dsn) as conn:
 103             _check_module(module_dir, conn)
 104             self._save_config(conn, config)
 105             conn.commit()
 106
 107         self.update_sql_functions(config)
 108         self._init_db_tables(config)
 109
 110
 111     def init_from_project(self):
 112         """ Initialise the tokenizer from the project directory.
 113         """
 114         with connect(self.dsn) as conn:
 115             self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION)
 116
 117
 118     def update_sql_functions(self, config):
 119         """ Reimport the SQL functions for this tokenizer.
 120         """
 121         with connect(self.dsn) as conn:
 122             max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
 123             modulepath = config.DATABASE_MODULE_PATH or \
 124                          str((config.project_dir / 'module').resolve())
 125             sqlp = SQLPreprocessor(conn, config)
 126             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer.sql',
 127                               max_word_freq=max_word_freq,
 128                               modulepath=modulepath)
 129
 130
 131     def migrate_database(self, config):
 132         """ Initialise the project directory of an existing database for
 133             use with this tokenizer.
 134
 135             This is a special migration function for updating existing databases
 136             to new software versions.
 137         """
 138         module_dir = _install_module(config.DATABASE_MODULE_PATH,
 139                                      config.lib_dir.module,
 140                                      config.project_dir / 'module')
 141
 142         with connect(self.dsn) as conn:
 143             _check_module(module_dir, conn)
 144             self._save_config(conn, config)
 145
 146
 147     def name_analyzer(self):
 148         """ Create a new analyzer for tokenizing names and queries
 149             using this tokinzer. Analyzers are context managers and should
 150             be used accordingly:
 151
 152             ```
 153             with tokenizer.name_analyzer() as analyzer:
 154                 analyser.tokenize()
 155             ```
 156
 157             When used outside the with construct, the caller must ensure to
 158             call the close() function before destructing the analyzer.
 159
 160             Analyzers are not thread-safe. You need to instantiate one per thread.
 161         """
 162         normalizer = Transliterator.createFromRules("phrase normalizer",
 163                                                     self.normalization)
 164         return LegacyNameAnalyzer(self.dsn, normalizer)
 165
 166
 167     def _init_db_tables(self, config):
 168         """ Set up the word table and fill it with pre-computed word
 169             frequencies.
 170         """
 171         with connect(self.dsn) as conn:
 172             sqlp = SQLPreprocessor(conn, config)
 173             sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
 174             conn.commit()
 175
 176         LOG.warning("Precomputing word tokens")
 177         db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
 178
 179
 180     def _save_config(self, conn, config):
 181         """ Save the configuration that needs to remain stable for the given
 182             database as database properties.
 183         """
 184         properties.set_property(conn, DBCFG_NORMALIZATION, self.normalization)
 185         properties.set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
 186
 187
 188 class LegacyNameAnalyzer:
 189     """ The legacy analyzer uses the special Postgresql module for
 190         splitting names.
 191
 192         Each instance opens a connection to the database to request the
 193         normalization.
 194     """
 195
 196     def __init__(self, dsn, normalizer):
 197         self.conn = connect(dsn).connection
 198         self.conn.autocommit = True
 199         self.normalizer = normalizer
 200         psycopg2.extras.register_hstore(self.conn)
 201
 202         self._cache = _TokenCache(self.conn)
 203
 204
 205     def __enter__(self):
 206         return self
 207
 208
 209     def __exit__(self, exc_type, exc_value, traceback):
 210         self.close()
 211
 212
 213     def close(self):
 214         """ Free all resources used by the analyzer.
 215         """
 216         if self.conn:
 217             self.conn.close()
 218             self.conn = None
 219
 220
 221     def normalize(self, phrase):
 222         """ Normalize the given phrase, i.e. remove all properties that
 223             are irrelevant for search.
 224         """
 225         return self.normalizer.transliterate(phrase)
 226
 227
 228     def add_postcodes_from_db(self):
 229         """ Add postcodes from the location_postcode table to the word table.
 230         """
 231         with self.conn.cursor() as cur:
 232             cur.execute("""SELECT count(create_postcode_id(pc))
 233                            FROM (SELECT distinct(postcode) as pc
 234                                  FROM location_postcode) x""")
 235
 236
 237     def update_special_phrases(self, phrases):
 238         """ Replace the search index for special phrases with the new phrases.
 239         """
 240         norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
 241                             for p in phrases))
 242
 243         with self.conn.cursor() as cur:
 244             # Get the old phrases.
 245             existing_phrases = set()
 246             cur.execute("""SELECT word, class, type, operator FROM word
 247                            WHERE class != 'place'
 248                                  OR (type != 'house' AND type != 'postcode')""")
 249             for label, cls, typ, oper in cur:
 250                 existing_phrases.add((label, cls, typ, oper or '-'))
 251
 252             to_add = norm_phrases - existing_phrases
 253             to_delete = existing_phrases - norm_phrases
 254
 255             if to_add:
 256                 psycopg2.extras.execute_values(
 257                     cur,
 258                     """ INSERT INTO word (word_id, word_token, word, class, type,
 259                                           search_name_count, operator)
 260                         (SELECT nextval('seq_word'), make_standard_name(name), name,
 261                                 class, type, 0,
 262                                 CASE WHEN op in ('in', 'near') THEN op ELSE null END
 263                            FROM (VALUES %s) as v(name, class, type, op))""",
 264                     to_add)
 265
 266             if to_delete:
 267                 psycopg2.extras.execute_values(
 268                     cur,
 269                     """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
 270                         WHERE word = name and class = in_class and type = in_type
 271                               and ((op = '-' and operator is null) or op = operator)""",
 272                     to_delete)
 273
 274         LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
 275                  len(norm_phrases), len(to_add), len(to_delete))
 276
 277
 278     def add_country_names(self, country_code, names):
 279         """ Add names for the given country to the search index.
 280         """
 281         with self.conn.cursor() as cur:
 282             cur.execute(
 283                 """INSERT INTO word (word_id, word_token, country_code)
 284                    (SELECT nextval('seq_word'), lookup_token, %s
 285                       FROM (SELECT ' ' || make_standard_name(n) as lookup_token
 286                             FROM unnest(%s)n) y
 287                       WHERE NOT EXISTS(SELECT * FROM word
 288                                        WHERE word_token = lookup_token and country_code = %s))
 289                 """, (country_code, names, country_code))
 290
 291
 292     def process_place(self, place):
 293         """ Determine tokenizer information about the given place.
 294
 295             Returns a JSON-serialisable structure that will be handed into
 296             the database via the token_info field.
 297         """
 298         token_info = _TokenInfo(self._cache)
 299
 300         names = place.get('name')
 301
 302         if names:
 303             token_info.add_names(self.conn, names)
 304
 305             country_feature = place.get('country_feature')
 306             if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
 307                 self.add_country_names(country_feature.lower(), list(names.values()))
 308
 309         address = place.get('address')
 310
 311         if address:
 312             hnrs = []
 313             addr_terms = []
 314             for key, value in address.items():
 315                 if key == 'postcode':
 316                     self._add_postcode(value)
 317                 elif key in ('housenumber', 'streetnumber', 'conscriptionnumber'):
 318                     hnrs.append(value)
 319                 elif key == 'street':
 320                     token_info.add_street(self.conn, value)
 321                 elif key == 'place':
 322                     token_info.add_place(self.conn, value)
 323                 elif not key.startswith('_') and \
 324                      key not in ('country', 'full'):
 325                     addr_terms.append((key, value))
 326
 327             if hnrs:
 328                 token_info.add_housenumbers(self.conn, hnrs)
 329
 330             if addr_terms:
 331                 token_info.add_address_terms(self.conn, addr_terms)
 332
 333         return token_info.data
 334
 335
 336     def _add_postcode(self, postcode):
 337         """ Make sure the normalized postcode is present in the word table.
 338         """
 339         def _create_postcode_from_db(pcode):
 340             with self.conn.cursor() as cur:
 341                 cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
 342
 343         if re.search(r'[:,;]', postcode) is None:
 344             self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
 345
 346
 347 class _TokenInfo:
 348     """ Collect token information to be sent back to the database.
 349     """
 350     def __init__(self, cache):
 351         self.cache = cache
 352         self.data = {}
 353
 354
 355     def add_names(self, conn, names):
 356         """ Add token information for the names of the place.
 357         """
 358         with conn.cursor() as cur:
 359             # Create the token IDs for all names.
 360             self.data['names'] = cur.scalar("SELECT make_keywords(%s)::text",
 361                                             (names, ))
 362
 363
 364     def add_housenumbers(self, conn, hnrs):
 365         """ Extract housenumber information from the address.
 366         """
 367         if len(hnrs) == 1:
 368             token = self.cache.get_housenumber(hnrs[0])
 369             if token is not None:
 370                 self.data['hnr_tokens'] = token
 371                 self.data['hnr'] = hnrs[0]
 372                 return
 373
 374         # split numbers if necessary
 375         simple_list = []
 376         for hnr in hnrs:
 377             simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
 378
 379         if len(simple_list) > 1:
 380             simple_list = list(set(simple_list))
 381
 382         with conn.cursor() as cur:
 383             cur.execute("SELECT (create_housenumbers(%s)).* ", (simple_list, ))
 384             self.data['hnr_tokens'], self.data['hnr'] = cur.fetchone()
 385
 386
 387     def add_street(self, conn, street):
 388         """ Add addr:street match terms.
 389         """
 390         def _get_street(name):
 391             with conn.cursor() as cur:
 392                 return cur.scalar("SELECT word_ids_from_name(%s)::text", (name, ))
 393
 394         self.data['street'] = self.cache.streets.get(street, _get_street)
 395
 396
 397     def add_place(self, conn, place):
 398         """ Add addr:place search and match terms.
 399         """
 400         def _get_place(name):
 401             with conn.cursor() as cur:
 402                 cur.execute("""SELECT (addr_ids_from_name(%s)
 403                                        || getorcreate_name_id(make_standard_name(%s), ''))::text,
 404                                       word_ids_from_name(%s)::text""",
 405                             (name, name, name))
 406                 return cur.fetchone()
 407
 408         self.data['place_search'], self.data['place_match'] = \
 409             self.cache.places.get(place, _get_place)
 410
 411
 412     def add_address_terms(self, conn, terms):
 413         """ Add additional address terms.
 414         """
 415         def _get_address_term(name):
 416             with conn.cursor() as cur:
 417                 cur.execute("""SELECT addr_ids_from_name(%s)::text,
 418                                       word_ids_from_name(%s)::text""",
 419                             (name, name))
 420                 return cur.fetchone()
 421
 422         tokens = {}
 423         for key, value in terms:
 424             tokens[key] = self.cache.address_terms.get(value, _get_address_term)
 425
 426         self.data['addr'] = tokens
 427
 428
 429 class _LRU:
 430     """ Least recently used cache that accepts a generator function to
 431         produce the item when there is a cache miss.
 432     """
 433
 434     def __init__(self, maxsize=128, init_data=None):
 435         self.data = init_data or OrderedDict()
 436         self.maxsize = maxsize
 437         if init_data is not None and len(init_data) > maxsize:
 438             self.maxsize = len(init_data)
 439
 440     def get(self, key, generator):
 441         """ Get the item with the given key from the cache. If nothing
 442             is found in the cache, generate the value through the
 443             generator function and store it in the cache.
 444         """
 445         value = self.data.get(key)
 446         if value is not None:
 447             self.data.move_to_end(key)
 448         else:
 449             value = generator(key)
 450             if len(self.data) >= self.maxsize:
 451                 self.data.popitem(last=False)
 452             self.data[key] = value
 453
 454         return value
 455
 456
 457 class _TokenCache:
 458     """ Cache for token information to avoid repeated database queries.
 459
 460         This cache is not thread-safe and needs to be instantiated per
 461         analyzer.
 462     """
 463     def __init__(self, conn):
 464         # various LRU caches
 465         self.streets = _LRU(maxsize=256)
 466         self.places = _LRU(maxsize=128)
 467         self.address_terms = _LRU(maxsize=1024)
 468
 469         # Lookup houseunumbers up to 100 and cache them
 470         with conn.cursor() as cur:
 471             cur.execute("""SELECT i, ARRAY[getorcreate_housenumber_id(i::text)]::text
 472                            FROM generate_series(1, 100) as i""")
 473             self._cached_housenumbers = {str(r[0]) : r[1] for r in cur}
 474
 475         # Get postcodes that are already saved
 476         postcodes = OrderedDict()
 477         with conn.cursor() as cur:
 478             cur.execute("""SELECT word FROM word
 479                            WHERE class ='place' and type = 'postcode'""")
 480             for row in cur:
 481                 postcodes[row[0]] = None
 482         self.postcodes = _LRU(maxsize=32, init_data=postcodes)
 483
 484     def get_housenumber(self, number):
 485         """ Get a housenumber token from the cache.
 486         """
 487         return self._cached_housenumbers.get(number)