X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/3eb4d8805700ba12bd601e552c3bc48064423083..1ffb6bd5d0e1aea120f953a55d72025f47206242:/nominatim/tokenizer/legacy_tokenizer.py diff --git a/nominatim/tokenizer/legacy_tokenizer.py b/nominatim/tokenizer/legacy_tokenizer.py index d6755835..4c03678d 100644 --- a/nominatim/tokenizer/legacy_tokenizer.py +++ b/nominatim/tokenizer/legacy_tokenizer.py @@ -119,6 +119,15 @@ class LegacyTokenizer: self.normalization = properties.get_property(conn, DBCFG_NORMALIZATION) + def finalize_import(self, config): + """ Do any required postprocessing to make the tokenizer data ready + for use. + """ + with connect(self.dsn) as conn: + sqlp = SQLPreprocessor(conn, config) + sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql') + + def update_sql_functions(self, config): """ Reimport the SQL functions for this tokenizer. """ @@ -132,6 +141,33 @@ class LegacyTokenizer: modulepath=modulepath) + def check_database(self): + """ Check that the tokenizer is set up correctly. + """ + hint = """\ + The Postgresql extension nominatim.so was not correctly loaded. + + Error: {error} + + Hints: + * Check the output of the CMmake/make installation step + * Does nominatim.so exist? + * Does nominatim.so exist on the database server? + * Can nominatim.so be accessed by the database user? + """ + with connect(self.dsn) as conn: + with conn.cursor() as cur: + try: + out = cur.scalar("SELECT make_standard_name('a')") + except psycopg2.Error as err: + return hint.format(error=str(err)) + + if out != 'a': + return hint.format(error='Unexpected result for make_standard_name()') + + return None + + def migrate_database(self, config): """ Initialise the project directory of an existing database for use with this tokenizer. @@ -235,6 +271,33 @@ class LegacyNameAnalyzer: self.conn = None + @staticmethod + def get_word_token_info(conn, words): + """ Return token information for the given list of words. + If a word starts with # it is assumed to be a full name + otherwise is a partial name. + + The function returns a list of tuples with + (original word, word token, word id). + + The function is used for testing and debugging only + and not necessarily efficient. + """ + with conn.cursor() as cur: + cur.execute("""SELECT t.term, word_token, word_id + FROM word, (SELECT unnest(%s::TEXT[]) as term) t + WHERE word_token = (CASE + WHEN left(t.term, 1) = '#' THEN + ' ' || make_standard_name(substring(t.term from 2)) + ELSE + make_standard_name(t.term) + END) + and class is null and country_code is null""", + (words, )) + + return [(r[0], r[1], r[2]) for r in cur] + + def normalize(self, phrase): """ Normalize the given phrase, i.e. remove all properties that are irrelevant for search. @@ -242,16 +305,54 @@ class LegacyNameAnalyzer: return self.normalizer.transliterate(phrase) - def add_postcodes_from_db(self): - """ Add postcodes from the location_postcode table to the word table. + @staticmethod + def normalize_postcode(postcode): + """ Convert the postcode to a standardized form. + + This function must yield exactly the same result as the SQL function + 'token_normalized_postcode()'. + """ + return postcode.strip().upper() + + + def update_postcodes_from_db(self): + """ Update postcode tokens in the word table from the location_postcode + table. """ with self.conn.cursor() as cur: - cur.execute("""SELECT count(create_postcode_id(pc)) - FROM (SELECT distinct(postcode) as pc - FROM location_postcode) x""") + # This finds us the rows in location_postcode and word that are + # missing in the other table. + cur.execute("""SELECT * FROM + (SELECT pc, word FROM + (SELECT distinct(postcode) as pc FROM location_postcode) p + FULL JOIN + (SELECT word FROM word + WHERE class ='place' and type = 'postcode') w + ON pc = word) x + WHERE pc is null or word is null""") + + to_delete = [] + to_add = [] + + for postcode, word in cur: + if postcode is None: + to_delete.append(word) + else: + to_add.append(postcode) + + if to_delete: + cur.execute("""DELETE FROM WORD + WHERE class ='place' and type = 'postcode' + and word = any(%s) + """, (to_delete, )) + if to_add: + cur.execute("""SELECT count(create_postcode_id(pc)) + FROM unnest(%s) as pc + """, (to_add, )) + - def update_special_phrases(self, phrases): + def update_special_phrases(self, phrases, should_replace): """ Replace the search index for special phrases with the new phrases. """ norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3]) @@ -280,7 +381,7 @@ class LegacyNameAnalyzer: FROM (VALUES %s) as v(name, class, type, op))""", to_add) - if to_delete: + if to_delete and should_replace: psycopg2.extras.execute_values( cur, """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op) @@ -353,12 +454,8 @@ class LegacyNameAnalyzer: def _add_postcode(self, postcode): """ Make sure the normalized postcode is present in the word table. """ - def _create_postcode_from_db(pcode): - with self.conn.cursor() as cur: - cur.execute('SELECT create_postcode_id(%s)', (pcode, )) - if re.search(r'[:,;]', postcode) is None: - self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db) + self._cache.add_postcode(self.conn, self.normalize_postcode(postcode)) class _TokenInfo: @@ -489,16 +586,19 @@ class _TokenCache: FROM generate_series(1, 100) as i""") self._cached_housenumbers = {str(r[0]) : r[1] for r in cur} - # Get postcodes that are already saved - postcodes = OrderedDict() - with conn.cursor() as cur: - cur.execute("""SELECT word FROM word - WHERE class ='place' and type = 'postcode'""") - for row in cur: - postcodes[row[0]] = None - self.postcodes = _LRU(maxsize=32, init_data=postcodes) + # For postcodes remember the ones that have already been added + self.postcodes = set() def get_housenumber(self, number): """ Get a housenumber token from the cache. """ return self._cached_housenumbers.get(number) + + + def add_postcode(self, conn, postcode): + """ Make sure the given postcode is in the database. + """ + if postcode not in self.postcodes: + with conn.cursor() as cur: + cur.execute('SELECT create_postcode_id(%s)', (postcode, )) + self.postcodes.add(postcode)