1 # SPDX-License-Identifier: GPL-3.0-or-later
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2024 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Module containing the class handling the import
9 of the special phrases.
11 Phrases are analyzed and imported into the database.
The phrases already present in the database which are not
valid anymore are removed.
16 from typing import Iterable, Tuple, Mapping, Sequence, Optional, Set
20 from psycopg.sql import Identifier, SQL
22 from ...typing import Protocol
23 from ...config import Configuration
24 from ...db.connection import Connection, drop_tables, index_exists
25 from .importer_statistics import SpecialPhrasesImporterStatistics
26 from .special_phrase import SpecialPhrase
27 from ...tokenizer.base import AbstractTokenizer
# Module-wide logger. getLogger() is called without a name, so this is
# the root logger.
LOG = logging.getLogger()
def _classtype_table(phrase_class: str, phrase_type: str) -> str:
    """ Build the name of the place_classtype table that belongs to
        the given class/type combination.
    """
    return 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
class SpecialPhraseLoader(Protocol):
    """ Structural interface to be satisfied by every class that
        serves as a source of special phrases.
    """

    def generate_phrases(self) -> Iterable[SpecialPhrase]:
        """ Produce every special phrase term this loader is able
            to provide.
        """
        ...
Class handling the import of special phrases into the database.
Takes a special phrase loader which loads the phrases from an external source.
def __init__(self, config: Configuration, conn: Connection,
             sp_loader: SpecialPhraseLoader) -> None:
    # Collaborators handed in by the caller.
    self.config = config
    self.db_connection = conn
    self.sp_loader = sp_loader
    self.statistics_handler = SpecialPhrasesImporterStatistics()

    # The settings are returned as (black list, white list).
    black_list, white_list = self._load_white_and_black_lists()
    self.black_list = black_list
    self.white_list = white_list

    # Pattern that classes and types are checked against before import.
    self.sanity_check_pattern = re.compile(r'^\w+$')

    # Phrases collected for the import, one tuple per phrase
    # in the format (label, class, type, operator).
    self.word_phrases: Set[Tuple[str, str, str, str]] = set()

    # Names of existing place_classtype tables that do not match any
    # special phrase class/type on the wiki.
    self.table_phrases_to_delete: Set[str] = set()
def get_classtype_pairs_style(self) -> Set[Tuple[str, str]]:
    """
        Return the class/type combinations of the import style file
        that carry a 'main' property.

        The style file is read as JSON. Each rule object may have a
        "keys" list (used as the classes) and a "values" mapping from
        type to its properties. Every key of a rule is combined with
        every non-empty type whose properties contain 'main'.

        Note: This requirement was from 2021 and it is unclear whether
        it is still relevant.
    """
    style_file = self.config.get_import_style_file()

    # Decode explicitly as UTF-8 instead of relying on the locale of
    # the machine the import runs on.
    with open(style_file, 'r', encoding='utf-8') as file:
        style_data = json.loads(f'[{file.read()}]')

    # The content is wrapped in brackets before parsing. A file that
    # already is a JSON array therefore ends up nested one level deeper;
    # unwrap it so that both layouts yield a flat sequence of rules.
    rules = []
    for entry in style_data:
        if isinstance(entry, list):
            rules.extend(entry)
        else:
            rules.append(entry)

    style_combinations = set()
    for rule in rules:  # layout as in ../settings/import-extratags.style
        classes = rule.get("keys", [])
        for _type, properties in rule.get("values", {}).items():
            # The type is the value of the main tag, so an empty
            # string cannot be a valid type.
            if "main" in properties and _type:
                style_combinations.update((_class, _type) for _class in classes)

    return style_combinations
def get_classtype_pairs(self) -> Set[Tuple[str, str]]:
    """
        Return the allowed special phrases according to the database:
        all combinations of class and type which occur more than
        100 times.
    """
    query = """
        SELECT class AS CLS, type AS typ
        FROM placex
        GROUP BY class, type
        HAVING COUNT(*) > 100
    """

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        # First column is the class, second column the type.
        db_combinations = {(row[0], row[1]) for row in db_cursor.fetchall()}

    return db_combinations
def import_phrases(self, tokenizer: AbstractTokenizer, should_replace: bool) -> None:
    """
        Run through all special phrases produced by the loader and
        import them into the database.

        With should_replace set to True only the loaded phrases are
        kept in the database. All other phrases that are already in
        the database are removed.
    """
    LOG.warning('Special phrases importation starting')
    self._fetch_existing_place_classtype_tables()

    # Collect the class/type pair of every phrase that was accepted.
    # Rejected phrases yield no pair and are left out.
    class_type_pairs = {
        pair
        for pair in map(self._process_phrase, self.sp_loader.generate_phrases())
        if pair
    }

    self._create_classtype_table_and_indexes(class_type_pairs)
    if should_replace:
        self._remove_non_existent_tables_from_db()

    self.db_connection.commit()

    with tokenizer.name_analyzer() as analyzer:
        analyzer.update_special_phrases(self.word_phrases, should_replace)

    LOG.warning('Import done.')
    self.statistics_handler.notify_import_done()
def _fetch_existing_place_classtype_tables(self) -> None:
    """
        Look up the place_classtype tables that currently exist and
        add their names to the table_phrases_to_delete set.
    """
    query = """
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema='public'
        AND table_name like 'place_classtype_%';
    """
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(SQL(query))
        # The table name is the only column of each row.
        self.table_phrases_to_delete.update(row[0] for row in db_cursor)
def _load_white_and_black_lists(self) \
        -> Tuple[Mapping[str, Sequence[str]], Mapping[str, Sequence[str]]]:
    """
        Read the black list and the white list from the
        sub-configuration 'phrase-settings.json'.

        Returns the pair (black list, white list).
    """
    settings = self.config.load_sub_configuration('phrase-settings.json')
    black_list = settings['blackList']
    white_list = settings['whiteList']

    return black_list, white_list
def _check_sanity(self, phrase: SpecialPhrase) -> bool:
    """
        Check class and type of the given phrase for garbage, in case
        somebody added invalid content in the wiki.

        Returns True when both the class and the type are completely
        covered by the sanity pattern. Otherwise a warning is logged
        and False is returned; nothing is raised.
    """
    # fullmatch() demands that the pattern covers the entire string.
    # With findall() the trailing '$' of the pattern also matches in
    # front of a final newline, which let values like 'pub\n' pass.
    class_ok = self.sanity_check_pattern.fullmatch(phrase.p_class) is not None
    type_ok = self.sanity_check_pattern.fullmatch(phrase.p_type) is not None

    if class_ok and type_ok:
        return True

    LOG.warning("Bad class/type: %s=%s. It will not be imported",
                phrase.p_class, phrase.p_type)
    return False
def _process_phrase(self, phrase: SpecialPhrase) -> Optional[Tuple[str, str]]:
    """
        Run the given phrase through the black list, the white list
        and the sanity check.

        Returns the class/type pair of the phrase when it is accepted.
        In that case the phrase is also added to the word_phrases set.
        Returns None for a phrase that is rejected.
    """
    p_class, p_type = phrase.p_class, phrase.p_type

    # blacklisting: disallow certain class/type combinations
    if p_type in self.black_list.get(p_class, ()):
        return None

    # whitelisting: if class is in whitelist, allow only tags in the list
    if p_class in self.white_list and p_type not in self.white_list[p_class]:
        return None

    # sanity check, in case somebody added garbage in the wiki
    if self._check_sanity(phrase):
        self.word_phrases.add((phrase.p_label, p_class, p_type, phrase.p_operator))
        return (p_class, p_type)

    # Only phrases that fail the sanity check are counted as invalid.
    self.statistics_handler.notify_one_phrase_invalid()
    return None
def _create_classtype_table_and_indexes(self,
                                        class_type_pairs: Iterable[Tuple[str, str]]) -> None:
    """
        Create the place_classtype table for each given class/type pair,
        together with its indexes on place_id and centroid.

        Pairs that are not returned by get_classtype_pairs() are skipped.
        A table that already exists is kept as it is and taken off the
        set of tables to delete.
    """
    LOG.warning('Create tables and indexes...')

    sql_tablespace = self.config.TABLESPACE_AUX_DATA
    if sql_tablespace:
        sql_tablespace = ' TABLESPACE ' + sql_tablespace

    # Index on placex that exists only for the duration of this method.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

    allowed_special_phrases = self.get_classtype_pairs()

    for phrase_class, phrase_type in class_type_pairs:
        if (phrase_class, phrase_type) not in allowed_special_phrases:
            LOG.warning("Skipping phrase %s=%s: not in allowed special phrases",
                        phrase_class, phrase_type)
            continue

        table_name = _classtype_table(phrase_class, phrase_type)

        if table_name not in self.table_phrases_to_delete:
            # Table creation
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            # Indexes creation
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            # Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

            self.statistics_handler.notify_one_table_created()
        else:
            self.statistics_handler.notify_one_table_ignored()
            # The table matches a class/type that still exists in the
            # special phrases of the wiki, so it must not be deleted
            # and neither table nor indexes need to be created.
            self.table_phrases_to_delete.remove(table_name)

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute("DROP INDEX idx_placex_classtype")
def _create_place_classtype_table(self, sql_tablespace: str,
                                  phrase_class: str, phrase_type: str) -> None:
    """
        Create the place_classtype table of the given
        phrase_class/phrase_type unless it exists already.

        The table holds place_id and centroid of the rows matching
        the given class and type.
    """
    statement = SQL("""CREATE TABLE IF NOT EXISTS {} {} AS
                         SELECT place_id AS place_id,
                                st_centroid(geometry) AS centroid
                         FROM placex
                         WHERE class = %s AND type = %s
                    """).format(Identifier(_classtype_table(phrase_class, phrase_type)),
                                SQL(sql_tablespace))

    # Class and type are handed over as query parameters.
    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(statement, (phrase_class, phrase_type))
def _create_place_classtype_indexes(self, sql_tablespace: str,
                                    phrase_class: str, phrase_type: str) -> None:
    """
        Create the indexes on centroid and on place_id for the
        place_classtype table of the given class/type.

        An index that exists already is left untouched.
    """
    index_prefix = f'idx_place_classtype_{phrase_class}_{phrase_type}_'
    base_table = _classtype_table(phrase_class, phrase_type)

    # Name suffix and statement of each index, in creation order.
    index_definitions = (
        ('centroid', "CREATE INDEX {} ON {} USING GIST (centroid) {}"),
        ('place_id', "CREATE INDEX {} ON {} USING btree(place_id) {}"),
    )

    for suffix, statement in index_definitions:
        index_name = index_prefix + suffix
        if index_exists(self.db_connection, index_name):
            continue

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL(statement)
                              .format(Identifier(index_name),
                                      Identifier(base_table),
                                      SQL(sql_tablespace)))
def _grant_access_to_webuser(self, phrase_class: str, phrase_type: str) -> None:
    """
        Allow the web user to read from the place_classtype table
        of the given class/type.
    """
    grant = SQL("""GRANT SELECT ON {} TO {}""").format(
        Identifier(_classtype_table(phrase_class, phrase_type)),
        Identifier(self.config.DATABASE_WEBUSER))

    with self.db_connection.cursor() as db_cursor:
        db_cursor.execute(grant)
def _remove_non_existent_tables_from_db(self) -> None:
    """
        Remove special phrases which do not exist on the wiki anymore
        by dropping their place_classtype tables.
    """
    LOG.warning('Cleaning database...')

    # Whatever is left in the set belongs to a class/type that is
    # no longer on the wiki.
    obsolete_tables = self.table_phrases_to_delete
    drop_tables(self.db_connection, *obsolete_tables)

    # Report one deletion per dropped table.
    for _table in obsolete_tables:
        self.statistics_handler.notify_one_table_deleted()