2 Functions to import special phrases into the database.
6 from pathlib import Path
10 from os.path import isfile
11 from icu import Transliterator
12 from psycopg2.sql import Identifier, Literal, SQL
13 from nominatim.tools.exec_utils import get_url
14 from nominatim.errors import UsageError
16 LOG = logging.getLogger()
17 class SpecialPhrasesImporter():
18 # pylint: disable-msg=too-many-instance-attributes
20 Class handling the process of special phrases importations.
22 def __init__(self, config, phplib_dir, db_connection) -> None:
23 self.db_connection = db_connection
25 self.phplib_dir = phplib_dir
26 self.black_list, self.white_list = self._load_white_and_black_lists()
27 #Compile the regex here to increase performances.
28 self.occurence_pattern = re.compile(
29 r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
31 self.sanity_check_pattern = re.compile(r'^\w+$')
32 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
33 self.config.TERM_NORMALIZATION)
34 #This set will contain all existing phrases from the word table which
35 #no longer exist on the wiki.
36 #It contain tuples with the following format: (normalized_word, class, type, operator)
37 self.words_phrases_to_delete = set()
38 #This set will contain the phrases which still exist from the wiki.
39 #It is used to prevent duplicates on the wiki by removing them from
40 #the word_phrases_to_delete only at the end.
41 self.words_phrases_still_exist = set()
42 #This set will contain all existing place_classtype tables which doesn't match any
43 #special phrases class/type on the wiki.
44 self.table_phrases_to_delete = set()
46 def import_from_wiki(self, languages=None):
48 Iterate through all specified languages and
49 extract corresponding special phrases from the wiki.
51 if languages is not None and not isinstance(languages, list):
52 raise TypeError('The \'languages\' argument should be of type list.')
54 self._fetch_existing_words_phrases()
55 self._fetch_existing_place_classtype_tables()
57 #Get all languages to process.
58 languages = self._load_languages() if not languages else languages
60 #Store pairs of class/type for further processing
61 class_type_pairs = set()
63 for lang in languages:
64 LOG.warning('Import phrases for lang: %s', lang)
65 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
66 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
68 self._create_place_classtype_table_and_indexes(class_type_pairs)
69 self._remove_non_existent_phrases_from_db()
70 self.db_connection.commit()
71 LOG.warning('Import done.')
73 def _fetch_existing_words_phrases(self):
75 Fetch existing special phrases from the word table.
76 Fill the word_phrases_to_delete set of the class.
78 #Only extract special phrases terms:
79 #If class=place and type=house then it is a housenumber term.
80 #If class=place and type=postcode then it is a postcode term.
82 SELECT word, class, type, operator FROM word
83 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
85 with self.db_connection.cursor() as db_cursor:
86 db_cursor.execute(SQL(word_query))
88 row[3] = '-' if row[3] is None else row[3]
89 self.words_phrases_to_delete.add(
90 (row[0], row[1], row[2], row[3])
93 def _fetch_existing_place_classtype_tables(self):
95 Fetch existing place_classtype tables.
96 Fill the table_phrases_to_delete set of the class.
100 FROM information_schema.tables
101 WHERE table_schema='public'
102 AND table_name like 'place_classtype_%';
104 with self.db_connection.cursor() as db_cursor:
105 db_cursor.execute(SQL(query))
106 for row in db_cursor:
107 self.table_phrases_to_delete.add(row[0])
109 def _load_white_and_black_lists(self):
111 Load white and black lists from phrases-settings.json.
113 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
115 if self.config.PHRASE_CONFIG:
116 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
118 with settings_path.open("r") as json_settings:
119 settings = json.load(json_settings)
120 return settings['blackList'], settings['whiteList']
122 def _load_languages(self):
124 Get list of all languages from env config file
125 or default if there is no languages configured.
126 The system will extract special phrases only from all specified languages.
128 default_languages = [
129 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
130 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
131 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
132 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
133 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
136 def _get_wiki_content(lang):
138 Request and return the wiki page's content
139 corresponding to special phrases for a given lang.
140 Requested URL Example :
141 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
143 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
146 def _check_sanity(self, lang, phrase_class, phrase_type):
148 Check sanity of given inputs in case somebody added garbage in the wiki.
149 If a bad class/type is detected the system will exit with an error.
151 type_matchs = self.sanity_check_pattern.findall(phrase_type)
152 class_matchs = self.sanity_check_pattern.findall(phrase_class)
154 if len(class_matchs) < 1 or len(type_matchs) < 1:
155 LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
156 lang, phrase_class, phrase_type)
160 def _process_xml_content(self, xml_content, lang):
162 Process given xml content by extracting matching patterns.
163 Matching patterns are processed there and returned in a
164 set of class/type pairs.
166 #One match will be of format [label, class, type, operator, plural]
167 matches = self.occurence_pattern.findall(xml_content)
168 #Store pairs of class/type for further processing
169 class_type_pairs = set()
171 for match in matches:
172 phrase_label = match[0].strip()
173 normalized_label = self.transliterator.transliterate(phrase_label)
174 phrase_class = match[1].strip()
175 phrase_type = match[2].strip()
176 phrase_operator = match[3].strip()
177 #Needed if some operator in the wiki are not written in english
178 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
179 #hack around a bug where building=yes was imported with quotes into the wiki
180 phrase_type = re.sub(r'\"|"', '', phrase_type)
182 #blacklisting: disallow certain class/type combinations
184 phrase_class in self.black_list.keys() and
185 phrase_type in self.black_list[phrase_class]
188 #whitelisting: if class is in whitelist, allow only tags in the list
190 phrase_class in self.white_list.keys() and
191 phrase_type not in self.white_list[phrase_class]
195 #Check if the phrase already exists in the database.
197 (normalized_label, phrase_class, phrase_type, phrase_operator)
198 in self.words_phrases_to_delete
200 #Remove this phrase from the ones to delete as it still exist on the wiki.
201 self.words_phrases_still_exist.add(
202 (normalized_label, phrase_class, phrase_type, phrase_operator)
204 class_type_pairs.add((phrase_class, phrase_type))
205 #Dont need to add this phrase as it already exists in the word table.
208 #sanity check, in case somebody added garbage in the wiki
209 if not self._check_sanity(lang, phrase_class, phrase_type):
212 class_type_pairs.add((phrase_class, phrase_type))
214 self._process_amenity(
215 phrase_label, normalized_label, phrase_class,
216 phrase_type, phrase_operator
219 return class_type_pairs
221 def _process_amenity(self, phrase_label, normalized_label,
222 phrase_class, phrase_type, phrase_operator):
223 # pylint: disable-msg=too-many-arguments
225 Add phrase lookup and corresponding class and
226 type to the word table based on the operator.
228 with self.db_connection.cursor() as db_cursor:
229 if phrase_operator == 'near':
230 db_cursor.execute("""SELECT getorcreate_amenityoperator(
231 make_standard_name(%s), %s, %s, %s, 'near')""",
232 (phrase_label, normalized_label, phrase_class, phrase_type))
233 elif phrase_operator == 'in':
234 db_cursor.execute("""SELECT getorcreate_amenityoperator(
235 make_standard_name(%s), %s, %s, %s, 'in')""",
236 (phrase_label, normalized_label, phrase_class, phrase_type))
238 db_cursor.execute("""SELECT getorcreate_amenity(
239 make_standard_name(%s), %s, %s, %s)""",
240 (phrase_label, normalized_label, phrase_class, phrase_type))
243 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
245 Create table place_classtype for each given pair.
246 Also create indexes on place_id and centroid.
248 LOG.warning('Create tables and indexes...')
250 sql_tablespace = self.config.TABLESPACE_AUX_DATA
252 sql_tablespace = ' TABLESPACE '+sql_tablespace
254 with self.db_connection.cursor() as db_cursor:
255 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
257 for pair in class_type_pairs:
258 phrase_class = pair[0]
259 phrase_type = pair[1]
261 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
263 if table_name in self.table_phrases_to_delete:
264 #Remove this table from the ones to delete as it match a class/type
265 #still existing on the special phrases of the wiki.
266 self.table_phrases_to_delete.remove(table_name)
267 #So dont need to create the table and indexes.
271 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
274 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
276 #Grant access on read to the web user.
277 self._grant_access_to_webuser(phrase_class, phrase_type)
279 with self.db_connection.cursor() as db_cursor:
280 db_cursor.execute("DROP INDEX idx_placex_classtype")
283 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
285 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
287 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
288 with self.db_connection.cursor() as db_cursor:
289 db_cursor.execute(SQL("""
290 CREATE TABLE IF NOT EXISTS {{}} {}
291 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
292 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
293 .format(Identifier(table_name), Literal(phrase_class),
294 Literal(phrase_type)))
297 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
299 Create indexes on centroid and place_id for the place_classtype table.
301 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
302 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
304 if not self.db_connection.index_exists(index_prefix + 'centroid'):
305 with self.db_connection.cursor() as db_cursor:
306 db_cursor.execute(SQL("""
307 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
308 .format(Identifier(index_prefix + 'centroid'),
309 Identifier(base_table)), sql_tablespace)
312 if not self.db_connection.index_exists(index_prefix + 'place_id'):
313 with self.db_connection.cursor() as db_cursor:
314 db_cursor.execute(SQL(
315 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
316 .format(Identifier(index_prefix + 'place_id'),
317 Identifier(base_table)))
320 def _grant_access_to_webuser(self, phrase_class, phrase_type):
322 Grant access on read to the table place_classtype for the webuser.
324 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
325 with self.db_connection.cursor() as db_cursor:
326 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
327 .format(Identifier(table_name),
328 Identifier(self.config.DATABASE_WEBUSER)))
330 def _remove_non_existent_phrases_from_db(self):
332 Remove special phrases which doesn't exist on the wiki anymore.
333 Delete from the word table and delete the place_classtype tables.
335 LOG.warning('Cleaning database...')
336 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
337 #Array containing all queries to execute. Contain tuples of format (query, parameters)
338 queries_parameters = []
340 #Delete phrases from the word table which are not on the wiki anymore.
341 for phrase_to_delete in self.words_phrases_to_delete:
342 if phrase_to_delete[3] == '-':
344 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
346 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
347 queries_parameters.append((query, parameters))
350 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
352 parameters = (phrase_to_delete[0], phrase_to_delete[1],
353 phrase_to_delete[2], phrase_to_delete[3], )
354 queries_parameters.append((query, parameters))
356 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
357 for table in self.table_phrases_to_delete:
358 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
359 queries_parameters.append((query, ()))
361 with self.db_connection.cursor() as db_cursor:
362 for query, parameters in queries_parameters:
363 db_cursor.execute(query, parameters)
365 def _convert_php_settings_if_needed(self, file_path):
367 Convert php settings file of special phrases to json file if it is still in php format.
369 if not isfile(file_path):
370 raise UsageError(str(file_path) + ' is not a valid file.')
372 file, extension = os.path.splitext(file_path)
373 json_file_path = Path(file + '.json').resolve()
375 if extension not in('.php', '.json'):
376 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
378 if extension == '.php' and not isfile(json_file_path):
380 subprocess.run(['/usr/bin/env', 'php', '-Cq',
381 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
382 file_path], check=True)
383 LOG.warning('special_phrase configuration file has been converted to json.')
384 return json_file_path
385 except subprocess.CalledProcessError:
386 LOG.error('Error while converting %s to json.', file_path)
389 return json_file_path