2 Functions to import special phrases into the database.
6 from pathlib import Path
10 from os.path import isfile
11 from icu import Transliterator
12 from psycopg2.sql import Identifier, Literal, SQL
13 from nominatim.tools.exec_utils import get_url
14 from nominatim.errors import UsageError
16 LOG = logging.getLogger()
17 class SpecialPhrasesImporter():
18 # pylint: disable-msg=too-many-instance-attributes
19 # pylint: disable-msg=too-few-public-methods
21 Class handling the process of special phrases importations.
23 def __init__(self, config, phplib_dir, db_connection) -> None:
24 self.db_connection = db_connection
26 self.phplib_dir = phplib_dir
27 self.black_list, self.white_list = self._load_white_and_black_lists()
28 #Compile the regex here to increase performances.
29 self.occurence_pattern = re.compile(
30 r'\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([^\|]+) \|\| ([\-YN])'
32 self.sanity_check_pattern = re.compile(r'^\w+$')
33 self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
34 self.config.TERM_NORMALIZATION)
35 #This set will contain all existing phrases from the word table which
36 #no longer exist on the wiki.
37 #It contain tuples with the following format: (normalized_word, class, type, operator)
38 self.words_phrases_to_delete = set()
39 #This set will contain the phrases which still exist from the wiki.
40 #It is used to prevent duplicates on the wiki by removing them from
41 #the word_phrases_to_delete only at the end.
42 self.words_phrases_still_exist = set()
43 #This set will contain all existing place_classtype tables which doesn't match any
44 #special phrases class/type on the wiki.
45 self.table_phrases_to_delete = set()
46 self.table_phrases = set()
48 def import_from_wiki(self, languages=None):
50 Iterate through all specified languages and
51 extract corresponding special phrases from the wiki.
53 if languages is not None and not isinstance(languages, list):
54 raise TypeError('The \'languages\' argument should be of type list.')
56 self.fetch_existing_words_phrases()
57 self.fetch_existing_place_classtype_tables()
59 #Get all languages to process.
60 languages = self._load_languages() if not languages else languages
62 #Store pairs of class/type for further processing
63 class_type_pairs = set()
65 for lang in languages:
66 LOG.warning('Import phrases for lang: %s', lang)
67 wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
68 class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))
70 self._create_place_classtype_table_and_indexes(class_type_pairs)
71 self.remove_non_existent_phrases_from_db()
72 self.db_connection.commit()
73 LOG.warning('Import done.')
75 def fetch_existing_words_phrases(self):
77 Fetch existing special phrases from the word table.
78 Fill the word_phrases_to_delete set of the class.
80 #Only extract special phrases terms:
81 #If class=place and type=house then it is a housenumber term.
82 #If class=place and type=postcode then it is a postcode term.
84 SELECT word, class, type, operator FROM word
85 WHERE class != 'place' OR (type != 'house' AND type != 'postcode')
87 with self.db_connection.cursor() as db_cursor:
88 db_cursor.execute(SQL(word_query))
90 row[3] = '-' if row[3] is None else row[3]
91 self.words_phrases_to_delete.add(
92 (row[0], row[1], row[2], row[3])
95 def fetch_existing_place_classtype_tables(self):
97 Fetch existing place_classtype tables.
98 Fill the table_phrases_to_delete set of the class.
102 FROM information_schema.tables
103 WHERE table_schema='public'
104 AND table_name like 'place_classtype_%';
106 with self.db_connection.cursor() as db_cursor:
107 db_cursor.execute(SQL(query))
108 for row in db_cursor:
109 self.table_phrases_to_delete.add(row[0])
111 def _load_white_and_black_lists(self):
113 Load white and black lists from phrases-settings.json.
115 settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()
117 if self.config.PHRASE_CONFIG:
118 settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)
120 with open(settings_path, "r") as json_settings:
121 settings = json.load(json_settings)
122 return settings['blackList'], settings['whiteList']
124 def _load_languages(self):
126 Get list of all languages from env config file
127 or default if there is no languages configured.
128 The system will extract special phrases only from all specified languages.
130 default_languages = [
131 'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
132 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
133 'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
134 'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
135 return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
138 def _get_wiki_content(lang):
140 Request and return the wiki page's content
141 corresponding to special phrases for a given lang.
142 Requested URL Example :
143 https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
145 url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
148 def _check_sanity(self, lang, phrase_class, phrase_type):
150 Check sanity of given inputs in case somebody added garbage in the wiki.
151 If a bad class/type is detected the system will exit with an error.
153 type_matchs = self.sanity_check_pattern.findall(phrase_type)
154 class_matchs = self.sanity_check_pattern.findall(phrase_class)
156 if len(class_matchs) < 1 or len(type_matchs) < 1:
157 raise UsageError("Bad class/type for language {}: {}={}".format(
158 lang, phrase_class, phrase_type))
160 def _process_xml_content(self, xml_content, lang):
162 Process given xml content by extracting matching patterns.
163 Matching patterns are processed there and returned in a
164 set of class/type pairs.
166 #One match will be of format [label, class, type, operator, plural]
167 matches = self.occurence_pattern.findall(xml_content)
168 #Store pairs of class/type for further processing
169 class_type_pairs = set()
171 for match in matches:
172 phrase_label = match[0].strip()
173 normalized_label = self.transliterator.transliterate(phrase_label)
174 phrase_class = match[1].strip()
175 phrase_type = match[2].strip()
176 phrase_operator = match[3].strip()
177 #Needed if some operator in the wiki are not written in english
178 phrase_operator = '-' if phrase_operator not in ('near', 'in') else phrase_operator
179 #hack around a bug where building=yes was imported with quotes into the wiki
180 phrase_type = re.sub(r'\"|"', '', phrase_type)
182 #blacklisting: disallow certain class/type combinations
184 phrase_class in self.black_list.keys() and
185 phrase_type in self.black_list[phrase_class]
188 #whitelisting: if class is in whitelist, allow only tags in the list
190 phrase_class in self.white_list.keys() and
191 phrase_type not in self.white_list[phrase_class]
195 #Check if the phrase already exists in the database.
197 (normalized_label, phrase_class, phrase_type, phrase_operator)
198 in self.words_phrases_to_delete
200 #Remove this phrase from the ones to delete as it still exist on the wiki.
201 self.words_phrases_still_exist.add(
202 (normalized_label, phrase_class, phrase_type, phrase_operator)
204 class_type_pairs.add((phrase_class, phrase_type))
205 #Dont need to add this phrase as it already exists in the word table.
208 #sanity check, in case somebody added garbage in the wiki
209 self._check_sanity(lang, phrase_class, phrase_type)
211 class_type_pairs.add((phrase_class, phrase_type))
213 self._process_amenity(
214 phrase_label, normalized_label, phrase_class,
215 phrase_type, phrase_operator
218 return class_type_pairs
220 def _process_amenity(self, phrase_label, normalized_label,
221 phrase_class, phrase_type, phrase_operator):
222 # pylint: disable-msg=too-many-arguments
224 Add phrase lookup and corresponding class and
225 type to the word table based on the operator.
227 with self.db_connection.cursor() as db_cursor:
228 if phrase_operator == 'near':
229 db_cursor.execute("""SELECT getorcreate_amenityoperator(
230 make_standard_name(%s), %s, %s, %s, 'near')""",
231 (phrase_label, normalized_label, phrase_class, phrase_type))
232 elif phrase_operator == 'in':
233 db_cursor.execute("""SELECT getorcreate_amenityoperator(
234 make_standard_name(%s), %s, %s, %s, 'in')""",
235 (phrase_label, normalized_label, phrase_class, phrase_type))
237 db_cursor.execute("""SELECT getorcreate_amenity(
238 make_standard_name(%s), %s, %s, %s)""",
239 (phrase_label, normalized_label, phrase_class, phrase_type))
242 def _create_place_classtype_table_and_indexes(self, class_type_pairs):
244 Create table place_classtype for each given pair.
245 Also create indexes on place_id and centroid.
247 LOG.warning('Create tables and indexes...')
249 sql_tablespace = self.config.TABLESPACE_AUX_DATA
251 sql_tablespace = ' TABLESPACE '+sql_tablespace
253 with self.db_connection.cursor() as db_cursor:
254 db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")
256 for pair in class_type_pairs:
257 phrase_class = pair[0]
258 phrase_type = pair[1]
260 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
262 if table_name in self.table_phrases_to_delete:
263 #Remove this table from the ones to delete as it match a class/type
264 #still existing on the special phrases of the wiki.
265 self.table_phrases_to_delete.remove(table_name)
266 #So dont need to create the table and indexes.
270 self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)
273 self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)
275 #Grant access on read to the web user.
276 self._grant_access_to_webuser(phrase_class, phrase_type)
278 with self.db_connection.cursor() as db_cursor:
279 db_cursor.execute("DROP INDEX idx_placex_classtype")
282 def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
284 Create table place_classtype of the given phrase_class/phrase_type if doesn't exit.
286 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
287 with self.db_connection.cursor() as db_cursor:
288 db_cursor.execute(SQL("""
289 CREATE TABLE IF NOT EXISTS {{}} {}
290 AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
291 WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
292 .format(Identifier(table_name), Literal(phrase_class),
293 Literal(phrase_type)))
296 def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
298 Create indexes on centroid and place_id for the place_classtype table.
300 index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
301 base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
302 #Index on centroidself.table_phrases_to_delete.add(row)
303 if not self.db_connection.index_exists(index_prefix + 'centroid'):
304 with self.db_connection.cursor() as db_cursor:
305 db_cursor.execute(SQL("""
306 CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
307 .format(Identifier(index_prefix + 'centroid'),
308 Identifier(base_table)), sql_tablespace)
311 if not self.db_connection.index_exists(index_prefix + 'place_id'):
312 with self.db_connection.cursor() as db_cursor:
313 db_cursor.execute(SQL(
314 """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
315 .format(Identifier(index_prefix + 'place_id'),
316 Identifier(base_table)))
319 def _grant_access_to_webuser(self, phrase_class, phrase_type):
321 Grant access on read to the table place_classtype for the webuser.
323 table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
324 with self.db_connection.cursor() as db_cursor:
325 db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
326 .format(Identifier(table_name),
327 Identifier(self.config.DATABASE_WEBUSER)))
329 def remove_non_existent_phrases_from_db(self):
331 Remove special phrases which doesn't exist on the wiki anymore.
332 Delete from the word table and delete the place_classtype tables.
334 LOG.warning('Cleaning database...')
335 self.words_phrases_to_delete = self.words_phrases_to_delete - self.words_phrases_still_exist
336 #Array containing all queries to execute. Contain tuples of format (query, parameters)
337 queries_parameters = []
339 #Delete phrases from the word table which are not on the wiki anymore.
340 for phrase_to_delete in self.words_phrases_to_delete:
341 if phrase_to_delete[3] == '-':
343 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator IS null
345 parameters = (phrase_to_delete[0], phrase_to_delete[1], phrase_to_delete[2], )
346 queries_parameters.append((query, parameters))
349 DELETE FROM word WHERE word = %s AND class = %s AND type = %s AND operator = %s
351 parameters = (phrase_to_delete[0], phrase_to_delete[1],
352 phrase_to_delete[2], phrase_to_delete[3], )
353 queries_parameters.append((query, parameters))
355 #Delete place_classtype tables corresponding to class/type which are not on the wiki anymore
356 for table in self.table_phrases_to_delete:
357 query = SQL('DROP TABLE IF EXISTS {}').format(Identifier(table))
358 queries_parameters.append((query, ()))
360 with self.db_connection.cursor() as db_cursor:
361 for query, parameters in queries_parameters:
362 db_cursor.execute(query, parameters)
364 def _convert_php_settings_if_needed(self, file_path):
366 Convert php settings file of special phrases to json file if it is still in php format.
368 if not isfile(file_path):
369 raise UsageError(str(file_path) + ' is not a valid file.')
371 file, extension = os.path.splitext(file_path)
372 json_file_path = Path(file + '.json').resolve()
374 if extension not in('.php', '.json'):
375 raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')
377 if extension == '.php' and not isfile(json_file_path):
379 subprocess.run(['/usr/bin/env', 'php', '-Cq',
380 (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
381 file_path], check=True)
382 LOG.warning('special_phrase configuration file has been converted to json.')
383 return json_file_path
384 except subprocess.CalledProcessError:
385 LOG.error('Error while converting %s to json.', file_path)
388 return json_file_path