"""
    Functions to import special phrases into the database.
"""
import json
import logging
import os
import re
import subprocess
from os.path import isfile
from pathlib import Path

from icu import Transliterator
from psycopg2.sql import Identifier, Literal, SQL

from nominatim.errors import UsageError
from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger()


class SpecialPhrasesImporter():
    # pylint: disable-msg=too-many-instance-attributes
    # pylint: disable-msg=too-few-public-methods
    """
        Class handling the process of special phrases importations.

        Phrases are downloaded from the OSM wiki, filtered through the
        configured black and white lists and written to the database
        through the given connection.
    """
    def __init__(self, config, phplib_dir, db_connection) -> None:
        """
            Store the collaborators and prepare the lists and patterns
            needed for the import.

            config: configuration object; the attributes read by this class
                    are config_dir, PHRASE_CONFIG, LANGUAGES,
                    TERM_NORMALIZATION, TABLESPACE_AUX_DATA and
                    DATABASE_WEBUSER.
            phplib_dir: path object of the directory containing the PHP
                    migration scripts.
            db_connection: database connection used for all statements.
        """
        self.db_connection = db_connection
        # The configuration has to be stored before the lists are loaded,
        # because _load_white_and_black_lists() reads self.config.
        self.config = config
        self.phplib_dir = phplib_dir
        self.black_list, self.white_list = self._load_white_and_black_lists()
        #Compile the regex here to increase performances.
        self.occurence_pattern = re.compile(
            r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
        )
        self.sanity_check_pattern = re.compile(r'^\w+$')
        self.transliterator = Transliterator.createFromRules("special-phrases normalizer",
                                                             self.config.TERM_NORMALIZATION)

    def import_from_wiki(self, languages=None):
        """
            Iterate through all specified languages and
            extract corresponding special phrases from the wiki.

            languages: optional list of language codes. When it is None or
                       empty, the languages from _load_languages() are used.
            Raises TypeError when 'languages' is given but is not a list.
        """
        if languages is not None and not isinstance(languages, list):
            raise TypeError('The \'languages\' argument should be of type list.')

        #Get all languages to process.
        languages = languages if languages else self._load_languages()

        #Store pairs of class/type for further processing
        class_type_pairs = set()

        for lang in languages:
            LOG.warning('Import phrases for lang: %s', lang)
            wiki_page_xml_content = SpecialPhrasesImporter._get_wiki_content(lang)
            class_type_pairs.update(self._process_xml_content(wiki_page_xml_content, lang))

        self._create_place_classtype_table_and_indexes(class_type_pairs)
        self.db_connection.commit()
        LOG.warning('Import done.')

    def _load_white_and_black_lists(self):
        """
            Load white and black lists from phrase-settings.json.

            The file in the configuration directory is used unless
            PHRASE_CONFIG points to a custom settings file.
            Returns the tuple (black list, white list).
        """
        settings_path = (self.config.config_dir / 'phrase-settings.json').resolve()

        if self.config.PHRASE_CONFIG:
            settings_path = self._convert_php_settings_if_needed(self.config.PHRASE_CONFIG)

        with settings_path.open("r") as json_settings:
            settings = json.load(json_settings)
        return settings['blackList'], settings['whiteList']

    def _load_languages(self):
        """
            Get list of all languages from env config file
            or default if there is no languages configured.
            The system will extract special phrases only from all specified languages.
        """
        default_languages = [
            'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
            'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
            'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
            'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']

        if self.config.LANGUAGES:
            # Tolerate spaces and stray commas ("en, de,") in the setting,
            # otherwise they would end up in the requested wiki URL.
            stripped = (lang.strip() for lang in self.config.LANGUAGES.split(','))
            configured = [lang for lang in stripped if lang]
            if configured:
                return configured

        return default_languages

    @staticmethod
    def _get_wiki_content(lang):
        """
            Request and return the wiki page's content
            corresponding to special phrases for a given lang.
            Requested URL Example :
                https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
        """
        url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' + lang.upper() # pylint: disable=line-too-long
        return get_url(url)

    def _check_sanity(self, lang, phrase_class, phrase_type):
        """
            Check sanity of given inputs in case somebody added garbage in the wiki.
            If a bad class/type is detected, a warning is logged and False
            is returned so that the caller can skip the phrase.
            Returns True when both class and type are valid.
        """
        # The pattern is anchored, so a single match() is enough.
        class_is_sane = self.sanity_check_pattern.match(phrase_class) is not None
        type_is_sane = self.sanity_check_pattern.match(phrase_type) is not None

        if class_is_sane and type_is_sane:
            return True

        LOG.warning("Bad class/type for language %s: %s=%s. It will not be imported",
                    lang, phrase_class, phrase_type)
        return False

    def _process_xml_content(self, xml_content, lang):
        """
            Process given xml content by extracting matching patterns.
            Matching patterns are processed there and returned in a
            set of class/type pairs.

            Phrases failing the sanity check, hit by the black list or
            missing from an applicable white list are skipped.
        """
        #One match will be of format [label, class, type, operator, plural]
        matches = self.occurence_pattern.findall(xml_content)
        #Store pairs of class/type for further processing
        class_type_pairs = set()

        for match in matches:
            phrase_label, phrase_class, phrase_type, phrase_operator = (
                field.strip() for field in match[:4]
            )
            normalized_label = self.transliterator.transliterate(phrase_label)
            #hack around a bug where building=yes was imported with quotes into the wiki
            #(quotes may show up literally or as XML entity in the raw export)
            phrase_type = re.sub(r'"|&quot;', '', phrase_type)

            #sanity check, in case somebody added garbage in the wiki
            if not self._check_sanity(lang, phrase_class, phrase_type):
                continue

            #blacklisting: disallow certain class/type combinations
            if phrase_type in self.black_list.get(phrase_class, ()):
                continue

            #whitelisting: if class is in whitelist, allow only tags in the list
            if (
                    phrase_class in self.white_list and
                    phrase_type not in self.white_list[phrase_class]
            ):
                continue

            #add class/type to the pairs set
            class_type_pairs.add((phrase_class, phrase_type))

            self._process_amenity(
                phrase_label, normalized_label, phrase_class,
                phrase_type, phrase_operator
            )

        return class_type_pairs

    def _process_amenity(self, phrase_label, normalized_label,
                         phrase_class, phrase_type, phrase_operator):
        # pylint: disable-msg=too-many-arguments
        """
            Add phrase lookup and corresponding class and
            type to the word table based on the operator.

            The operators 'near' and 'in' are stored together with the
            phrase, any other operator value creates a plain amenity entry.
        """
        with self.db_connection.cursor() as db_cursor:
            if phrase_operator in ('near', 'in'):
                db_cursor.execute("""SELECT getorcreate_amenityoperator(
                                  make_standard_name(%s), %s, %s, %s, %s)""",
                                  (phrase_label, normalized_label, phrase_class,
                                   phrase_type, phrase_operator))
            else:
                db_cursor.execute("""SELECT getorcreate_amenity(
                                  make_standard_name(%s), %s, %s, %s)""",
                                  (phrase_label, normalized_label, phrase_class, phrase_type))

    def _create_place_classtype_table_and_indexes(self, class_type_pairs):
        """
            Create table place_classtype for each given pair.
            Also create indexes on place_id and centroid.

            An index on placex (class, type) is created before the tables
            are filled and dropped again once all pairs are processed.
        """
        LOG.warning('Create tables and indexes...')

        # Only emit a TABLESPACE clause when a tablespace is configured.
        tablespace = self.config.TABLESPACE_AUX_DATA
        sql_tablespace = ' TABLESPACE ' + tablespace if tablespace else ''

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("CREATE INDEX idx_placex_classtype ON placex (class, type)")

        for phrase_class, phrase_type in class_type_pairs:
            #Table creation
            self._create_place_classtype_table(sql_tablespace, phrase_class, phrase_type)

            #Indexes creation
            self._create_place_classtype_indexes(sql_tablespace, phrase_class, phrase_type)

            #Grant access on read to the web user.
            self._grant_access_to_webuser(phrase_class, phrase_type)

        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute("DROP INDEX idx_placex_classtype")

    def _create_place_classtype_table(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create table place_classtype of the given phrase_class/phrase_type
            if it doesn't exist.

            sql_tablespace: ready-made ' TABLESPACE <name>' clause or ''.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        # First format() inserts the tablespace clause and turns the doubled
        # braces into placeholders for the psycopg2 SQL composition.
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""
                    CREATE TABLE IF NOT EXISTS {{}} {}
                    AS SELECT place_id AS place_id,st_centroid(geometry) AS centroid FROM placex
                    WHERE class = {{}} AND type = {{}}""".format(sql_tablespace))
                              .format(Identifier(table_name), Literal(phrase_class),
                                      Literal(phrase_type)))

    def _create_place_classtype_indexes(self, sql_tablespace, phrase_class, phrase_type):
        """
            Create indexes on centroid and place_id for the place_classtype table.
            Indexes that already exist are left untouched.

            sql_tablespace: ready-made ' TABLESPACE <name>' clause or ''.
        """
        index_prefix = 'idx_place_classtype_{}_{}_'.format(phrase_class, phrase_type)
        base_table = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)

        #Index on centroid
        if not self.db_connection.index_exists(index_prefix + 'centroid'):
            with self.db_connection.cursor() as db_cursor:
                # The statement has no query parameters: the tablespace
                # clause is already part of the composed SQL.
                db_cursor.execute(SQL("""
                    CREATE INDEX {{}} ON {{}} USING GIST (centroid) {}""".format(sql_tablespace))
                                  .format(Identifier(index_prefix + 'centroid'),
                                          Identifier(base_table)))

        #Index on place_id
        if not self.db_connection.index_exists(index_prefix + 'place_id'):
            with self.db_connection.cursor() as db_cursor:
                db_cursor.execute(SQL(
                    """CREATE INDEX {{}} ON {{}} USING btree(place_id) {}""".format(sql_tablespace))
                                  .format(Identifier(index_prefix + 'place_id'),
                                          Identifier(base_table)))

    def _grant_access_to_webuser(self, phrase_class, phrase_type):
        """
            Grant access on read to the table place_classtype for the webuser.
        """
        table_name = 'place_classtype_{}_{}'.format(phrase_class, phrase_type)
        with self.db_connection.cursor() as db_cursor:
            db_cursor.execute(SQL("""GRANT SELECT ON {} TO {}""")
                              .format(Identifier(table_name),
                                      Identifier(self.config.DATABASE_WEBUSER)))

    def _convert_php_settings_if_needed(self, file_path):
        """
            Convert php settings file of special phrases to json file if it is still in php format.

            Returns the resolved path of the json settings file.
            Raises UsageError when file_path is not an existing file or has
            neither a '.php' nor a '.json' extension, and re-raises
            subprocess.CalledProcessError when the conversion script fails.
        """
        if not isfile(file_path):
            raise UsageError(str(file_path) + ' is not a valid file.')

        base_path, extension = os.path.splitext(file_path)
        json_file_path = Path(base_path + '.json').resolve()

        if extension not in ('.php', '.json'):
            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

        # Convert only once: an already existing json file is reused.
        if extension == '.php' and not isfile(json_file_path):
            try:
                subprocess.run(['/usr/bin/env', 'php', '-Cq',
                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
                                file_path], check=True)
            except subprocess.CalledProcessError:
                LOG.error('Error while converting %s to json.', file_path)
                raise
            LOG.warning('special_phrase configuration file has been converted to json.')

        return json_file_path