1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
14 from math import isfinite
16 from psycopg2 import sql as pysql
18 from nominatim.db.connection import connect
20 LOG = logging.getLogger()
22 def _to_float(num, max_value):
23 """ Convert the number in string into a float. The number is expected
24 to be in the range of [-max_value, max_value]. Otherwise rises a
28 if not isfinite(num) or num <= -max_value or num >= max_value:
33 class _CountryPostcodesCollector:
34 """ Collector for postcodes of a single country.
37 def __init__(self, country):
38 self.country = country
42 def add(self, postcode, x, y):
43 """ Add the given postcode to the collection cache. If the postcode
44 already existed, it is overwritten with the new centroid.
46 self.collected[postcode] = (x, y)
49 def commit(self, conn, analyzer, project_dir):
50 """ Update postcodes for the country from the postcodes selected so far
51 as well as any externally supplied postcodes.
53 self._update_from_external(analyzer, project_dir)
54 to_add, to_delete, to_update = self._compute_changes(conn)
56 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
57 self.country, len(to_add), len(to_delete), len(to_update))
59 with conn.cursor() as cur:
62 """INSERT INTO location_postcode
63 (place_id, indexed_status, country_code,
64 postcode, geometry) VALUES %s""",
66 template=pysql.SQL("""(nextval('seq_place'), 1, {},
67 %s, 'SRID=4326;POINT(%s %s)')
68 """).format(pysql.Literal(self.country)))
70 cur.execute("""DELETE FROM location_postcode
71 WHERE country_code = %s and postcode = any(%s)
72 """, (self.country, to_delete))
75 pysql.SQL("""UPDATE location_postcode
76 SET indexed_status = 2,
77 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
78 FROM (VALUES %s) AS v (pc, x, y)
79 WHERE country_code = {} and postcode = pc
80 """).format(pysql.Literal(self.country)), to_update)
83 def _compute_changes(self, conn):
84 """ Compute which postcodes from the collected postcodes have to be
85 added or modified and which from the location_postcode table
90 with conn.cursor() as cur:
91 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
92 FROM location_postcode
93 WHERE country_code = %s""",
95 for postcode, x, y in cur:
96 newx, newy = self.collected.pop(postcode, (None, None))
98 dist = (x - newx)**2 + (y - newy)**2
100 to_update.append((postcode, newx, newy))
102 to_delete.append(postcode)
104 to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
107 return to_add, to_delete, to_update
110 def _update_from_external(self, analyzer, project_dir):
111 """ Look for an external postcode file for the active country in
112 the project directory and add missing postcodes when found.
114 csvfile = self._open_external(project_dir)
119 reader = csv.DictReader(csvfile)
121 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
122 LOG.warning("Bad format for external postcode file for country '%s'."
123 " Ignored.", self.country)
125 postcode = analyzer.normalize_postcode(row['postcode'])
126 if postcode not in self.collected:
128 self.collected[postcode] = (_to_float(row['lon'], 180),
129 _to_float(row['lat'], 90))
131 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
132 row['lat'], row['lon'], self.country)
138 def _open_external(self, project_dir):
139 fname = project_dir / f'{self.country}_postcodes.csv'
142 LOG.info("Using external postcode file '%s'.", fname)
143 return open(fname, 'r', encoding='utf-8')
145 fname = project_dir / f'{self.country}_postcodes.csv.gz'
148 LOG.info("Using external postcode file '%s'.", fname)
149 return gzip.open(fname, 'rt')
154 def update_postcodes(dsn, project_dir, tokenizer):
155 """ Update the table of artificial postcodes.
157 Computes artificial postcode centroids from the placex table,
158 potentially enhances it with external data and then updates the
159 postcodes in the table 'location_postcode'.
161 with tokenizer.name_analyzer() as analyzer:
162 with connect(dsn) as conn:
163 # First get the list of countries that currently have postcodes.
164 # (Doing this before starting to insert, so it is fast on import.)
165 with conn.cursor() as cur:
166 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
167 todo_countries = set((row[0] for row in cur))
169 # Recompute the list of valid postcodes from placex.
170 with conn.cursor(name="placex_postcodes") as cur:
172 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
174 COALESCE(plx.country_code,
175 get_country_code(ST_Centroid(pl.geometry))) as cc,
176 token_normalized_postcode(pl.address->'postcode') as pc,
177 ST_Centroid(ST_Collect(COALESCE(plx.centroid,
178 ST_Centroid(pl.geometry)))) as centroid
179 FROM place AS pl LEFT OUTER JOIN placex AS plx
180 ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
181 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
183 WHERE pc IS NOT null AND cc IS NOT null
184 ORDER BY country_code, pc""")
188 for country, postcode, x, y in cur:
189 if collector is None or country != collector.country:
190 if collector is not None:
191 collector.commit(conn, analyzer, project_dir)
192 collector = _CountryPostcodesCollector(country)
193 todo_countries.discard(country)
194 collector.add(postcode, x, y)
196 if collector is not None:
197 collector.commit(conn, analyzer, project_dir)
199 # Now handle any countries that are only in the postcode table.
200 for country in todo_countries:
201 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
205 analyzer.update_postcodes_from_db()
207 def can_compute(dsn):
209 Check that the place table exists so that
210 postcodes can be computed.
212 with connect(dsn) as conn:
213 return conn.table_exists('place')