1 # SPDX-License-Identifier: GPL-2.0-only
3 # This file is part of Nominatim. (https://nominatim.org)
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
11 from collections import defaultdict
15 from math import isfinite
17 from psycopg2 import sql as pysql
19 from nominatim.db.connection import connect
20 from nominatim.utils.centroid import PointsCentroid
22 LOG = logging.getLogger()
24 def _to_float(num, max_value):
25 """ Convert the number in string into a float. The number is expected
26 to be in the range of [-max_value, max_value]. Otherwise rises a
30 if not isfinite(num) or num <= -max_value or num >= max_value:
35 class _CountryPostcodesCollector:
36 """ Collector for postcodes of a single country.
39 def __init__(self, country):
40 self.country = country
41 self.collected = defaultdict(PointsCentroid)
44 def add(self, postcode, x, y):
45 """ Add the given postcode to the collection cache. If the postcode
46 already existed, it is overwritten with the new centroid.
48 self.collected[postcode] += (x, y)
51 def commit(self, conn, analyzer, project_dir):
52 """ Update postcodes for the country from the postcodes selected so far
53 as well as any externally supplied postcodes.
55 self._update_from_external(analyzer, project_dir)
56 to_add, to_delete, to_update = self._compute_changes(conn)
58 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
59 self.country, len(to_add), len(to_delete), len(to_update))
61 with conn.cursor() as cur:
64 """INSERT INTO location_postcode
65 (place_id, indexed_status, country_code,
66 postcode, geometry) VALUES %s""",
68 template=pysql.SQL("""(nextval('seq_place'), 1, {},
69 %s, 'SRID=4326;POINT(%s %s)')
70 """).format(pysql.Literal(self.country)))
72 cur.execute("""DELETE FROM location_postcode
73 WHERE country_code = %s and postcode = any(%s)
74 """, (self.country, to_delete))
77 pysql.SQL("""UPDATE location_postcode
78 SET indexed_status = 2,
79 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
80 FROM (VALUES %s) AS v (pc, x, y)
81 WHERE country_code = {} and postcode = pc
82 """).format(pysql.Literal(self.country)), to_update)
85 def _compute_changes(self, conn):
86 """ Compute which postcodes from the collected postcodes have to be
87 added or modified and which from the location_postcode table
92 with conn.cursor() as cur:
93 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
94 FROM location_postcode
95 WHERE country_code = %s""",
97 for postcode, x, y in cur:
98 pcobj = self.collected.pop(postcode, None)
100 newx, newy = pcobj.centroid()
101 if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
102 to_update.append((postcode, newx, newy))
104 to_delete.append(postcode)
106 to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
107 self.collected = None
109 return to_add, to_delete, to_update
112 def _update_from_external(self, analyzer, project_dir):
113 """ Look for an external postcode file for the active country in
114 the project directory and add missing postcodes when found.
116 csvfile = self._open_external(project_dir)
121 reader = csv.DictReader(csvfile)
123 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
124 LOG.warning("Bad format for external postcode file for country '%s'."
125 " Ignored.", self.country)
127 postcode = analyzer.normalize_postcode(row['postcode'])
128 if postcode not in self.collected:
130 # Do float conversation separately, it might throw
131 centroid = (_to_float(row['lon'], 180),
132 _to_float(row['lat'], 90))
133 self.collected[postcode] += centroid
135 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
136 row['lat'], row['lon'], self.country)
142 def _open_external(self, project_dir):
143 fname = project_dir / f'{self.country}_postcodes.csv'
146 LOG.info("Using external postcode file '%s'.", fname)
147 return open(fname, 'r', encoding='utf-8')
149 fname = project_dir / f'{self.country}_postcodes.csv.gz'
152 LOG.info("Using external postcode file '%s'.", fname)
153 return gzip.open(fname, 'rt')
158 def update_postcodes(dsn, project_dir, tokenizer):
159 """ Update the table of artificial postcodes.
161 Computes artificial postcode centroids from the placex table,
162 potentially enhances it with external data and then updates the
163 postcodes in the table 'location_postcode'.
165 with tokenizer.name_analyzer() as analyzer:
166 with connect(dsn) as conn:
167 # First get the list of countries that currently have postcodes.
168 # (Doing this before starting to insert, so it is fast on import.)
169 with conn.cursor() as cur:
170 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
171 todo_countries = set((row[0] for row in cur))
173 # Recompute the list of valid postcodes from placex.
174 with conn.cursor(name="placex_postcodes") as cur:
176 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
178 COALESCE(plx.country_code,
179 get_country_code(ST_Centroid(pl.geometry))) as cc,
180 token_normalized_postcode(pl.address->'postcode') as pc,
181 COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
182 FROM place AS pl LEFT OUTER JOIN placex AS plx
183 ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
184 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
185 WHERE pc IS NOT null AND cc IS NOT null
186 ORDER BY country_code, pc""")
190 for country, postcode, x, y in cur:
191 if collector is None or country != collector.country:
192 if collector is not None:
193 collector.commit(conn, analyzer, project_dir)
194 collector = _CountryPostcodesCollector(country)
195 todo_countries.discard(country)
196 collector.add(postcode, x, y)
198 if collector is not None:
199 collector.commit(conn, analyzer, project_dir)
201 # Now handle any countries that are only in the postcode table.
202 for country in todo_countries:
203 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
207 analyzer.update_postcodes_from_db()
209 def can_compute(dsn):
211 Check that the place table exists so that
212 postcodes can be computed.
214 with connect(dsn) as conn:
215 return conn.table_exists('place')