]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
move postcode centroid computation to Python
[nominatim.git] / nominatim / tools / postcodes.py
1 # SPDX-License-Identifier: GPL-2.0-only
2 #
3 # This file is part of Nominatim. (https://nominatim.org)
4 #
5 # Copyright (C) 2022 by the Nominatim developer community.
6 # For a full list of authors see the git log.
7 """
8 Functions for importing, updating and otherwise maintaining the table
9 of artificial postcode centroids.
10 """
11 from collections import defaultdict
12 import csv
13 import gzip
14 import logging
15 from math import isfinite
16
17 from psycopg2 import sql as pysql
18
19 from nominatim.db.connection import connect
20 from nominatim.utils.centroid import PointsCentroid
21
22 LOG = logging.getLogger()
23
24 def _to_float(num, max_value):
25     """ Convert the number in string into a float. The number is expected
26         to be in the range of [-max_value, max_value]. Otherwise rises a
27         ValueError.
28     """
29     num = float(num)
30     if not isfinite(num) or num <= -max_value or num >= max_value:
31         raise ValueError()
32
33     return num
34
35 class _CountryPostcodesCollector:
36     """ Collector for postcodes of a single country.
37     """
38
39     def __init__(self, country):
40         self.country = country
41         self.collected = defaultdict(PointsCentroid)
42
43
44     def add(self, postcode, x, y):
45         """ Add the given postcode to the collection cache. If the postcode
46             already existed, it is overwritten with the new centroid.
47         """
48         self.collected[postcode] += (x, y)
49
50
51     def commit(self, conn, analyzer, project_dir):
52         """ Update postcodes for the country from the postcodes selected so far
53             as well as any externally supplied postcodes.
54         """
55         self._update_from_external(analyzer, project_dir)
56         to_add, to_delete, to_update = self._compute_changes(conn)
57
58         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
59                  self.country, len(to_add), len(to_delete), len(to_update))
60
61         with conn.cursor() as cur:
62             if to_add:
63                 cur.execute_values(
64                     """INSERT INTO location_postcode
65                          (place_id, indexed_status, country_code,
66                           postcode, geometry) VALUES %s""",
67                     to_add,
68                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
69                                           %s, 'SRID=4326;POINT(%s %s)')
70                                        """).format(pysql.Literal(self.country)))
71             if to_delete:
72                 cur.execute("""DELETE FROM location_postcode
73                                WHERE country_code = %s and postcode = any(%s)
74                             """, (self.country, to_delete))
75             if to_update:
76                 cur.execute_values(
77                     pysql.SQL("""UPDATE location_postcode
78                                  SET indexed_status = 2,
79                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
80                                  FROM (VALUES %s) AS v (pc, x, y)
81                                  WHERE country_code = {} and postcode = pc
82                               """).format(pysql.Literal(self.country)), to_update)
83
84
85     def _compute_changes(self, conn):
86         """ Compute which postcodes from the collected postcodes have to be
87             added or modified and which from the location_postcode table
88             have to be deleted.
89         """
90         to_update = []
91         to_delete = []
92         with conn.cursor() as cur:
93             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
94                            FROM location_postcode
95                            WHERE country_code = %s""",
96                         (self.country, ))
97             for postcode, x, y in cur:
98                 pcobj = self.collected.pop(postcode, None)
99                 if pcobj:
100                     newx, newy = pcobj.centroid()
101                     if (x - newx) > 0.0000001 or (y - newy) > 0.0000001:
102                         to_update.append((postcode, newx, newy))
103                 else:
104                     to_delete.append(postcode)
105
106         to_add = [(k, *v.centroid()) for k, v in self.collected.items()]
107         self.collected = None
108
109         return to_add, to_delete, to_update
110
111
112     def _update_from_external(self, analyzer, project_dir):
113         """ Look for an external postcode file for the active country in
114             the project directory and add missing postcodes when found.
115         """
116         csvfile = self._open_external(project_dir)
117         if csvfile is None:
118             return
119
120         try:
121             reader = csv.DictReader(csvfile)
122             for row in reader:
123                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
124                     LOG.warning("Bad format for external postcode file for country '%s'."
125                                 " Ignored.", self.country)
126                     return
127                 postcode = analyzer.normalize_postcode(row['postcode'])
128                 if postcode not in self.collected:
129                     try:
130                         # Do float conversation separately, it might throw
131                         centroid = (_to_float(row['lon'], 180),
132                                     _to_float(row['lat'], 90))
133                         self.collected[postcode] += centroid
134                     except ValueError:
135                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
136                                     row['lat'], row['lon'], self.country)
137
138         finally:
139             csvfile.close()
140
141
142     def _open_external(self, project_dir):
143         fname = project_dir / f'{self.country}_postcodes.csv'
144
145         if fname.is_file():
146             LOG.info("Using external postcode file '%s'.", fname)
147             return open(fname, 'r', encoding='utf-8')
148
149         fname = project_dir / f'{self.country}_postcodes.csv.gz'
150
151         if fname.is_file():
152             LOG.info("Using external postcode file '%s'.", fname)
153             return gzip.open(fname, 'rt')
154
155         return None
156
157
158 def update_postcodes(dsn, project_dir, tokenizer):
159     """ Update the table of artificial postcodes.
160
161         Computes artificial postcode centroids from the placex table,
162         potentially enhances it with external data and then updates the
163         postcodes in the table 'location_postcode'.
164     """
165     with tokenizer.name_analyzer() as analyzer:
166         with connect(dsn) as conn:
167             # First get the list of countries that currently have postcodes.
168             # (Doing this before starting to insert, so it is fast on import.)
169             with conn.cursor() as cur:
170                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
171                 todo_countries = set((row[0] for row in cur))
172
173             # Recompute the list of valid postcodes from placex.
174             with conn.cursor(name="placex_postcodes") as cur:
175                 cur.execute("""
176                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
177                 FROM (SELECT
178                         COALESCE(plx.country_code,
179                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
180                         token_normalized_postcode(pl.address->'postcode') as pc,
181                         COALESCE(plx.centroid, ST_Centroid(pl.geometry)) as centroid
182                       FROM place AS pl LEFT OUTER JOIN placex AS plx
183                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
184                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null) xx
185                 WHERE pc IS NOT null AND cc IS NOT null
186                 ORDER BY country_code, pc""")
187
188                 collector = None
189
190                 for country, postcode, x, y in cur:
191                     if collector is None or country != collector.country:
192                         if collector is not None:
193                             collector.commit(conn, analyzer, project_dir)
194                         collector = _CountryPostcodesCollector(country)
195                         todo_countries.discard(country)
196                     collector.add(postcode, x, y)
197
198                 if collector is not None:
199                     collector.commit(conn, analyzer, project_dir)
200
201             # Now handle any countries that are only in the postcode table.
202             for country in todo_countries:
203                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
204
205             conn.commit()
206
207         analyzer.update_postcodes_from_db()
208
209 def can_compute(dsn):
210     """
211         Check that the place table exists so that
212         postcodes can be computed.
213     """
214     with connect(dsn) as conn:
215         return conn.table_exists('place')