]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
more formatting fixes
[nominatim.git] / nominatim / tools / postcodes.py
1 """
2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
4 """
5 import csv
6 import gzip
7 import logging
8 from math import isfinite
9
10 from psycopg2.extras import execute_values
11
12 from nominatim.db.connection import connect
13
14 LOG = logging.getLogger()
15
16 def _to_float(num, max_value):
17     """ Convert the number in string into a float. The number is expected
18         to be in the range of [-max_value, max_value]. Otherwise rises a
19         ValueError.
20     """
21     num = float(num)
22     if not isfinite(num) or num <= -max_value or num >= max_value:
23         raise ValueError()
24
25     return num
26
27 class _CountryPostcodesCollector:
28     """ Collector for postcodes of a single country.
29     """
30
31     def __init__(self, country):
32         self.country = country
33         self.collected = dict()
34
35
36     def add(self, postcode, x, y):
37         """ Add the given postcode to the collection cache. If the postcode
38             already existed, it is overwritten with the new centroid.
39         """
40         self.collected[postcode] = (x, y)
41
42
43     def commit(self, conn, analyzer, project_dir):
44         """ Update postcodes for the country from the postcodes selected so far
45             as well as any externally supplied postcodes.
46         """
47         self._update_from_external(analyzer, project_dir)
48         to_add, to_delete, to_update = self._compute_changes(conn)
49
50         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
51                  self.country, len(to_add), len(to_delete), len(to_update))
52
53         with conn.cursor() as cur:
54             if to_add:
55                 execute_values(cur,
56                                """INSERT INTO location_postcode
57                                       (place_id, indexed_status, country_code,
58                                        postcode, geometry) VALUES %s""",
59                                to_add,
60                                template="""(nextval('seq_place'), 1, '{}',
61                                            %s, 'SRID=4326;POINT(%s %s)')
62                                         """.format(self.country))
63             if to_delete:
64                 cur.execute("""DELETE FROM location_postcode
65                                WHERE country_code = %s and postcode = any(%s)
66                             """, (self.country, to_delete))
67             if to_update:
68                 execute_values(cur,
69                                """UPDATE location_postcode
70                                   SET indexed_status = 2,
71                                       geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
72                                   FROM (VALUES %s) AS v (pc, x, y)
73                                   WHERE country_code = '{}' and postcode = pc
74                                """.format(self.country),
75                                to_update)
76
77
78     def _compute_changes(self, conn):
79         """ Compute which postcodes from the collected postcodes have to be
80             added or modified and which from the location_postcode table
81             have to be deleted.
82         """
83         to_update = []
84         to_delete = []
85         with conn.cursor() as cur:
86             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
87                            FROM location_postcode
88                            WHERE country_code = %s""",
89                         (self.country, ))
90             for postcode, x, y in cur:
91                 newx, newy = self.collected.pop(postcode, (None, None))
92                 if newx is not None:
93                     dist = (x - newx)**2 + (y - newy)**2
94                     if dist > 0.0000001:
95                         to_update.append((postcode, newx, newy))
96                 else:
97                     to_delete.append(postcode)
98
99         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
100         self.collected = []
101
102         return to_add, to_delete, to_update
103
104
105     def _update_from_external(self, analyzer, project_dir):
106         """ Look for an external postcode file for the active country in
107             the project directory and add missing postcodes when found.
108         """
109         csvfile = self._open_external(project_dir)
110         if csvfile is None:
111             return
112
113         try:
114             reader = csv.DictReader(csvfile)
115             for row in reader:
116                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
117                     LOG.warning("Bad format for external postcode file for country '%s'."
118                                 " Ignored.", self.country)
119                     return
120                 postcode = analyzer.normalize_postcode(row['postcode'])
121                 if postcode not in self.collected:
122                     try:
123                         self.collected[postcode] = (_to_float(row['lon'], 180),
124                                                     _to_float(row['lat'], 90))
125                     except ValueError:
126                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
127                                     row['lat'], row['lon'], self.country)
128
129         finally:
130             csvfile.close()
131
132
133     def _open_external(self, project_dir):
134         fname = project_dir / '{}_postcodes.csv'.format(self.country)
135
136         if fname.is_file():
137             LOG.info("Using external postcode file '%s'.", fname)
138             return open(fname, 'r')
139
140         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
141
142         if fname.is_file():
143             LOG.info("Using external postcode file '%s'.", fname)
144             return gzip.open(fname, 'rt')
145
146         return None
147
148
149 def update_postcodes(dsn, project_dir, tokenizer):
150     """ Update the table of artificial postcodes.
151
152         Computes artificial postcode centroids from the placex table,
153         potentially enhances it with external data and then updates the
154         postcodes in the table 'location_postcode'.
155     """
156     with tokenizer.name_analyzer() as analyzer:
157         with connect(dsn) as conn:
158             # First get the list of countries that currently have postcodes.
159             # (Doing this before starting to insert, so it is fast on import.)
160             with conn.cursor() as cur:
161                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
162                 todo_countries = set((row[0] for row in cur))
163
164             # Recompute the list of valid postcodes from placex.
165             with conn.cursor(name="placex_postcodes") as cur:
166                 cur.execute("""
167                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
168                 FROM (SELECT
169                         COALESCE(plx.country_code,
170                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
171                         token_normalized_postcode(pl.address->'postcode') as pc,
172                         ST_Centroid(ST_Collect(COALESCE(plx.centroid,
173                                                         ST_Centroid(pl.geometry)))) as centroid
174                       FROM place AS pl LEFT OUTER JOIN placex AS plx
175                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
176                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
177                     GROUP BY cc, pc) xx
178                 WHERE pc IS NOT null AND cc IS NOT null
179                 ORDER BY country_code, pc""")
180
181                 collector = None
182
183                 for country, postcode, x, y in cur:
184                     if collector is None or country != collector.country:
185                         if collector is not None:
186                             collector.commit(conn, analyzer, project_dir)
187                         collector = _CountryPostcodesCollector(country)
188                         todo_countries.discard(country)
189                     collector.add(postcode, x, y)
190
191                 if collector is not None:
192                     collector.commit(conn, analyzer, project_dir)
193
194             # Now handle any countries that are only in the postcode table.
195             for country in todo_countries:
196                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
197
198             conn.commit()
199
200         analyzer.update_postcodes_from_db()
201
202 def can_compute(dsn):
203     """
204         Check that the place table exists so that
205         postcodes can be computed.
206     """
207     with connect(dsn) as conn:
208         return conn.table_exists('place')