]> git.openstreetmap.org Git - nominatim.git/blob - nominatim/tools/postcodes.py
ignore entries without country code
[nominatim.git] / nominatim / tools / postcodes.py
1 """
2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
4 """
5 import csv
6 import gzip
7 import logging
8
9 from psycopg2.extras import execute_values
10
11 from nominatim.db.connection import connect
12
13 LOG = logging.getLogger()
14
15 class _CountryPostcodesCollector:
16     """ Collector for postcodes of a single country.
17     """
18
19     def __init__(self, country):
20         self.country = country
21         self.collected = dict()
22
23
24     def add(self, postcode, x, y):
25         """ Add the given postcode to the collection cache. If the postcode
26             already existed, it is overwritten with the new centroid.
27         """
28         self.collected[postcode] = (x, y)
29
30
31     def commit(self, conn, analyzer, project_dir):
32         """ Update postcodes for the country from the postcodes selected so far
33             as well as any externally supplied postcodes.
34         """
35         self._update_from_external(analyzer, project_dir)
36         to_add, to_delete, to_update = self._compute_changes(conn)
37
38         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
39                  self.country, len(to_add), len(to_delete), len(to_update))
40
41         with conn.cursor() as cur:
42             if to_add:
43                 execute_values(cur,
44                                """INSERT INTO location_postcode
45                                       (place_id, indexed_status, country_code,
46                                        postcode, geometry) VALUES %s""",
47                                to_add,
48                                template="""(nextval('seq_place'), 1, '{}',
49                                            %s, 'SRID=4326;POINT(%s %s)')
50                                         """.format(self.country))
51             if to_delete:
52                 cur.execute("""DELETE FROM location_postcode
53                                WHERE country_code = %s and postcode = any(%s)
54                             """, (self.country, to_delete))
55             if to_update:
56                 execute_values(cur,
57                                """UPDATE location_postcode
58                                   SET indexed_status = 2,
59                                       geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
60                                   FROM (VALUES %s) AS v (pc, x, y)
61                                   WHERE country_code = '{}' and postcode = pc
62                                """.format(self.country),
63                                to_update)
64
65
66     def _compute_changes(self, conn):
67         """ Compute which postcodes from the collected postcodes have to be
68             added or modified and which from the location_postcode table
69             have to be deleted.
70         """
71         to_update = []
72         to_delete = []
73         with conn.cursor() as cur:
74             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
75                            FROM location_postcode
76                            WHERE country_code = %s""",
77                         (self.country, ))
78             for postcode, x, y in cur:
79                 newx, newy = self.collected.pop(postcode, (None, None))
80                 if newx is not None:
81                     dist = (x - newx)**2 + (y - newy)**2
82                     if dist > 0.0000001:
83                         to_update.append((postcode, newx, newy))
84                 else:
85                     to_delete.append(postcode)
86
87         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
88         self.collected = []
89
90         return to_add, to_delete, to_update
91
92
93     def _update_from_external(self, analyzer, project_dir):
94         """ Look for an external postcode file for the active country in
95             the project directory and add missing postcodes when found.
96         """
97         csvfile = self._open_external(project_dir)
98         if csvfile is None:
99             return
100
101         try:
102             reader = csv.DictReader(csvfile)
103             for row in reader:
104                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
105                     LOG.warning("Bad format for external postcode file for country '%s'."
106                                 " Ignored.", self.country)
107                     return
108                 postcode = analyzer.normalize_postcode(row['postcode'])
109                 if postcode not in self.collected:
110                     try:
111                         self.collected[postcode] = (float(row['lon']), float(row['lat']))
112                     except ValueError:
113                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
114                                     row['lat'], row['lon'], self.country)
115
116         finally:
117             csvfile.close()
118
119
120     def _open_external(self, project_dir):
121         fname = project_dir / '{}_postcodes.csv'.format(self.country)
122
123         if fname.is_file():
124             LOG.info("Using external postcode file '%s'.", fname)
125             return open(fname, 'r')
126
127         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
128
129         if fname.is_file():
130             LOG.info("Using external postcode file '%s'.", fname)
131             return gzip.open(fname, 'rt')
132
133         return None
134
135
136 def update_postcodes(dsn, project_dir, tokenizer):
137     """ Update the table of artificial postcodes.
138
139         Computes artificial postcode centroids from the placex table,
140         potentially enhances it with external data and then updates the
141         postcodes in the table 'location_postcode'.
142     """
143     with tokenizer.name_analyzer() as analyzer:
144         with connect(dsn) as conn:
145             # First get the list of countries that currently have postcodes.
146             # (Doing this before starting to insert, so it is fast on import.)
147             with conn.cursor() as cur:
148                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
149                 todo_countries = set((row[0] for row in cur))
150
151             # Recompute the list of valid postcodes from placex.
152             with conn.cursor(name="placex_postcodes") as cur:
153                 cur.execute("""SELECT country_code, pc, ST_X(centroid), ST_Y(centroid)
154                                FROM (
155                                  SELECT country_code,
156                                         token_normalized_postcode(address->'postcode') as pc,
157                                         ST_Centroid(ST_Collect(ST_Centroid(geometry))) as centroid
158                                  FROM placex
159                                  WHERE address ? 'postcode' and geometry IS NOT null
160                                        and country_code is not null
161                                  GROUP BY country_code, pc) xx
162                                WHERE pc is not null
163                                ORDER BY country_code, pc""")
164
165                 collector = None
166
167                 for country, postcode, x, y in cur:
168                     if collector is None or country != collector.country:
169                         if collector is not None:
170                             collector.commit(conn, analyzer, project_dir)
171                         collector = _CountryPostcodesCollector(country)
172                         todo_countries.discard(country)
173                     collector.add(postcode, x, y)
174
175                 if collector is not None:
176                     collector.commit(conn, analyzer, project_dir)
177
178             # Now handle any countries that are only in the postcode table.
179             for country in todo_countries:
180                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
181
182             conn.commit()
183
184         analyzer.update_postcodes_from_db()