2 Functions for importing, updating and otherwise maintaining the table
3 of artificial postcode centroids.
8 from math import isfinite
10 from psycopg2.extras import execute_values
12 from nominatim.db.connection import connect
14 LOG = logging.getLogger()
16 def _to_float(num, max_value):
17 """ Convert the number in string into a float. The number is expected
18 to be in the range of [-max_value, max_value]. Otherwise rises a
22 if not isfinite(num) or num <= -max_value or num >= max_value:
27 class _CountryPostcodesCollector:
28 """ Collector for postcodes of a single country.
31 def __init__(self, country):
32 self.country = country
33 self.collected = dict()
36 def add(self, postcode, x, y):
37 """ Add the given postcode to the collection cache. If the postcode
38 already existed, it is overwritten with the new centroid.
40 self.collected[postcode] = (x, y)
43 def commit(self, conn, analyzer, project_dir):
44 """ Update postcodes for the country from the postcodes selected so far
45 as well as any externally supplied postcodes.
47 self._update_from_external(analyzer, project_dir)
48 to_add, to_delete, to_update = self._compute_changes(conn)
50 LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
51 self.country, len(to_add), len(to_delete), len(to_update))
53 with conn.cursor() as cur:
56 """INSERT INTO location_postcode
57 (place_id, indexed_status, country_code,
58 postcode, geometry) VALUES %s""",
60 template="""(nextval('seq_place'), 1, '{}',
61 %s, 'SRID=4326;POINT(%s %s)')
62 """.format(self.country))
64 cur.execute("""DELETE FROM location_postcode
65 WHERE country_code = %s and postcode = any(%s)
66 """, (self.country, to_delete))
69 """UPDATE location_postcode
70 SET indexed_status = 2,
71 geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
72 FROM (VALUES %s) AS v (pc, x, y)
73 WHERE country_code = '{}' and postcode = pc
74 """.format(self.country),
78 def _compute_changes(self, conn):
79 """ Compute which postcodes from the collected postcodes have to be
80 added or modified and which from the location_postcode table
85 with conn.cursor() as cur:
86 cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
87 FROM location_postcode
88 WHERE country_code = %s""",
90 for postcode, x, y in cur:
91 newx, newy = self.collected.pop(postcode, (None, None))
93 dist = (x - newx)**2 + (y - newy)**2
95 to_update.append((postcode, newx, newy))
97 to_delete.append(postcode)
99 to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
102 return to_add, to_delete, to_update
105 def _update_from_external(self, analyzer, project_dir):
106 """ Look for an external postcode file for the active country in
107 the project directory and add missing postcodes when found.
109 csvfile = self._open_external(project_dir)
114 reader = csv.DictReader(csvfile)
116 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
117 LOG.warning("Bad format for external postcode file for country '%s'."
118 " Ignored.", self.country)
120 postcode = analyzer.normalize_postcode(row['postcode'])
121 if postcode not in self.collected:
123 self.collected[postcode] = (_to_float(row['lon'], 180),
124 _to_float(row['lat'], 90))
126 LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
127 row['lat'], row['lon'], self.country)
133 def _open_external(self, project_dir):
134 fname = project_dir / '{}_postcodes.csv'.format(self.country)
137 LOG.info("Using external postcode file '%s'.", fname)
138 return open(fname, 'r')
140 fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
143 LOG.info("Using external postcode file '%s'.", fname)
144 return gzip.open(fname, 'rt')
149 def update_postcodes(dsn, project_dir, tokenizer):
150 """ Update the table of artificial postcodes.
152 Computes artificial postcode centroids from the placex table,
153 potentially enhances it with external data and then updates the
154 postcodes in the table 'location_postcode'.
156 with tokenizer.name_analyzer() as analyzer:
157 with connect(dsn) as conn:
158 # First get the list of countries that currently have postcodes.
159 # (Doing this before starting to insert, so it is fast on import.)
160 with conn.cursor() as cur:
161 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
162 todo_countries = set((row[0] for row in cur))
164 # Recompute the list of valid postcodes from placex.
165 with conn.cursor(name="placex_postcodes") as cur:
167 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
170 COALESCE(plx.country_code, get_country_code(ST_Centroid(pl.geometry))) as cc,
171 token_normalized_postcode(pl.address->'postcode') as pc,
172 COALESCE(ST_Centroid(ST_Collect(plx.centroid)), ST_Centroid(ST_Collect(ST_Centroid(pl.geometry)))) as centroid
173 FROM place AS pl LEFT OUTER JOIN placex AS plx ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
174 WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
177 WHERE pc IS NOT null AND cc IS NOT null
178 ORDER BY country_code, pc""")
182 for country, postcode, x, y in cur:
183 if collector is None or country != collector.country:
184 if collector is not None:
185 collector.commit(conn, analyzer, project_dir)
186 collector = _CountryPostcodesCollector(country)
187 todo_countries.discard(country)
188 collector.add(postcode, x, y)
190 if collector is not None:
191 collector.commit(conn, analyzer, project_dir)
193 # Now handle any countries that are only in the postcode table.
194 for country in todo_countries:
195 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
199 analyzer.update_postcodes_from_db()
201 def can_compute(dsn):
203 Check that the place table exists so that
204 postcodes can be computed.
206 with connect(dsn) as conn:
207 return conn.table_exists('place')