2 Functions for importing, updating and otherwise maintaining the table
 
   3 of artificial postcode centroids.
 
   8 from math import isfinite
 
  10 from psycopg2 import sql as pysql
 
  12 from nominatim.db.connection import connect
 
  14 LOG = logging.getLogger()
 
  16 def _to_float(num, max_value):
 
  17     """ Convert the number in string into a float. The number is expected
 
  18         to be in the range of [-max_value, max_value]. Otherwise rises a
 
  22     if not isfinite(num) or num <= -max_value or num >= max_value:
 
  27 class _CountryPostcodesCollector:
 
  28     """ Collector for postcodes of a single country.
 
  31     def __init__(self, country):
 
  32         self.country = country
 
  33         self.collected = dict()
 
  36     def add(self, postcode, x, y):
 
  37         """ Add the given postcode to the collection cache. If the postcode
 
  38             already existed, it is overwritten with the new centroid.
 
  40         self.collected[postcode] = (x, y)
 
  43     def commit(self, conn, analyzer, project_dir):
 
  44         """ Update postcodes for the country from the postcodes selected so far
 
  45             as well as any externally supplied postcodes.
 
  47         self._update_from_external(analyzer, project_dir)
 
  48         to_add, to_delete, to_update = self._compute_changes(conn)
 
  50         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
 
  51                  self.country, len(to_add), len(to_delete), len(to_update))
 
  53         with conn.cursor() as cur:
 
  56                     """INSERT INTO location_postcode
 
  57                          (place_id, indexed_status, country_code,
 
  58                           postcode, geometry) VALUES %s""",
 
  60                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
 
  61                                           %s, 'SRID=4326;POINT(%s %s)')
 
  62                                        """).format(pysql.Literal(self.country)))
 
  64                 cur.execute("""DELETE FROM location_postcode
 
  65                                WHERE country_code = %s and postcode = any(%s)
 
  66                             """, (self.country, to_delete))
 
  69                     pysql.SQL("""UPDATE location_postcode
 
  70                                  SET indexed_status = 2,
 
  71                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
 
  72                                  FROM (VALUES %s) AS v (pc, x, y)
 
  73                                  WHERE country_code = {} and postcode = pc
 
  74                               """).format(pysql.Literal(self.country)), to_update)
 
  77     def _compute_changes(self, conn):
 
  78         """ Compute which postcodes from the collected postcodes have to be
 
  79             added or modified and which from the location_postcode table
 
  84         with conn.cursor() as cur:
 
  85             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
 
  86                            FROM location_postcode
 
  87                            WHERE country_code = %s""",
 
  89             for postcode, x, y in cur:
 
  90                 newx, newy = self.collected.pop(postcode, (None, None))
 
  92                     dist = (x - newx)**2 + (y - newy)**2
 
  94                         to_update.append((postcode, newx, newy))
 
  96                     to_delete.append(postcode)
 
  98         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
 
 101         return to_add, to_delete, to_update
 
 104     def _update_from_external(self, analyzer, project_dir):
 
 105         """ Look for an external postcode file for the active country in
 
 106             the project directory and add missing postcodes when found.
 
 108         csvfile = self._open_external(project_dir)
 
 113             reader = csv.DictReader(csvfile)
 
 115                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
 
 116                     LOG.warning("Bad format for external postcode file for country '%s'."
 
 117                                 " Ignored.", self.country)
 
 119                 postcode = analyzer.normalize_postcode(row['postcode'])
 
 120                 if postcode not in self.collected:
 
 122                         self.collected[postcode] = (_to_float(row['lon'], 180),
 
 123                                                     _to_float(row['lat'], 90))
 
 125                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
 
 126                                     row['lat'], row['lon'], self.country)
 
 132     def _open_external(self, project_dir):
 
 133         fname = project_dir / '{}_postcodes.csv'.format(self.country)
 
 136             LOG.info("Using external postcode file '%s'.", fname)
 
 137             return open(fname, 'r')
 
 139         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
 
 142             LOG.info("Using external postcode file '%s'.", fname)
 
 143             return gzip.open(fname, 'rt')
 
 148 def update_postcodes(dsn, project_dir, tokenizer):
 
 149     """ Update the table of artificial postcodes.
 
 151         Computes artificial postcode centroids from the placex table,
 
 152         potentially enhances it with external data and then updates the
 
 153         postcodes in the table 'location_postcode'.
 
 155     with tokenizer.name_analyzer() as analyzer:
 
 156         with connect(dsn) as conn:
 
 157             # First get the list of countries that currently have postcodes.
 
 158             # (Doing this before starting to insert, so it is fast on import.)
 
 159             with conn.cursor() as cur:
 
 160                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
 
 161                 todo_countries = set((row[0] for row in cur))
 
 163             # Recompute the list of valid postcodes from placex.
 
 164             with conn.cursor(name="placex_postcodes") as cur:
 
 166                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
 
 168                         COALESCE(plx.country_code,
 
 169                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
 
 170                         token_normalized_postcode(pl.address->'postcode') as pc,
 
 171                         ST_Centroid(ST_Collect(COALESCE(plx.centroid,
 
 172                                                         ST_Centroid(pl.geometry)))) as centroid
 
 173                       FROM place AS pl LEFT OUTER JOIN placex AS plx
 
 174                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
 
 175                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
 
 177                 WHERE pc IS NOT null AND cc IS NOT null
 
 178                 ORDER BY country_code, pc""")
 
 182                 for country, postcode, x, y in cur:
 
 183                     if collector is None or country != collector.country:
 
 184                         if collector is not None:
 
 185                             collector.commit(conn, analyzer, project_dir)
 
 186                         collector = _CountryPostcodesCollector(country)
 
 187                         todo_countries.discard(country)
 
 188                     collector.add(postcode, x, y)
 
 190                 if collector is not None:
 
 191                     collector.commit(conn, analyzer, project_dir)
 
 193             # Now handle any countries that are only in the postcode table.
 
 194             for country in todo_countries:
 
 195                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
 
 199         analyzer.update_postcodes_from_db()
 
 201 def can_compute(dsn):
 
 203         Check that the place table exists so that
 
 204         postcodes can be computed.
 
 206     with connect(dsn) as conn:
 
 207         return conn.table_exists('place')