2 Functions for importing, updating and otherwise maintaining the table
 
   3 of artificial postcode centroids.
 
   8 from math import isfinite
 
  10 from psycopg2.extras import execute_values
 
  12 from nominatim.db.connection import connect
 
  14 LOG = logging.getLogger()
 
  16 def _to_float(num, max_value):
 
  17     """ Convert the number in string into a float. The number is expected
 
  18         to be in the range of [-max_value, max_value]. Otherwise rises a
 
  22     if not isfinite(num) or num <= -max_value or num >= max_value:
 
  27 class _CountryPostcodesCollector:
 
  28     """ Collector for postcodes of a single country.
 
  31     def __init__(self, country):
 
  32         self.country = country
 
  33         self.collected = dict()
 
  36     def add(self, postcode, x, y):
 
  37         """ Add the given postcode to the collection cache. If the postcode
 
  38             already existed, it is overwritten with the new centroid.
 
  40         self.collected[postcode] = (x, y)
 
  43     def commit(self, conn, analyzer, project_dir):
 
  44         """ Update postcodes for the country from the postcodes selected so far
 
  45             as well as any externally supplied postcodes.
 
  47         self._update_from_external(analyzer, project_dir)
 
  48         to_add, to_delete, to_update = self._compute_changes(conn)
 
  50         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
 
  51                  self.country, len(to_add), len(to_delete), len(to_update))
 
  53         with conn.cursor() as cur:
 
  56                                """INSERT INTO location_postcode
 
  57                                       (place_id, indexed_status, country_code,
 
  58                                        postcode, geometry) VALUES %s""",
 
  60                                template="""(nextval('seq_place'), 1, '{}',
 
  61                                            %s, 'SRID=4326;POINT(%s %s)')
 
  62                                         """.format(self.country))
 
  64                 cur.execute("""DELETE FROM location_postcode
 
  65                                WHERE country_code = %s and postcode = any(%s)
 
  66                             """, (self.country, to_delete))
 
  69                                """UPDATE location_postcode
 
  70                                   SET indexed_status = 2,
 
  71                                       geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
 
  72                                   FROM (VALUES %s) AS v (pc, x, y)
 
  73                                   WHERE country_code = '{}' and postcode = pc
 
  74                                """.format(self.country),
 
  78     def _compute_changes(self, conn):
 
  79         """ Compute which postcodes from the collected postcodes have to be
 
  80             added or modified and which from the location_postcode table
 
  85         with conn.cursor() as cur:
 
  86             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
 
  87                            FROM location_postcode
 
  88                            WHERE country_code = %s""",
 
  90             for postcode, x, y in cur:
 
  91                 newx, newy = self.collected.pop(postcode, (None, None))
 
  93                     dist = (x - newx)**2 + (y - newy)**2
 
  95                         to_update.append((postcode, newx, newy))
 
  97                     to_delete.append(postcode)
 
  99         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
 
 102         return to_add, to_delete, to_update
 
 105     def _update_from_external(self, analyzer, project_dir):
 
 106         """ Look for an external postcode file for the active country in
 
 107             the project directory and add missing postcodes when found.
 
 109         csvfile = self._open_external(project_dir)
 
 114             reader = csv.DictReader(csvfile)
 
 116                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
 
 117                     LOG.warning("Bad format for external postcode file for country '%s'."
 
 118                                 " Ignored.", self.country)
 
 120                 postcode = analyzer.normalize_postcode(row['postcode'])
 
 121                 if postcode not in self.collected:
 
 123                         self.collected[postcode] = (_to_float(row['lon'], 180),
 
 124                                                     _to_float(row['lat'], 90))
 
 126                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
 
 127                                     row['lat'], row['lon'], self.country)
 
 133     def _open_external(self, project_dir):
 
 134         fname = project_dir / '{}_postcodes.csv'.format(self.country)
 
 137             LOG.info("Using external postcode file '%s'.", fname)
 
 138             return open(fname, 'r')
 
 140         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
 
 143             LOG.info("Using external postcode file '%s'.", fname)
 
 144             return gzip.open(fname, 'rt')
 
 149 def update_postcodes(dsn, project_dir, tokenizer):
 
 150     """ Update the table of artificial postcodes.
 
 152         Computes artificial postcode centroids from the placex table,
 
 153         potentially enhances it with external data and then updates the
 
 154         postcodes in the table 'location_postcode'.
 
 156     with tokenizer.name_analyzer() as analyzer:
 
 157         with connect(dsn) as conn:
 
 158             # First get the list of countries that currently have postcodes.
 
 159             # (Doing this before starting to insert, so it is fast on import.)
 
 160             with conn.cursor() as cur:
 
 161                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
 
 162                 todo_countries = set((row[0] for row in cur))
 
 164             # Recompute the list of valid postcodes from placex.
 
 165             with conn.cursor(name="placex_postcodes") as cur:
 
 166                 cur.execute("""SELECT country_code, pc, ST_X(centroid), ST_Y(centroid)
 
 169                                         token_normalized_postcode(address->'postcode') as pc,
 
 170                                         ST_Centroid(ST_Collect(ST_Centroid(geometry))) as centroid
 
 172                                  WHERE address ? 'postcode' and geometry IS NOT null
 
 173                                        and country_code is not null
 
 174                                  GROUP BY country_code, pc) xx
 
 176                                ORDER BY country_code, pc""")
 
 180                 for country, postcode, x, y in cur:
 
 181                     if collector is None or country != collector.country:
 
 182                         if collector is not None:
 
 183                             collector.commit(conn, analyzer, project_dir)
 
 184                         collector = _CountryPostcodesCollector(country)
 
 185                         todo_countries.discard(country)
 
 186                     collector.add(postcode, x, y)
 
 188                 if collector is not None:
 
 189                     collector.commit(conn, analyzer, project_dir)
 
 191             # Now handle any countries that are only in the postcode table.
 
 192             for country in todo_countries:
 
 193                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
 
 197         analyzer.update_postcodes_from_db()