1 # SPDX-License-Identifier: GPL-2.0-only
 
   3 # This file is part of Nominatim. (https://nominatim.org)
 
   5 # Copyright (C) 2022 by the Nominatim developer community.
 
   6 # For a full list of authors see the git log.
 
   8 Functions for importing, updating and otherwise maintaining the table
 
   9 of artificial postcode centroids.
 
  14 from math import isfinite
 
  16 from psycopg2 import sql as pysql
 
  18 from nominatim.db.connection import connect
 
  20 LOG = logging.getLogger()
 
  22 def _to_float(num, max_value):
 
  23     """ Convert the number in string into a float. The number is expected
 
  24         to be in the range of [-max_value, max_value]. Otherwise rises a
 
  28     if not isfinite(num) or num <= -max_value or num >= max_value:
 
  33 class _CountryPostcodesCollector:
 
  34     """ Collector for postcodes of a single country.
 
  37     def __init__(self, country):
 
  38         self.country = country
 
  39         self.collected = dict()
 
  42     def add(self, postcode, x, y):
 
  43         """ Add the given postcode to the collection cache. If the postcode
 
  44             already existed, it is overwritten with the new centroid.
 
  46         self.collected[postcode] = (x, y)
 
  49     def commit(self, conn, analyzer, project_dir):
 
  50         """ Update postcodes for the country from the postcodes selected so far
 
  51             as well as any externally supplied postcodes.
 
  53         self._update_from_external(analyzer, project_dir)
 
  54         to_add, to_delete, to_update = self._compute_changes(conn)
 
  56         LOG.info("Processing country '%s' (%s added, %s deleted, %s updated).",
 
  57                  self.country, len(to_add), len(to_delete), len(to_update))
 
  59         with conn.cursor() as cur:
 
  62                     """INSERT INTO location_postcode
 
  63                          (place_id, indexed_status, country_code,
 
  64                           postcode, geometry) VALUES %s""",
 
  66                     template=pysql.SQL("""(nextval('seq_place'), 1, {},
 
  67                                           %s, 'SRID=4326;POINT(%s %s)')
 
  68                                        """).format(pysql.Literal(self.country)))
 
  70                 cur.execute("""DELETE FROM location_postcode
 
  71                                WHERE country_code = %s and postcode = any(%s)
 
  72                             """, (self.country, to_delete))
 
  75                     pysql.SQL("""UPDATE location_postcode
 
  76                                  SET indexed_status = 2,
 
  77                                      geometry = ST_SetSRID(ST_Point(v.x, v.y), 4326)
 
  78                                  FROM (VALUES %s) AS v (pc, x, y)
 
  79                                  WHERE country_code = {} and postcode = pc
 
  80                               """).format(pysql.Literal(self.country)), to_update)
 
  83     def _compute_changes(self, conn):
 
  84         """ Compute which postcodes from the collected postcodes have to be
 
  85             added or modified and which from the location_postcode table
 
  90         with conn.cursor() as cur:
 
  91             cur.execute("""SELECT postcode, ST_X(geometry), ST_Y(geometry)
 
  92                            FROM location_postcode
 
  93                            WHERE country_code = %s""",
 
  95             for postcode, x, y in cur:
 
  96                 newx, newy = self.collected.pop(postcode, (None, None))
 
  98                     dist = (x - newx)**2 + (y - newy)**2
 
 100                         to_update.append((postcode, newx, newy))
 
 102                     to_delete.append(postcode)
 
 104         to_add = [(k, v[0], v[1]) for k, v in self.collected.items()]
 
 107         return to_add, to_delete, to_update
 
 110     def _update_from_external(self, analyzer, project_dir):
 
 111         """ Look for an external postcode file for the active country in
 
 112             the project directory and add missing postcodes when found.
 
 114         csvfile = self._open_external(project_dir)
 
 119             reader = csv.DictReader(csvfile)
 
 121                 if 'postcode' not in row or 'lat' not in row or 'lon' not in row:
 
 122                     LOG.warning("Bad format for external postcode file for country '%s'."
 
 123                                 " Ignored.", self.country)
 
 125                 postcode = analyzer.normalize_postcode(row['postcode'])
 
 126                 if postcode not in self.collected:
 
 128                         self.collected[postcode] = (_to_float(row['lon'], 180),
 
 129                                                     _to_float(row['lat'], 90))
 
 131                         LOG.warning("Bad coordinates %s, %s in %s country postcode file.",
 
 132                                     row['lat'], row['lon'], self.country)
 
 138     def _open_external(self, project_dir):
 
 139         fname = project_dir / '{}_postcodes.csv'.format(self.country)
 
 142             LOG.info("Using external postcode file '%s'.", fname)
 
 143             return open(fname, 'r')
 
 145         fname = project_dir / '{}_postcodes.csv.gz'.format(self.country)
 
 148             LOG.info("Using external postcode file '%s'.", fname)
 
 149             return gzip.open(fname, 'rt')
 
 154 def update_postcodes(dsn, project_dir, tokenizer):
 
 155     """ Update the table of artificial postcodes.
 
 157         Computes artificial postcode centroids from the placex table,
 
 158         potentially enhances it with external data and then updates the
 
 159         postcodes in the table 'location_postcode'.
 
 161     with tokenizer.name_analyzer() as analyzer:
 
 162         with connect(dsn) as conn:
 
 163             # First get the list of countries that currently have postcodes.
 
 164             # (Doing this before starting to insert, so it is fast on import.)
 
 165             with conn.cursor() as cur:
 
 166                 cur.execute("SELECT DISTINCT country_code FROM location_postcode")
 
 167                 todo_countries = set((row[0] for row in cur))
 
 169             # Recompute the list of valid postcodes from placex.
 
 170             with conn.cursor(name="placex_postcodes") as cur:
 
 172                 SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
 
 174                         COALESCE(plx.country_code,
 
 175                                  get_country_code(ST_Centroid(pl.geometry))) as cc,
 
 176                         token_normalized_postcode(pl.address->'postcode') as pc,
 
 177                         ST_Centroid(ST_Collect(COALESCE(plx.centroid,
 
 178                                                         ST_Centroid(pl.geometry)))) as centroid
 
 179                       FROM place AS pl LEFT OUTER JOIN placex AS plx
 
 180                              ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
 
 181                     WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
 
 183                 WHERE pc IS NOT null AND cc IS NOT null
 
 184                 ORDER BY country_code, pc""")
 
 188                 for country, postcode, x, y in cur:
 
 189                     if collector is None or country != collector.country:
 
 190                         if collector is not None:
 
 191                             collector.commit(conn, analyzer, project_dir)
 
 192                         collector = _CountryPostcodesCollector(country)
 
 193                         todo_countries.discard(country)
 
 194                     collector.add(postcode, x, y)
 
 196                 if collector is not None:
 
 197                     collector.commit(conn, analyzer, project_dir)
 
 199             # Now handle any countries that are only in the postcode table.
 
 200             for country in todo_countries:
 
 201                 _CountryPostcodesCollector(country).commit(conn, analyzer, project_dir)
 
 205         analyzer.update_postcodes_from_db()
 
 207 def can_compute(dsn):
 
 209         Check that the place table exists so that
 
 210         postcodes can be computed.
 
 212     with connect(dsn) as conn:
 
 213         return conn.table_exists('place')