]> git.openstreetmap.org Git - nominatim.git/commitdiff
2018 TIGER data conversion scripts, add documentation to /docs/data-sources
authormarc tobias <mtmail@gmx.net>
Fri, 28 Sep 2018 18:17:02 +0000 (20:17 +0200)
committerSarah Hoffmann <lonvia@denofr.de>
Thu, 15 Nov 2018 22:01:08 +0000 (23:01 +0100)
CMakeLists.txt
data-sources/us-tiger/README.md [new file with mode: 0644]
data-sources/us-tiger/convert.sh [new file with mode: 0755]
data-sources/us-tiger/tiger_address_convert.py [moved from utils/tigerAddressImport.py with 100% similarity]
data-sources/us-tiger/tiger_county_fips.json [moved from utils/tiger_county_fips.json with 100% similarity]
docs/CMakeLists.txt
docs/admin/Import-and-Update.md
docs/data-sources/overview.md [new file with mode: 0644]
docs/mkdocs.yml
test/bdd/api/reverse/queries.feature
utils/imports.php [deleted file]

index a7c7b3951463441c95adf6ddb9a44c67a4c8c6b8..d6f7d2cb94d5abcd32ef4d59672f4ce1230981cb 100644 (file)
@@ -105,7 +105,6 @@ set(CUSTOMFILES
     website/status.php
     utils/blocks.php
     utils/country_languages.php
-    utils/imports.php
     utils/importWikipedia.php
     utils/export.php
     utils/query.php
diff --git a/data-sources/us-tiger/README.md b/data-sources/us-tiger/README.md
new file mode 100644 (file)
index 0000000..e75a9ef
--- /dev/null
@@ -0,0 +1,29 @@
+# US TIGER address data
+
+Convert the [TIGER](https://www.census.gov/geo/maps-data/data/tiger.html)/Line dataset of the US Census Bureau to SQL files which can be imported by Nominatim. The created tables in the Nominatim database are separate from the OpenStreetMap tables and are queried separately at search time.
+
+The dataset gets updated once per year. Downloading is prone to be slow (can take a full day) and converting the files can take hours as well.
+
+Replace '2018' with the current year throughout.
+
+  1. Install the GDAL library and python bindings and the unzip tool
+
+        # Ubuntu:
+        sudo apt-get install python-gdal unzip
+        # CentOS:
+        sudo yum install gdal-python unzip
+
+  2. Get the TIGER 2018 data. You will need the EDGES files
+     (3,233 zip files, 11GB total).
+
+         wget -r ftp://ftp2.census.gov/geo/tiger/TIGER2018/EDGES/
+
+  3. Convert the data into SQL statements. Adjust the file paths in the scripts as needed
+
+        cd data-sources/us-tiger
+        ./convert.sh <input-path> <output-path>
+        
+  4. Optional: package the created files
+  
+        tar -czf tiger2018-nominatim-preprocessed.tar.gz tiger
+        
\ No newline at end of file
diff --git a/data-sources/us-tiger/convert.sh b/data-sources/us-tiger/convert.sh
new file mode 100755 (executable)
index 0000000..b94017e
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+INPATH=$1
+OUTPATH=$2
+
+if [[ ! -d "$INPATH" ]]; then
+    echo "input path does not exist"
+    exit 1
+fi
+
+if [[ ! -d "$OUTPATH" ]]; then
+    echo "output path does not exist"
+    exit 1
+fi
+
+INREGEX='_([0-9]{5})_edges.zip'
+WORKPATH="$OUTPATH/tmp-workdir/"
+mkdir -p "$WORKPATH"
+
+
+
+INFILES=($INPATH/*.zip)
+echo "Found ${#INFILES[*]} files."
+
+for F in ${INFILES[*]}; do
+    # echo $F
+
+    if [[ "$F" =~ $INREGEX ]]; then
+        COUNTYID=${BASH_REMATCH[1]}
+        SHAPEFILE="$WORKPATH/$(basename $F '.zip').shp"
+        SQLFILE="$OUTPATH/$COUNTYID.sql"
+
+        unzip -o -q -d "$WORKPATH" "$F"
+        if [[ ! -e "$SHAPEFILE" ]]; then
+            echo "Unzip failed. $SHAPEFILE not found."
+            exit 1
+        fi
+
+        ./tiger_address_convert.py "$SHAPEFILE" "$SQLFILE"
+
+        rm $WORKPATH/*
+    fi
+done
+
+OUTFILES=($OUTPATH/*.sql)
+echo "Wrote ${#OUTFILES[*]} files."
+
+rmdir $WORKPATH
index cbe91b91d355e90416b0aa30be7d096cec010b25..68af5429257b2501858f0b9fce0ccdd02ce02160 100644 (file)
@@ -10,8 +10,10 @@ ADD_CUSTOM_TARGET(doc
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/admin ${CMAKE_CURRENT_BINARY_DIR}/admin
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/develop ${CMAKE_CURRENT_BINARY_DIR}/develop
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/api ${CMAKE_CURRENT_BINARY_DIR}/api
+   COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/data-sources ${CMAKE_CURRENT_BINARY_DIR}/data-sources
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/index.md ${CMAKE_CURRENT_BINARY_DIR}/index.md
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${CMAKE_CURRENT_SOURCE_DIR}/extra.css ${CMAKE_CURRENT_BINARY_DIR}/extra.css
+   COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/us-tiger/README.md ${CMAKE_CURRENT_BINARY_DIR}/data-sources/US-Tiger.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-7.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-7.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-16.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-16.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
index 731ff8faee6bda0fb480b53e62fdceb7052b6bf9..847aa37d8eb98956d92b4789d0c7a976ada7df96 100644 (file)
@@ -101,52 +101,34 @@ Note that this command downloads the phrases from the wiki link above.
 
 ## Installing Tiger housenumber data for the US
 
-Nominatim is able to use the official TIGER address set to complement the
-OSM house number data in the US. You can add TIGER data to your own Nominatim
-instance by following these steps:
+Nominatim is able to use the official [TIGER](https://www.census.gov/geo/maps-data/data/tiger.html)
+address set to complement the OSM house number data in the US. You can add
+TIGER data to your own Nominatim instance by following these steps. The
+entire US adds about 10GB to your database.
 
-  1. Install the GDAL library and python bindings and the unzip tool
-
-       * Ubuntu: `sudo apt-get install python-gdal unzip`
-       * CentOS: `sudo yum install gdal-python unzip`
-
-  2. Get preprocessed TIGER 2017 data and unpack it into the
+  1. Get preprocessed TIGER 2018 data and unpack it into the
      data directory in your Nominatim sources:
 
         cd Nominatim/data
-        wget https://nominatim.org/data/tiger2017-nominatim-preprocessed.tar.gz
-        tar xf tiger2017-nominatim-preprocessed.tar.gz
+        wget https://nominatim.org/data/tiger2018-nominatim-preprocessed.tar.gz
+        tar xf tiger2018-nominatim-preprocessed.tar.gz
+
+    `data-sources/us-tiger/README.md` explains how the data got preprocessed.
 
-  3. Import the data into your Nominatim database: 
+  2. Import the data into your Nominatim database: 
 
         ./utils/setup.php --import-tiger-data
 
-  4. Enable use of the Tiger data in your `settings/local.php` by adding:
+  3. Enable use of the Tiger data in your `settings/local.php` by adding:
 
          @define('CONST_Use_US_Tiger_Data', true);
 
-  5. Apply the new settings:
+  4. Apply the new settings:
 
 ```sh
     ./utils/setup.php --create-functions --enable-diff-updates --create-partition-functions
 ```
 
-The entire US adds about 10GB to your database.
-
-You can also process the data from the original TIGER data to create the
-SQL files, Nominatim needs for the import:
-
-  1. Get the TIGER 2017 data. You will need the EDGES files
-     (3,234 zip files, 11GB total).
-
-         wget -r ftp://ftp2.census.gov/geo/tiger/TIGER2017/EDGES/
-
-  2. Convert the data into SQL statements: 
-
-         ./utils/imports.php --parse-tiger <tiger edge data directory>
-
-Be warned that this can take quite a long time. After this process is finished,
-the same preprocessed files as above are available in `data/tiger`.
 
 ## Updates
 
diff --git a/docs/data-sources/overview.md b/docs/data-sources/overview.md
new file mode 100644 (file)
index 0000000..a6dc0db
--- /dev/null
@@ -0,0 +1,4 @@
+# Additional Data Sources
+
+This guide explains how data sources other than OpenStreetMap mentioned in
+the install instructions were obtained and converted.
index b620decf19bbed792e0aa9b15f4c450d5e7a4d69..7c516070cc34a5480ecd3f6ad7bfd32c5729e724 100644 (file)
@@ -20,6 +20,9 @@ pages:
         - 'Troubleshooting' : 'admin/Faq.md'
     - 'Developers Guide':
         - 'Overview' : 'develop/overview.md'
+    - 'Data Sources':
+        - 'Overview' : 'data-sources/overview.md'
+        - 'US Census (Tiger)' : 'data-sources/US-Tiger.md'
     - 'Appendix':
         - 'Installation on CentOS 7' : 'appendix/Install-on-Centos-7.md'
         - 'Installation on Ubuntu 16' : 'appendix/Install-on-Ubuntu-16.md'
index 1973f0b94aea217d8bdd6a28e66e1dced07dd96b..8fbe552c6255a928758691d6ec8f0ca577394485 100644 (file)
@@ -10,7 +10,7 @@ Feature: Reverse geocoding
           | way      | place    | house |
         And result addresses contain
           | house_number | road            | postcode | country_code |
-          | 906          | West 1st Street | 57274    | us |
+          | 909          | West 1st Street | 57274    | us |
 
     @Tiger
     Scenario: No TIGER house number for zoom < 18
diff --git a/utils/imports.php b/utils/imports.php
deleted file mode 100755 (executable)
index 9d1085f..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-#!@PHP_BIN@ -Cq
-<?php
-
-require_once(dirname(dirname(__FILE__)).'/settings/settings.php');
-require_once(CONST_BasePath.'/lib/init-cmd.php');
-ini_set('memory_limit', '800M');
-
-$aCMDOptions
- = array(
-    'Create and setup nominatim search system',
-    array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
-    array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
-    array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
-
-    array('parse-tiger', '', 0, 1, 1, 1, 'realpath', 'Convert tiger edge files to nominatim sql import - datafiles from 2011 or later (source: edges directory of tiger data)'),
-   );
-getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
-
-
-if (isset($aCMDResult['parse-tiger'])) {
-    if (!file_exists(CONST_Tiger_Data_Path)) mkdir(CONST_Tiger_Data_Path);
-
-    $sTempDir = tempnam('/tmp', 'tiger');
-    unlink($sTempDir);
-    mkdir($sTempDir);
-
-    foreach (glob($aCMDResult['parse-tiger'].'/tl_20??_?????_edges.zip', 0) as $sImportFile) {
-        set_time_limit(30);
-        preg_match('#([0-9]{5})_(.*)#', basename($sImportFile), $aMatch);
-        $sCountyID = $aMatch[1];
-
-        echo 'Processing '.$sCountyID."...\n";
-        $sUnzipCmd = "unzip -d $sTempDir $sImportFile";
-        exec($sUnzipCmd);
-
-        $sShapeFilename = $sTempDir.'/'.basename($sImportFile, '.zip').'.shp';
-        $sSqlFilenameTmp = $sTempDir.'/'.$sCountyID.'.sql';
-        $sSqlFilename = CONST_Tiger_Data_Path.'/'.$sCountyID.'.sql';
-
-        if (!file_exists($sShapeFilename)) {
-            echo "Failed unzip ($sImportFile)\n";
-        } else {
-            $sParseCmd = CONST_BasePath.'/utils/tigerAddressImport.py '.$sShapeFilename.' '.$sSqlFilenameTmp;
-            exec($sParseCmd);
-            if (!file_exists($sSqlFilenameTmp)) {
-                echo "Failed parse ($sImportFile)\n";
-            } else {
-                copy($sSqlFilenameTmp, $sSqlFilename);
-            }
-        }
-        // Cleanup
-        foreach (glob($sTempDir.'/*') as $sTmpFile) {
-            unlink($sTmpFile);
-        }
-    }
-}