]> git.openstreetmap.org Git - nominatim.git/commitdiff
Gsoc2019 contributions for adding Wikidata to Nominatim (#1475)
authorTC Haddad <tchaddad@gmail.com>
Sun, 6 Oct 2019 07:56:39 +0000 (00:56 -0700)
committermtmail <mtmail@gmx.net>
Sun, 6 Oct 2019 07:56:39 +0000 (15:56 +0800)
Complete rewrite of wikipedia processing scripts, addition of processing wikidata, new data source, new documentation by @tchaddad during Google Summer of Code 2019 project.

data-sources/wikipedia-wikidata/README.md [new file with mode: 0644]
data-sources/wikipedia-wikidata/import_wikidata.sh [new file with mode: 0755]
data-sources/wikipedia-wikidata/import_wikipedia.sh [new file with mode: 0755]
data-sources/wikipedia-wikidata/mysql2pgsql.perl [moved from mysql2pgsql/mysql2pgsql.perl with 99% similarity]
data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv [new file with mode: 0644]
data-sources/wikipedia-wikidata/wikidata_place_types.txt [new file with mode: 0644]
data-sources/wikipedia-wikidata/wikidata_places.md [new file with mode: 0644]
docs/CMakeLists.txt
docs/mkdocs.yml
utils/import_wikipedia.sh [deleted file]

diff --git a/data-sources/wikipedia-wikidata/README.md b/data-sources/wikipedia-wikidata/README.md
new file mode 100644 (file)
index 0000000..7ac1974
--- /dev/null
@@ -0,0 +1,58 @@
+## Add Wikipedia and Wikidata to Nominatim
+
+OSM contributors frequently tag items with links to Wikipedia and Wikidata. Nominatim can use the page ranking of Wikipedia pages to help indicate the relative importance of osm features. This is done by calculating an importance score between 0 and 1 based on the number of inlinks to an article for a location. If two places have the same name and one is more important than the other, the wikipedia score often points to the correct place. 
+
+These scripts extract and prepare both Wikipedia page rank and Wikidata links for use in Nominatim.  
+
+#### Create a new postgres DB for Processing
+
+Due to the size of initial and intermediate tables, processing can be done in an external database:
+```
+CREATE DATABASE wikiprocessingdb;
+```
+---
+Wikipedia
+---  
+
+Processing these data requires a large amount of disk space (~1TB) and considerable time (>24 hours).
+
+#### Import & Process Wikipedia tables
+
+This step downloads and converts [Wikipedia](https://dumps.wikimedia.org/) page data SQL dumps to postgreSQL files which can be imported and processed with pagelink information from Wikipedia language sites to calculate importance scores.
+
+- The script will processes data from whatever set of Wikipedia languages are specified in the initial languages array
+
+- Note that processing the top 40 Wikipedia languages can take over a day, and will add nearly 1TB to the processing database. The final output tables will be approximately 11GB and 2GB in size
+
+To download, convert, and import the data, then process summary statistics and compute importance scores, run:
+```
+./wikipedia_import.sh
+```
+---
+Wikidata
+---
+
+This script downloads and processes Wikidata to enrich the previously created Wekipedia tables for use in Nominatim.
+
+#### Import & Process Wikidata
+
+This step downloads and converts [Wikidata](https://dumps.wikimedia.org/wikidatawiki/) page data SQL dumps to postgreSQL files which can be processed and imported into Nominatim database. Also utilizes Wikidata Query Service API to discover and include place types.
+
+- Script presumes that the user has already processed Wikipedia tables as specified above
+
+- Script requires wikidata_place_types.txt and wikidata_place_type_levles.csv
+
+- script requires the [jq json parser](https://stedolan.github.io/jq/)
+
+- Script processes data from whatever set of Wikipedia languages are specified in the initial languages array
+
+- Script queries Wikidata Query Service API and imports all instances of place types listed in wikidata_place_types.txt
+
+- Script updates wikipedia_articles table with extracted wikidata 
+
+By including Wikidata in the wikipedia_articles table, new connections can be made on the fly from the Nominatim placex table to wikipedia_article importance scores. 
+
+To download, convert, and import the data, then process required items, run:
+``` 
+./wikidata_import.sh
+```
diff --git a/data-sources/wikipedia-wikidata/import_wikidata.sh b/data-sources/wikipedia-wikidata/import_wikidata.sh
new file mode 100755 (executable)
index 0000000..4654616
--- /dev/null
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+psqlcmd() {
+     psql wikiprocessingdb
+}
+
+mysql2pgsqlcmd() {
+     ./mysql2pgsql.perl /dev/stdin /dev/stdout
+}
+
+
+# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+
+language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
+
+
+# get a few wikidata dump tables
+
+wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
+wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
+wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
+
+
+# import wikidata tables
+
+gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
+gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
+gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
+
+
+# get wikidata places from wikidata query API
+
+while read F  ; do
+    wget "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
+    jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
+    awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
+    cat $F.csv >> wikidata_place_dump.csv
+    rm $F.json $F.txt $F.csv
+done < wikidata_place_types.txt
+
+
+# import wikidata places
+
+echo "CREATE TABLE wikidata_place_dump (item text, instance_of text);"  | psqlcmd
+echo "COPY wikidata_place_dump (item, instance_of) FROM  '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' DELIMITER ',' CSV;"  | psqlcmd
+
+echo "CREATE TABLE wikidata_place_type_levels (place_type text, level integer);" | psqlcmd
+echo "COPY wikidata_place_type_levels (place_type, level) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' DELIMITER ',' CSV HEADER;" | psqlcmd
+
+
+# create derived tables
+
+echo "CREATE TABLE geo_earth_primary AS SELECT gt_page_id, gt_lat, gt_lon FROM geo_tags WHERE gt_globe = 'earth' AND gt_primary = 1 AND NOT( gt_lat < -90 OR gt_lat > 90 OR gt_lon < -180 OR gt_lon > 180 OR gt_lat=0 OR gt_lon=0) ;" | psqlcmd
+echo "CREATE TABLE geo_earth_wikidata AS SELECT DISTINCT geo_earth_primary.gt_page_id, geo_earth_primary.gt_lat, geo_earth_primary.gt_lon, page.page_title, page.page_namespace FROM geo_earth_primary LEFT OUTER JOIN page ON (geo_earth_primary.gt_page_id = page.page_id) ORDER BY geo_earth_primary.gt_page_id;" | psqlcmd
+
+echo "ALTER TABLE wikidata_place_dump ADD COLUMN ont_level integer, ADD COLUMN lat numeric(11,8), ADD COLUMN lon numeric(11,8);" | psqlcmd
+echo "UPDATE wikidata_place_dump SET ont_level = wikidata_place_type_levels.level FROM wikidata_place_type_levels WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type;" | psqlcmd
+
+echo "CREATE TABLE wikidata_places AS SELECT DISTINCT ON (item) item, instance_of, MAX(ont_level) AS ont_level, lat, lon FROM wikidata_place_dump GROUP BY item, instance_of, ont_level, lat, lon ORDER BY item;" | psqlcmd
+echo "UPDATE wikidata_places SET lat = geo_earth_wikidata.gt_lat, lon = geo_earth_wikidata.gt_lon FROM geo_earth_wikidata WHERE wikidata_places.item = geo_earth_wikidata.page_title" | psqlcmd
+
+
+# process language pages 
+
+echo "CREATE TABLE wikidata_pages (item text, instance_of text, lat numeric(11,8), lon numeric(11,8), ips_site_page text, language text );" | psqlcmd
+
+for i in "${language[@]}"
+do
+   echo "CREATE TABLE wikidata_${i}_pages as select wikidata_places.item, wikidata_places.instance_of, wikidata_places.lat, wikidata_places.lon, wb_items_per_site.ips_site_page FROM wikidata_places LEFT JOIN wb_items_per_site ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) WHERE ips_site_id = '${i}wiki' AND LEFT(wikidata_places.item,1) = 'Q' order by wikidata_places.item;" | psqlcmd
+   echo "ALTER TABLE wikidata_${i}_pages ADD COLUMN language text;" | psqlcmd
+   echo "UPDATE wikidata_${i}_pages SET language = '${i}';" | psqlcmd
+   echo "INSERT INTO wikidata_pages SELECT item, instance_of, lat, lon, ips_site_page, language FROM wikidata_${i}_pages;" | psqlcmd
+done
+
+echo "ALTER TABLE wikidata_pages ADD COLUMN wp_page_title text;" | psqlcmd
+echo "UPDATE wikidata_pages SET wp_page_title = REPLACE(ips_site_page, ' ', '_');" | psqlcmd
+echo "ALTER TABLE wikidata_pages DROP COLUMN ips_site_page;" | psqlcmd
+
+
+# add wikidata to wikipedia_article table
+
+echo "UPDATE wikipedia_article SET lat = wikidata_pages.lat, lon = wikidata_pages.lon, wd_page_title = wikidata_pages.item, instance_of = wikidata_pages.instance_of FROM wikidata_pages WHERE wikipedia_article.language = wikidata_pages.language AND wikipedia_article.title  = wikidata_pages.wp_page_title;" | psqlcmd
+echo "CREATE TABLE wikipedia_article_slim AS SELECT * FROM wikipedia_article WHERE wikidata_id IS NOT NULL;" | psqlcmd
+echo "ALTER TABLE wikipedia_article RENAME TO wikipedia_article_full;" | psqlcmd
+echo "ALTER TABLE wikipedia_article_slim RENAME TO wikipedia_article;" | psqlcmd
+
+
+# clean up intermediate tables
+
+echo "DROP TABLE wikidata_place_dump;" | psqlcmd
+echo "DROP TABLE geo_earth_primary;" | psqlcmd
+for i in "${language[@]}"
+do
+    echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
+done
diff --git a/data-sources/wikipedia-wikidata/import_wikipedia.sh b/data-sources/wikipedia-wikidata/import_wikipedia.sh
new file mode 100755 (executable)
index 0000000..8b8ba52
--- /dev/null
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+psqlcmd() {
+     psql wikiprocessingdb
+}
+
+mysql2pgsqlcmd() {
+     ./mysql2pgsql.perl /dev/stdin /dev/stdout
+}
+
+
+# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+
+language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
+
+
+# create wikipedia calculation tables
+
+echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);"  | psqlcmd
+echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd
+echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );"  | psqlcmd
+
+
+# download individual wikipedia language tables
+
+for i in "${language[@]}"
+do
+    wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
+    wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
+    wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
+    wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
+done
+
+
+# import individual wikipedia language tables
+
+for i in "${language[@]}"
+do
+    gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
+    gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
+    gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
+    gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
+done
+
+
+# process language tables and associated pagelink counts
+
+for i in "${language[@]}"
+do
+    echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
+    echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
+    echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd
+    echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd
+    echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd
+    for j in "${language[@]}"
+    do
+        echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd
+    done
+    echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd
+done
+
+
+# calculate importance score for each wikipedia page
+
+echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd
+
+
+# clean up intermediate tables to conserve space
+
+for i in "${language[@]}"
+do
+    echo "DROP TABLE ${i}pagelinks;" | psqlcmd
+    echo "DROP TABLE ${i}page;" | psqlcmd
+    echo "DROP TABLE ${i}langlinks;" | psqlcmd
+    echo "DROP TABLE ${i}redirect;" | psqlcmd
+    echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
+done
similarity index 99%
rename from mysql2pgsql/mysql2pgsql.perl
rename to data-sources/wikipedia-wikidata/mysql2pgsql.perl
index e21eff37a5a8f4d63d60f28c9fd06eede31a57fe..3297331fc27adf16ccc22dd57492fde31e2ffe21 100755 (executable)
@@ -341,7 +341,7 @@ if (/(create\s+table\s+)([-_\w]+)\s/i) { #  example: CREATE TABLE `english_engli
         #  in the foreign-key case it will only remove the foreign-key constraint, not the other table entirely.)
         #  (source: 8.1.3 docs, section "drop table")
         warn "table $table will be dropped CASCADE\n";
-        $pre_create_sql .= "DROP TABLE $table CASCADE\\g\n";    # custom dumps may be missing the 'dump' commands
+        $pre_create_sql .= "DROP TABLE $table CASCADE;\n";    # custom dumps may be missing the 'dump' commands
     }
 
     s/(create\s+table\s+)([-_\w]+)\s/$1 $table /i;
@@ -367,6 +367,7 @@ if ($create_sql ne "") {         # we are inside create table statement so lets
         s/INSERT METHOD[=\s+][^;\s]+//i;
         s/PASSWORD=[^;\s]+//i;
         s/ROW_FORMAT=(?:DEFAULT|DYNAMIC|FIXED|COMPRESSED|REDUNDANT|COMPACT)+//i;
+        s/KEY_BLOCK_SIZE=8//i;
         s/DELAY KEY WRITE=[^;\s]+//i;
         s/INDEX DIRECTORY[=\s+][^;\s]+//i;
         s/DATA DIRECTORY=[^;\s]+//i;
@@ -389,6 +390,7 @@ if ($create_sql ne "") {         # we are inside create table statement so lets
         s/DEFAULT CHARSET=[^;\s]+//i; #  my mysql version is 4.1.11
         s/ENGINE\s*=\s*[^;\s]+//i;   #  my mysql version is 4.1.11
         s/ROW_FORMAT=[^;\s]+//i;   #  my mysql version is 5.0.22
+        s/KEY_BLOCK_SIZE=8//i; 
         s/MIN_ROWS=[^;\s]+//i;
         s/MAX_ROWS=[^;\s]+//i;
         s/AVG_ROW_LENGTH=[^;\s]+//i;
diff --git a/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv b/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv
new file mode 100644 (file)
index 0000000..e414321
--- /dev/null
@@ -0,0 +1,199 @@
+place_type,level\r
+Q9842,4\r
+Q9430,3\r
+Q928830,4\r
+Q9259,1\r
+Q91028,5\r
+Q8514,2\r
+Q8502,2\r
+Q83405,3\r
+Q82794,2\r
+Q820477,1\r
+Q811979,1\r
+Q8072,2\r
+Q79007,2\r
+Q786014,3\r
+Q75848,2\r
+Q75520,2\r
+Q728937,4\r
+Q7275,2\r
+Q719456,3\r
+Q7075,3\r
+Q697295,4\r
+Q6852233,2\r
+Q682943,3\r
+Q665487,5\r
+Q655686,3\r
+Q643589,5\r
+Q641226,2\r
+Q631305,2\r
+Q6256,2\r
+Q6023295,2\r
+Q5773747,5\r
+Q56061,1\r
+Q55659167,4\r
+Q55488,4\r
+Q55465477,3\r
+Q54050,2\r
+Q532,3\r
+Q53060,2\r
+Q52177058,4\r
+Q515716,5\r
+Q5153984,4\r
+Q515,3\r
+Q5144960,5\r
+Q5119,4\r
+Q5119,4\r
+Q5107,2\r
+Q5084,4\r
+Q5031071,4\r
+Q5003624,2\r
+Q4989906,1\r
+Q4976993,3\r
+Q486972,1\r
+Q486972,2\r
+Q483110,3\r
+Q4830453,4\r
+Q47521,3\r
+Q473972,1\r
+Q46831,2\r
+Q46614560,5\r
+Q44782,3\r
+Q44613,4\r
+Q44539,4\r
+Q44494,2\r
+Q44377,2\r
+Q4421,2\r
+Q43501,2\r
+Q4286337,3\r
+Q42523,3\r
+Q41176,2\r
+Q40357,3\r
+Q4022,4\r
+Q40080,2\r
+Q39816,2\r
+Q39715,3\r
+Q39614,1\r
+Q3957,3\r
+Q3947,4\r
+Q3914,3\r
+Q38723,2\r
+Q38720,3\r
+Q3623867,5\r
+Q35666,2\r
+Q355304,3\r
+Q35509,2\r
+Q35112127,3\r
+Q34985575,4\r
+Q34876,5\r
+Q34763,2\r
+Q34627,4\r
+Q3455524,3\r
+Q34442,4\r
+Q33837,2\r
+Q33506,3\r
+Q32815,4\r
+Q3257686,2\r
+Q3240715,2\r
+Q3191695,5\r
+Q3153117,2\r
+Q30198,2\r
+Q30139652,3\r
+Q294422,3\r
+Q2870166,3\r
+Q27686,3\r
+Q274153,3\r
+Q271669,1\r
+Q2659904,2\r
+Q24529780,2\r
+Q24354,3\r
+Q2354973,4\r
+Q23442,2\r
+Q23413,3\r
+Q23397,3\r
+Q2327515,4\r
+Q2311958,5\r
+Q22927291,6\r
+Q22698,1\r
+Q2175765,4\r
+Q205495,4\r
+Q204832,3\r
+Q2042028,2\r
+Q202216,6\r
+Q1970725,3\r
+Q194203,5\r
+Q194195,2\r
+Q190429,2\r
+Q185187,3\r
+Q185113,2\r
+Q183366,2\r
+Q1799794,1\r
+Q1788454,4\r
+Q1785071,3\r
+Q1777138,3\r
+Q177634,2\r
+Q177380,2\r
+Q174814,4\r
+Q174782,2\r
+Q17350442,2\r
+Q17343829,3\r
+Q17334923,0\r
+Q17018380,3\r
+Q16970,4\r
+Q16917,3\r
+Q16831714,4\r
+Q165,3\r
+Q160742,4\r
+Q159719,3\r
+Q159334,4\r
+Q15640612,5\r
+Q15324,2\r
+Q15284,5\r
+Q15243209,6\r
+Q152081,1\r
+Q15195406,4\r
+Q1500350,5\r
+Q149621,5\r
+Q14757767,4\r
+Q14350,3\r
+Q1410668,3\r
+Q1394476,3\r
+Q1377575,2\r
+Q1353183,3\r
+Q134447,4\r
+Q133215,3\r
+Q133056,2\r
+Q13221722,3\r
+Q13220204,2\r
+Q1311958,4\r
+Q1303167,3\r
+Q130003,3\r
+Q12518,2\r
+Q12516,3\r
+Q1248784,3\r
+Q123705,3\r
+Q12323,3\r
+Q12284,4\r
+Q12280,4\r
+Q121359,2\r
+Q1210950,2\r
+Q11755880,3\r
+Q11707,3\r
+Q11315,3\r
+Q11303,3\r
+Q1115575,4\r
+Q1107656,1\r
+Q10864048,1\r
+Q1076486,2\r
+Q105731,3\r
+Q105190,3\r
+Q1048525,3\r
+Q102496,5\r
+Q28872924,1\r
+Q15617994,1\r
+Q159313,2\r
+Q24398318,3\r
+Q327333,2\r
+Q43229,1\r
+Q860861,1\r
+Q4989906,1\r
diff --git a/data-sources/wikipedia-wikidata/wikidata_place_types.txt b/data-sources/wikipedia-wikidata/wikidata_place_types.txt
new file mode 100644 (file)
index 0000000..5578fd3
--- /dev/null
@@ -0,0 +1,195 @@
+Q9842
+Q9430
+Q928830
+Q9259
+Q91028
+Q8514
+Q8502
+Q83405
+Q82794
+Q820477
+Q811979
+Q8072
+Q79007
+Q786014
+Q75848
+Q75520
+Q728937
+Q7275
+Q719456
+Q7075
+Q697295
+Q6852233
+Q682943
+Q665487
+Q655686
+Q643589
+Q641226
+Q631305
+Q6256
+Q6023295
+Q5773747
+Q56061
+Q55659167
+Q55488
+Q55465477
+Q54050
+Q532
+Q53060
+Q52177058
+Q515716
+Q5153984
+Q515
+Q5144960
+Q5119
+Q5107
+Q5084
+Q5031071
+Q5003624
+Q4989906
+Q4976993
+Q486972
+Q483110
+Q4830453
+Q47521
+Q473972
+Q46831
+Q46614560
+Q44782
+Q44613
+Q44539
+Q44494
+Q44377
+Q4421
+Q43501
+Q4286337
+Q42523
+Q41176
+Q40357
+Q4022
+Q40080
+Q39816
+Q39715
+Q39614
+Q3957
+Q3947
+Q3914
+Q38723
+Q38720
+Q3623867
+Q35666
+Q355304
+Q35509
+Q35112127
+Q34985575
+Q34876
+Q34763
+Q34627
+Q3455524
+Q34442
+Q33837
+Q33506
+Q32815
+Q3257686
+Q3240715
+Q3191695
+Q3153117
+Q30198
+Q30139652
+Q294422
+Q2870166
+Q27686
+Q274153
+Q271669
+Q2659904
+Q24529780
+Q24354
+Q2354973
+Q23442
+Q23413
+Q23397
+Q2327515
+Q2311958
+Q22927291
+Q22698
+Q2175765
+Q205495
+Q204832
+Q2042028
+Q202216
+Q1970725
+Q194203
+Q194195
+Q190429
+Q185187
+Q185113
+Q183366
+Q1799794
+Q1788454
+Q1785071
+Q1777138
+Q177634
+Q177380
+Q174814
+Q174782
+Q17350442
+Q17343829
+Q17334923
+Q17018380
+Q16970
+Q16917
+Q16831714
+Q165
+Q160742
+Q159719
+Q159334
+Q15640612
+Q15324
+Q15284
+Q15243209
+Q152081
+Q15195406
+Q1500350
+Q149621
+Q14757767
+Q14350
+Q1410668
+Q1394476
+Q1377575
+Q1353183
+Q134447
+Q133215
+Q133056
+Q13221722
+Q13220204
+Q1311958
+Q1303167
+Q130003
+Q12518
+Q12516
+Q1248784
+Q123705
+Q12323
+Q12284
+Q12280
+Q121359
+Q1210950
+Q11755880
+Q11707
+Q11315
+Q11303
+Q1115575
+Q1107656
+Q10864048
+Q1076486
+Q105731
+Q105190
+Q1048525
+Q102496
+Q28872924
+Q15617994
+Q159313
+Q24398318
+Q327333
+Q43229
+Q860861
diff --git a/data-sources/wikipedia-wikidata/wikidata_places.md b/data-sources/wikipedia-wikidata/wikidata_places.md
new file mode 100644 (file)
index 0000000..bc153ff
--- /dev/null
@@ -0,0 +1,200 @@
+
+## Wikidata place types and related OSM Tags
+
+Wikidata does not have any official ontologies, however the [DBpedia project](https://wiki.dbpedia.org/) has created an [ontology](https://wiki.dbpedia.org/services-resources/ontology) that covered [place types](http://mappings.dbpedia.org/server/ontology/classes/#Place). The table below used the DBpedia place ontology as a starting point, and is provided as a cross-reference to the relevant OSM tags.
+
+The Wikidata place types listed in the table below can be used in conjunction with the [Wikidata Query Service](https://query.wikidata.org/) to retrieve instances of those place types from the Wikidata knowledgebase. 
+
+```
+SELECT ?item ?lat ?lon
+WHERE {
+  ?item wdt:P31*/wdt:P279*wd:Q9430; wdt:P625 ?pt.
+  ?item p:P625?loc.
+  ?loc psv:P625?cnode.
+  ?cnode wikibase:geoLatitude?lat.
+  ?cnode wikibase:geoLongitude?lon.
+}
+```
+
+An example json return for all instances of the Wikidata item "Q9430" (Ocean) can be seen at [json](https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT?item?lat?lon%20WHERE{?item%20wdt:P31*/wdt:P279*wd:Q9430;wdt:P625?pt.?item%20p:P625?loc.?loc%20psv:P625?cnode.?cnode%20wikibase:geoLatitude?lat.?cnode%20wikibase:geoLongitude?lon.})
+
+**NOTE** the OSM tags listed are those listed in the wikidata entries, and not all the possible matches for tags within OSM.
+
+
+   title   |             concept                   |       OSM Tag     | 
+-----------|---------------------------------------|------------------|
+[Q17334923](https://www.wikidata.org/entity/Q17334923)  | Location | | 
+[Q811979](https://www.wikidata.org/entity/Q811979)           | Architectural Structure | | 
+[Q194195](https://www.wikidata.org/entity/Q194195)   | Amusement park | 
+[Q204832](https://www.wikidata.org/entity/Q204832)   | Roller coaster | [attraction=roller_coaster](https://wiki.openstreetmap.org/wiki/Tag:attraction=roller_coaster) | 
+[Q2870166](https://www.wikidata.org/entity/Q2870166)   | Water ride | |
+[Q641226](https://www.wikidata.org/entity/Q641226)    | Arena | [amenity=events_centre](https://wiki.openstreetmap.org/wiki/Tag:amenity=events_centre) | 
+[Q41176](https://www.wikidata.org/entity/Q41176)     | Building | [building=yes](https://wiki.openstreetmap.org/wiki/Key:building) |
+[Q1303167](https://www.wikidata.org/entity/Q1303167)   | Barn | [building=barn](https://wiki.openstreetmap.org/wiki/Tag:building=barn) |
+[Q655686](https://www.wikidata.org/entity/Q655686)   | Commercial building | [building=commercial](https://wiki.openstreetmap.org/wiki/Tag:building=commercial) | 
+[Q4830453](https://www.wikidata.org/entity/Q4830453)   | Business | |
+[Q7075](https://www.wikidata.org/entity/Q7075)     | Library | [amenity=library](https://wiki.openstreetmap.org/wiki/Tag:amenity=library) |
+[Q133215](https://www.wikidata.org/entity/Q133215)   | Casino | [amenity=casino](https://wiki.openstreetmap.org/wiki/Tag:amenity=casino) | 
+[Q23413](https://www.wikidata.org/entity/Q23413)     | Castle | [historic=castle](https://wiki.openstreetmap.org/wiki/Tag:historic=castle) |
+[Q83405](https://www.wikidata.org/entity/Q83405)     | Factory | | 
+[Q53060](https://www.wikidata.org/entity/Q53060)     | Gate  | [barrier=gate](https://wiki.openstreetmap.org/wiki/Tag:barrier=gate) |cnode%20wikibase:geoLatitude?lat.?cnode%20wikibase:geoLongitude?lon.})
+[Q11755880](https://www.wikidata.org/entity/Q11755880)           | Residential Building  | [building=residential](https://wiki.openstreetmap.org/wiki/Tag:building=residential) | 
+[Q3947](https://www.wikidata.org/entity/Q3947)      | House  | [building=house](https://wiki.openstreetmap.org/wiki/Tag:building=house) |
+[Q35112127](https://www.wikidata.org/entity/Q35112127)           | Historic Building  | |
+[Q5773747](https://www.wikidata.org/entity/Q5773747)   | Historic house  | | 
+[Q38723](https://www.wikidata.org/entity/Q38723)           | Higher Education Institution  | 
+[Q3914](https://www.wikidata.org/entity/Q3914)      | School  | [amenity=school](https://wiki.openstreetmap.org/wiki/Tag:amenity=school) | 
+[Q9842](https://www.wikidata.org/entity/Q9842)      | Primary school  | | 
+[Q159334](https://www.wikidata.org/entity/Q159334)    | Secondary school  | | 
+[Q16917](https://www.wikidata.org/entity/Q16917)     | Hospital  | [amenity=hospital](https://wiki.openstreetmap.org/wiki/Tag:amenity=hospital), [healthcare=hospital](https://wiki.openstreetmap.org/wiki/Tag:healthcare=hospital), [building=hospital](https://wiki.openstreetmap.org/wiki/Tag:building=hospital) |
+[Q27686](https://www.wikidata.org/entity/Q27686)     | Hotel  | [tourism=hotel](https://wiki.openstreetmap.org/wiki/Tag:tourism=hotel), [building=hotel](https://wiki.openstreetmap.org/wiki/Tag:building=hotel) |
+[Q33506](https://www.wikidata.org/entity/Q33506)     | Museum  | [tourism=museum](https://wiki.openstreetmap.org/wiki/Tag:tourism=museum) |
+[Q40357](https://www.wikidata.org/entity/Q40357)     | Prison  | [amenity=prison](https://wiki.openstreetmap.org/wiki/Tag:amenity=prison) |
+[Q24398318](https://www.wikidata.org/entity/Q24398318)           | Religious Building  | |
+[Q160742](https://www.wikidata.org/entity/Q160742)    | Abbey  | |
+[Q16970](https://www.wikidata.org/entity/Q16970)     | Church (building)  | [building=church](https://wiki.openstreetmap.org/wiki/Tag:building=church) |
+[Q44613](https://www.wikidata.org/entity/Q44613)     | Monastery  | [amenity=monastery](https://wiki.openstreetmap.org/wiki/Tag:amenity=monastery) | 
+[Q32815](https://www.wikidata.org/entity/Q32815)     | Mosque  | [building=mosque](https://wiki.openstreetmap.org/wiki/Tag:building=mosque) | 
+[Q697295](https://www.wikidata.org/entity/Q697295)    | Shrine  | [building=shrine](https://wiki.openstreetmap.org/wiki/Tag:building=shrine) |
+[Q34627](https://www.wikidata.org/entity/Q34627)     | Synagogue  | [building=synagogue](https://wiki.openstreetmap.org/wiki/Tag:building=synagogue) |
+[Q44539](https://www.wikidata.org/entity/Q44539)     | Temple  | [building=temple](https://wiki.openstreetmap.org/wiki/Tag:building=temple) | 
+[Q11707](https://www.wikidata.org/entity/Q11707)     | Restaurant  | [amenity=restaurant](https://wiki.openstreetmap.org/wiki/Tag:amenity=restaurant) |
+[Q11315](https://www.wikidata.org/entity/Q11315)     | Shopping mall  | [shop=mall](https://wiki.openstreetmap.org/wiki/Tag:shop=mall), [shop=shopping_centre](https://wiki.openstreetmap.org/wiki/Tag:shop=shopping_centre) | 
+[Q11303](https://www.wikidata.org/entity/Q11303)     | Skyscraper  | |
+[Q17350442](https://www.wikidata.org/entity/Q17350442)           | Venue  | |
+[Q41253](https://www.wikidata.org/entity/Q41253)           | Movie Theater  | [amenity=cinema](https://wiki.openstreetmap.org/wiki/Tag:amenity=cinema) | 
+[Q483110](https://www.wikidata.org/entity/Q483110)    | Stadium  | [leisure=stadium](https://wiki.openstreetmap.org/wiki/Tag:leisure=stadium), [building=stadium](https://wiki.openstreetmap.org/wiki/Tag:building=stadium) |
+[Q24354](https://www.wikidata.org/entity/Q24354)     | Theater (structure)  | [amenity=theatre](https://wiki.openstreetmap.org/wiki/Tag:amenity=theatre) |
+[Q121359](https://www.wikidata.org/entity/Q121359)    | Infrastructure  | |
+[Q1248784](https://www.wikidata.org/entity/Q1248784)   | Airport  | |
+[Q12323](https://www.wikidata.org/entity/Q12323)     | Dam  | [waterway=dam](https://wiki.openstreetmap.org/wiki/Tag:waterway=dam) |
+[Q1353183](https://www.wikidata.org/entity/Q1353183)   | Launch pad  | | 
+[Q105190](https://www.wikidata.org/entity/Q105190)   | Levee  | [man_made=dyke](https://wiki.openstreetmap.org/wiki/Tag:man_made=dyke) |
+[Q105731](https://www.wikidata.org/entity/Q105731)    | Lock (water navigation)   | [lock=yes](https://wiki.openstreetmap.org/wiki/Key:lock) |
+[Q44782](https://www.wikidata.org/entity/Q44782)     | Port  | |
+[Q159719](https://www.wikidata.org/entity/Q159719)    | Power station  | [power=plant](https://wiki.openstreetmap.org/wiki/Tag:power=plant) |
+[Q174814](https://www.wikidata.org/entity/Q174814)    | Electrical substation   |  |
+[Q134447](https://www.wikidata.org/entity/Q134447)    | Nuclear power plant  | [plant:source=nuclear](https://wiki.openstreetmap.org/wiki/Tag:plant:source=nuclear) |
+[Q786014](https://www.wikidata.org/entity/Q786014)   | Rest area  | [highway=rest_area](https://wiki.openstreetmap.org/wiki/Tag:highway=rest_area), [highway=services](https://wiki.openstreetmap.org/wiki/Tag:highway=services) |
+[Q12280](https://www.wikidata.org/entity/Q12280)     | Bridge  | [bridge=* ](https://wiki.openstreetmap.org/wiki/Key:bridge), [man_made=bridge](https://wiki.openstreetmap.org/wiki/Tag:man_made=bridge) |
+[Q728937](https://www.wikidata.org/entity/Q728937)           | Railroad Line  | [railway=rail](https://wiki.openstreetmap.org/wiki/Tag:railway=rail) | 
+[Q1311958](https://www.wikidata.org/entity/Q1311958)           | Railway Tunnel  | | 
+[Q34442](https://www.wikidata.org/entity/Q34442)     | Road  | [highway=* ](https://wiki.openstreetmap.org/wiki/Key:highway), [route=road](https://wiki.openstreetmap.org/wiki/Tag:route=road) |
+[Q1788454](https://www.wikidata.org/entity/Q1788454)   | Road junction  |  | 
+[Q44377](https://www.wikidata.org/entity/Q44377)     | Tunnel  | [tunnel=* ](https://wiki.openstreetmap.org/wiki/Key:tunnel) |
+[Q5031071](https://www.wikidata.org/entity/Q5031071)  | Canal tunnel  | |
+[Q719456](https://www.wikidata.org/entity/Q719456)    | Station  | [public_transport=station](https://wiki.openstreetmap.org/wiki/Tag:public_transport=station) |
+[Q205495](https://www.wikidata.org/entity/Q205495)    | Filling station  | [amenity=fuel](https://wiki.openstreetmap.org/wiki/Tag:amenity=fuel) |
+[Q928830](https://www.wikidata.org/entity/Q928830)    | Metro station  | [station=subway](https://wiki.openstreetmap.org/wiki/Tag:station=subway) |
+[Q55488](https://www.wikidata.org/entity/Q55488)     | Train station  | [railway=station](https://wiki.openstreetmap.org/wiki/Tag:railway=station) |
+[Q2175765](https://www.wikidata.org/entity/Q2175765)   | Tram stop  | [railway=tram_stop](https://wiki.openstreetmap.org/wiki/Tag:railway=tram_stop), [public_transport=stop_position](https://wiki.openstreetmap.org/wiki/Tag:public_transport=stop_position) |
+[Q6852233](https://www.wikidata.org/entity/Q6852233)   | Military building  | |
+[Q44494](https://www.wikidata.org/entity/Q44494)     | Mill (grinding)  | |
+[Q185187](https://www.wikidata.org/entity/Q185187)    | Watermill  | [man_made=watermill](https://wiki.openstreetmap.org/wiki/Tag:man_made=watermill) |
+[Q38720](https://www.wikidata.org/entity/Q38720)     | Windmill  | [man_made=windmill](https://wiki.openstreetmap.org/wiki/Tag:man_made=windmill) | 
+[Q4989906](https://www.wikidata.org/entity/Q4989906)   | Monument  | [historic=monument](https://wiki.openstreetmap.org/wiki/Tag:historic=monument) |
+[Q5003624](https://www.wikidata.org/entity/Q5003624)   | Memorial  | [historic=memorial](https://wiki.openstreetmap.org/wiki/Tag:historic=memorial) |
+[Q271669](https://www.wikidata.org/entity/Q271669)   | Landform  | |
+[Q190429](https://www.wikidata.org/entity/Q190429)    | Depression (geology)  | |
+[Q17018380](https://www.wikidata.org/entity/Q17018380)  | Bight (geography)  | | 
+[Q54050](https://www.wikidata.org/entity/Q54050)     | Hill  | |
+[Q1210950](https://www.wikidata.org/entity/Q1210950)   | Channel (geography)  | |
+[Q23442](https://www.wikidata.org/entity/Q23442)    | Island  | [place=island](https://wiki.openstreetmap.org/wiki/Tag:place=island) | 
+[Q42523](https://www.wikidata.org/entity/Q42523)    | Atoll  | |
+[Q34763](https://www.wikidata.org/entity/Q34763)    | Peninsula  | | 
+[Q355304](https://www.wikidata.org/entity/Q355304)   | Watercourse  | |
+[Q30198](https://www.wikidata.org/entity/Q30198)    | Marsh  | [wetland=marsh](https://wiki.openstreetmap.org/wiki/Tag:wetland=marsh) |
+[Q75520](https://www.wikidata.org/entity/Q75520)    | Plateau  | |
+[Q2042028](https://www.wikidata.org/entity/Q2042028)  | Ravine  | |
+[Q631305](https://www.wikidata.org/entity/Q631305)   | Rock formation  | | 
+[Q12516](https://www.wikidata.org/entity/Q12516)    | Pyramid  | |
+[Q1076486](https://www.wikidata.org/entity/Q1076486) | Sports venue  |  |
+[Q682943](https://www.wikidata.org/entity/Q682943)   | Cricket field  | [sport=cricket](https://wiki.openstreetmap.org/wiki/Tag:sport=cricket) | 
+[Q1048525](https://www.wikidata.org/entity/Q1048525)  | Golf course  | [leisure=golf_course](https://wiki.openstreetmap.org/wiki/Tag:leisure=golf_course) |
+[Q1777138](https://www.wikidata.org/entity/Q1777138)  | Race track  | [highway=raceway](https://wiki.openstreetmap.org/wiki/Tag:highway=raceway) | 
+[Q130003](https://www.wikidata.org/entity/Q130003)   | Ski resort  | |
+[Q174782](https://www.wikidata.org/entity/Q174782)   | Town square  | [place=square](https://wiki.openstreetmap.org/wiki/Tag:place=square) |
+[Q12518](https://www.wikidata.org/entity/Q12518)    | Tower  | [building=tower](https://wiki.openstreetmap.org/wiki/Tag:building=tower), [man_made=tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=tower) |
+[Q39715](https://www.wikidata.org/entity/Q39715)    | Lighthouse  | [man_made=lighthouse](https://wiki.openstreetmap.org/wiki/Tag:man_made=lighthouse) |
+[Q274153](https://www.wikidata.org/entity/Q274153)   | Water tower | [building=water_tower](https://wiki.openstreetmap.org/wiki/Tag:building=water_tower), [man_made=water_tower](https://wiki.openstreetmap.org/wiki/Tag:man_made=water_tower) |
+[Q43501](https://www.wikidata.org/entity/Q43501)    | Zoo  | [tourism=zoo](https://wiki.openstreetmap.org/wiki/Tag:tourism=zoo) | 
+[Q39614](https://www.wikidata.org/entity/Q39614)    | Cemetery  | [amenity=grave_yard](https://wiki.openstreetmap.org/wiki/Tag:amenity=grave_yard), [landuse=cemetery](https://wiki.openstreetmap.org/wiki/Tag:landuse=cemetery) |
+[Q152081](https://www.wikidata.org/entity/Q152081)   | Concentration camp  | |
+[Q1107656](https://www.wikidata.org/entity/Q1107656)  | Garden  | [leisure=garden](https://wiki.openstreetmap.org/wiki/Tag:leisure=garden) |
+[Q820477](https://www.wikidata.org/entity/Q820477)   | Mine |  | 
+[Q33837](https://www.wikidata.org/entity/Q33837) | Archipelago  | [place=archipelago](https://wiki.openstreetmap.org/wiki/Tag:place=archipelago) | 
+[Q40080](https://www.wikidata.org/entity/Q40080)    | Beach  | [natural=beach](https://wiki.openstreetmap.org/wiki/Tag:natural=beach) |
+[Q15324](https://www.wikidata.org/entity/Q15324)    | Body of water | [natural=water](https://wiki.openstreetmap.org/wiki/Tag:natural=water) | 
+[Q23397](https://www.wikidata.org/entity/Q23397)    | Lake  | [water=lake](https://wiki.openstreetmap.org/wiki/Tag:water=lake) | 
+[Q9430](https://www.wikidata.org/entity/Q9430)     | Ocean  | |
+[Q165](https://www.wikidata.org/entity/Q165)    | Sea  | |
+[Q47521](https://www.wikidata.org/entity/Q47521)    | Stream  | | 
+[Q12284](https://www.wikidata.org/entity/Q12284)    | Canal  | [waterway=canal](https://wiki.openstreetmap.org/wiki/Tag:waterway=canal) |
+[Q4022](https://www.wikidata.org/entity/Q4022)     | River  | [waterway=river](https://wiki.openstreetmap.org/wiki/Tag:waterway=river), [type=waterway](https://wiki.openstreetmap.org/wiki/Relation:waterway) |
+[Q185113](https://www.wikidata.org/entity/Q185113)   | Cape | [natural=cape](https://wiki.openstreetmap.org/wiki/Tag:natural=cape) | 
+[Q35509](https://www.wikidata.org/entity/Q35509)    | Cave  | [natural=cave_entrance](https://wiki.openstreetmap.org/wiki/Tag:natural=cave_entrance) | 
+[Q8514](https://www.wikidata.org/entity/Q8514)     | Desert  | | 
+[Q4421](https://www.wikidata.org/entity/Q4421)     | Forest  | [natural=wood](https://wiki.openstreetmap.org/wiki/Tag:natural=wood) |
+[Q35666](https://www.wikidata.org/entity/Q35666)    | Glacier  | [natural=glacier](https://wiki.openstreetmap.org/wiki/Tag:natural=glacier) |
+[Q177380](https://www.wikidata.org/entity/Q177380)   | Hot spring | | 
+[Q8502](https://www.wikidata.org/entity/Q8502)     | Mountain  | [natural=peak](https://wiki.openstreetmap.org/wiki/Tag:natural=peak) | 
+[Q133056](https://www.wikidata.org/entity/Q133056)   | Mountain pass  | | 
+[Q46831](https://www.wikidata.org/entity/Q46831)    | Mountain range  | |
+[Q39816](https://www.wikidata.org/entity/Q39816)    | Valley  | [natural=valley](https://wiki.openstreetmap.org/wiki/Tag:natural=valley) |
+[Q8072](https://www.wikidata.org/entity/Q8072)     | Volcano  | [natural=volcano](https://wiki.openstreetmap.org/wiki/Tag:natural=volcano) |
+[Q43229](https://www.wikidata.org/entity/Q43229)    | Organization  |  | 
+[Q327333](https://www.wikidata.org/entity/Q327333)   | Government agency  | [office=government](https://wiki.openstreetmap.org/wiki/Tag:office=government)|
+[Q22698](https://www.wikidata.org/entity/Q22698)    | Park | [leisure=park](https://wiki.openstreetmap.org/wiki/Tag:leisure=park) | 
+[Q159313](https://www.wikidata.org/entity/Q159313)   | Urban agglomeration | |
+[Q177634](https://www.wikidata.org/entity/Q177634)   | Community  | |
+[Q5107](https://www.wikidata.org/entity/Q5107)    | Continent | [place=continent](https://wiki.openstreetmap.org/wiki/Tag:place=continent) |
+[Q6256](https://www.wikidata.org/entity/Q6256)     | Country  | [place=country](https://wiki.openstreetmap.org/wiki/Tag:place=country) | 
+[Q75848](https://www.wikidata.org/entity/Q75848)    | Gated community | | 
+[Q3153117](https://www.wikidata.org/entity/Q3153117) | Intercommunality  | |
+[Q82794](https://www.wikidata.org/entity/Q82794)    | Region  | | 
+[Q56061](https://www.wikidata.org/entity/Q56061)    | Administrative division  | [boundary=administrative](https://wiki.openstreetmap.org/wiki/Tag:boundary=administrative)  | 
+[Q665487](https://www.wikidata.org/entity/Q665487)   | Diocese | | 
+[Q4976993](https://www.wikidata.org/entity/Q4976993)  | Parish | [boundary=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:boundary=civil_parish) |
+[Q194203](https://www.wikidata.org/entity/Q194203)   | Arrondissements of France  | |
+[Q91028](https://www.wikidata.org/entity/Q91028)    | Arrondissements of Belgium  | | 
+[Q3623867](https://www.wikidata.org/entity/Q3623867)  | Arrondissements of Benin  | | 
+[Q2311958](https://www.wikidata.org/entity/Q2311958)  | Canton (country subdivision) | [political_division=canton](https://wiki.openstreetmap.org/wiki/FR:Cantons_in_France) |
+[Q643589](https://www.wikidata.org/entity/Q643589)   | Department |  | 
+[Q202216](https://www.wikidata.org/entity/Q202216)   | Overseas department and region  | |
+[Q149621](https://www.wikidata.org/entity/Q149621)   | District  | [place=district](https://wiki.openstreetmap.org/wiki/Tag:place=district) |
+[Q15243209](https://www.wikidata.org/wiki/Q15243209) | Historic district  | |
+[Q5144960](https://www.wikidata.org/entity/Q5144960)  | Microregion  | | 
+[Q15284](https://www.wikidata.org/entity/Q15284)    | Municipality  | |
+[Q515716](https://www.wikidata.org/entity/Q515716)   | Prefecture  | |
+[Q34876](https://www.wikidata.org/entity/Q34876)    | Province  | |
+[Q3191695](https://www.wikidata.org/entity/Q3191695)  | Regency (Indonesia)  | |
+[Q1970725](https://www.wikidata.org/entity/Q1970725)  | Natural region  | |
+[Q486972](https://www.wikidata.org/entity/Q486972)   | Human settlement  | | 
+[Q515](https://www.wikidata.org/entity/Q515)      | City  | [place=city](https://wiki.openstreetmap.org/wiki/Tag:place=city) |
+[Q5119](https://www.wikidata.org/entity/Q5119)     | Capital city | [capital=yes](https://wiki.openstreetmap.org/wiki/Key:capital) |
+[Q4286337](https://www.wikidata.org/entity/Q4286337)  | City district  | | 
+[Q1394476](https://www.wikidata.org/entity/Q1394476)  | Civil township  | | 
+[Q1115575](https://www.wikidata.org/entity/Q1115575)  | Civil parish  | [designation=civil_parish](https://wiki.openstreetmap.org/wiki/Tag:designation=civil_parish) |
+[Q5153984](https://www.wikidata.org/entity/Q5153984)  | Commune-level subdivisions  | |
+[Q123705](https://www.wikidata.org/entity/Q123705)   | Neighbourhood  | [place=neighbourhood](https://wiki.openstreetmap.org/wiki/Tag:place=neighbourhood) |
+[Q1500350](https://www.wikidata.org/entity/Q1500350)  | Townships of China  | |
+[Q17343829](https://www.wikidata.org/entity/Q17343829)           | Unincorporated Community  | |
+[Q3957](https://www.wikidata.org/entity/Q3957)     | Town  | [place=town](https://wiki.openstreetmap.org/wiki/Tag:place=town) | 
+[Q532](https://www.wikidata.org/entity/Q532)      | Village  | [place=village](https://wiki.openstreetmap.org/wiki/Tag:place=village) |
+[Q5084](https://www.wikidata.org/entity/Q5084)     | Hamlet   | [place=hamlet](https://wiki.openstreetmap.org/wiki/Tag:place=hamlet) | 
+[Q7275](https://www.wikidata.org/entity/Q7275)     | State  | | 
+[Q79007](https://www.wikidata.org/entity/Q79007)    | Street  | |
+[Q473972](https://www.wikidata.org/entity/Q473972)   | Protected area  | [boundary=protected_area](https://wiki.openstreetmap.org/wiki/Tag:boundary=protected_area) |
+[Q1377575](https://www.wikidata.org/entity/Q1377575)  | Wildlife refuge  | | 
+[Q1410668](https://www.wikidata.org/entity/Q1410668)  | National Wildlife Refuge  | [protection_title=National Wildlife Refuge](ownership=national), [ownership=national](https://wiki.openstreetmap.org/wiki/Tag:ownership=national)|
+[Q9259](https://www.wikidata.org/entity/Q9259)     | World Heritage Site  | |
+
+---
+
+### Future Work
+
+The Wikidata improvements to Nominatim can be further enhanced by:
+
+- continuing to add new Wikidata links to OSM objects
+- increasing the number of place types accounted for in the wikipedia_articles table
+- working to use place types in the wikipedia_article matching process
index c54ea083f1492dfa7ca3c47049c99ea42939797f..fb35cc1d5e75b0182058cdfb5876bfba8521ec48 100644 (file)
@@ -18,6 +18,7 @@ ADD_CUSTOM_TARGET(doc
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/gb-postcodes/README.md ${CMAKE_CURRENT_BINARY_DIR}/data-sources/GB-Postcodes.md
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/country-grid/README.md ${CMAKE_CURRENT_BINARY_DIR}/data-sources/Country-Grid.md
    COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/country-grid/mexico.quad.png ${CMAKE_CURRENT_BINARY_DIR}/data-sources/mexico.quad.png
+   COMMAND ${CMAKE_COMMAND} -E create_symlink ${PROJECT_SOURCE_DIR}/data-sources/wikipedia-wikidata/README.md  ${CMAKE_CURRENT_BINARY_DIR}/data-sources/Wikipedia-Wikidata.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Centos-7.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Centos-7.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-16.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-16.md
    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/bash2md.sh ${PROJECT_SOURCE_DIR}/vagrant/Install-on-Ubuntu-18.sh ${CMAKE_CURRENT_BINARY_DIR}/appendix/Install-on-Ubuntu-18.md
index 49cd6e72da8a6fd64cdf27ddd3103f7e70eab58e..d87fda7a7e9d42c4b227d0cfdb67c44b6b646dc3 100644 (file)
@@ -28,6 +28,7 @@ pages:
         - 'US Census (Tiger)': 'data-sources/US-Tiger.md'
         - 'GB Postcodes': 'data-sources/GB-Postcodes.md'
         - 'Country Grid': 'data-sources/Country-Grid.md'
+        - 'Wikipedia & Wikidata': 'data-sources/Wikipedia-Wikidata.md'
     - 'Appendix':
         - 'Installation on CentOS 7' : 'appendix/Install-on-Centos-7.md'
         - 'Installation on Ubuntu 16' : 'appendix/Install-on-Ubuntu-16.md'
diff --git a/utils/import_wikipedia.sh b/utils/import_wikipedia.sh
deleted file mode 100755 (executable)
index f053499..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-psqlcmd=psql wikipedia2013
-mysql2pgsqlcmd=./mysql2pgsql.perl /dev/stdin /dev/stdout
-
-language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
-
-# wikipedia pages and links
-echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double, lon double );"  | $psqlcmd
-echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );"  | $psqlcmd
-
-for i in "${language[@]}"
-do
-    wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
-    wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
-    wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
-    wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
-done
-
-for i in "${language[@]}"
-do
-    gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | $mysql2pgsqlcmd | $psqlcmd
-    gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | $mysql2pgsqlcmd | $psqlcmd
-    gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | $mysql2pgsqlcmd | $psqlcmd
-    gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | $mysql2pgsqlcmd | $psqlcmd
-done
-
-for i in "${language[@]}"
-do
-    echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd
-    echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd
-    echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | $psqlcmd
-    echo "alter table ${i}pagelinkcount add column othercount integer;" | $psqlcmd
-    echo "update ${i}pagelinkcount set othercount = 0;" | $psqlcmd
-    for j in "${language[@]}"
-    do
-        echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | $psqlcmd
-    done
-    echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | $psqlcmd
-done
-
-echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | $psqlcmd
-
-# precalculated lat,lon from dbpedia
-wget http://downloads.dbpedia.org/current/en/geo_coordinates_en.nq.bz2
-bzip2 -dc geo_coordinates_en.nq.bz2 | grep http://www.georss.org/georss/point | sed 's|<http://dbpedia.org/resource/[^>]*> *<http://www.georss.org/georss/point> "\(-\?[-0-9.E]\+\) \(-\?[-0-9.E]\+\)"@en <http://\([a-z][a-z]\).wikipedia.org/wiki/\([^#]\+\)#> .|update pagelinks set lat=\1, lon=\2 where language = '"'"'\3'"'"' and title = decode_url_part('"'"'\4'"'"');|g' | $psqlcmd
-
-# media wiki dumper
-wget https://github.com/bcollier/mwdumper/blob/master/build/mwdumper.jar
-
-# latest english wikipedia articles
-wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-java -jar mwdumper.jar --format=sql:1.5 enwiki-latest-pages-articles.xml.bz2 | ./mysql2pgsql.perl /dev/stdin /dev/stdout | sed 's/"text (/text ("/g' | sed 's/"old_flags)"/"old_flags")/g' | sed 's/"revision (/revision ("/g' | sed 's/"rev_deleted)"/"rev_deleted")/g' | sed 's/"page (/page ("/g' | sed 's/"page_len)"/"page_len")/g' | sed "s/DATE_ADD(E'1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)[+]//g" | sed 's/RAND()/0/g' | $psqlcmd