- if [[ $TEST_SUITE == "monaco" ]]; then wget --no-verbose --output-document=../data/monaco.osm.pbf http://download.geofabrik.de/europe/monaco-latest.osm.pbf; fi
- if [[ $TEST_SUITE == "monaco" ]]; then /usr/bin/env php ./utils/setup.php --osm-file ../data/monaco.osm.pbf --osm2pgsql-cache 1000 --all 2>&1 | grep -v 'ETA (seconds)'; fi
- if [[ $TEST_SUITE == "monaco" ]]; then /usr/bin/env php ./utils/specialphrases.php --wiki-import | psql -d test_api_nominatim >/dev/null; fi
+ - if [[ $TEST_SUITE == "monaco" ]]; then /usr/bin/env php ./utils/check_import_finished.php; fi
notifications:
email: false
)
set(CUSTOMSCRIPTS
+ utils/check_import_finished.php
utils/country_languages.php
- utils/importWikipedia.php
utils/export.php
utils/query.php
utils/setup.php
#!/bin/bash
psqlcmd() {
- psql wikiprocessingdb
+ psql --quiet wikiprocessingdb
}
mysql2pgsqlcmd() {
./mysql2pgsql.perl /dev/stdin /dev/stdout
}
+download() {
+ echo "Downloading $1"
+    wget --quiet --no-clobber --tries 3 "$1"
+}
+
+# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+# requires Bash 4.0
+readarray -t LANGUAGES < languages.txt
+
+
-# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+echo "====================================================================="
+echo "Download wikidata dump tables"
+echo "====================================================================="
-language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
+# 114M wikidatawiki-latest-geo_tags.sql.gz
+# 1.7G wikidatawiki-latest-page.sql.gz
+# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz
+download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
+download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
+download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
-# get a few wikidata dump tables
-wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
-wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
-wget https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz
+echo "====================================================================="
+echo "Import wikidata dump tables"
+echo "====================================================================="
-# import wikidata tables
+echo "Importing wikidatawiki-latest-geo_tags"
+gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
-gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd
-gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
+echo "Importing wikidatawiki-latest-page"
+gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd
+
+echo "Importing wikidatawiki-latest-wb_items_per_site"
gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd
-# get wikidata places from wikidata query API
+
+
+
+
+echo "====================================================================="
+echo "Get wikidata places from wikidata query API"
+echo "====================================================================="
+
+echo "Number of place types:"
+wc -l wikidata_place_types.txt
while read F ; do
- wget "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
+ echo "Querying for place type $F..."
+ wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
cat $F.csv >> wikidata_place_dump.csv
done < wikidata_place_types.txt
-# import wikidata places
-
-echo "CREATE TABLE wikidata_place_dump (item text, instance_of text);" | psqlcmd
-echo "COPY wikidata_place_dump (item, instance_of) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv' DELIMITER ',' CSV;" | psqlcmd
-
-echo "CREATE TABLE wikidata_place_type_levels (place_type text, level integer);" | psqlcmd
-echo "COPY wikidata_place_type_levels (place_type, level) FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv' DELIMITER ',' CSV HEADER;" | psqlcmd
-# create derived tables
+echo "====================================================================="
+echo "Import wikidata places"
+echo "====================================================================="
+
+echo "CREATE TABLE wikidata_place_dump (
+ item text,
+ instance_of text
+ );" | psqlcmd
+
+echo "COPY wikidata_place_dump (item, instance_of)
+ FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv'
+ DELIMITER ','
+ CSV
+ ;" | psqlcmd
+
+echo "CREATE TABLE wikidata_place_type_levels (
+ place_type text,
+ level integer
+ );" | psqlcmd
+
+echo "COPY wikidata_place_type_levels (place_type, level)
+ FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv'
+ DELIMITER ','
+ CSV
+ HEADER
+ ;" | psqlcmd
+
+
+
+
+echo "====================================================================="
+echo "Create derived tables"
+echo "====================================================================="
+
+echo "CREATE TABLE geo_earth_primary AS
+ SELECT gt_page_id,
+ gt_lat,
+ gt_lon
+ FROM geo_tags
+ WHERE gt_globe = 'earth'
+ AND gt_primary = 1
+ AND NOT( gt_lat < -90
+ OR gt_lat > 90
+ OR gt_lon < -180
+ OR gt_lon > 180
+ OR gt_lat=0
+ OR gt_lon=0)
+ ;" | psqlcmd
+
+echo "CREATE TABLE geo_earth_wikidata AS
+ SELECT DISTINCT geo_earth_primary.gt_page_id,
+ geo_earth_primary.gt_lat,
+ geo_earth_primary.gt_lon,
+ page.page_title,
+ page.page_namespace
+ FROM geo_earth_primary
+ LEFT OUTER JOIN page
+ ON (geo_earth_primary.gt_page_id = page.page_id)
+ ORDER BY geo_earth_primary.gt_page_id
+ ;" | psqlcmd
+
+echo "ALTER TABLE wikidata_place_dump
+ ADD COLUMN ont_level integer,
+ ADD COLUMN lat numeric(11,8),
+ ADD COLUMN lon numeric(11,8)
+ ;" | psqlcmd
+
+echo "UPDATE wikidata_place_dump
+ SET ont_level = wikidata_place_type_levels.level
+ FROM wikidata_place_type_levels
+ WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type
+ ;" | psqlcmd
+
+echo "CREATE TABLE wikidata_places
+ AS
+ SELECT DISTINCT ON (item) item,
+ instance_of,
+ MAX(ont_level) AS ont_level,
+ lat,
+ lon
+ FROM wikidata_place_dump
+ GROUP BY item,
+ instance_of,
+ ont_level,
+ lat,
+ lon
+ ORDER BY item
+ ;" | psqlcmd
+
+echo "UPDATE wikidata_places
+ SET lat = geo_earth_wikidata.gt_lat,
+ lon = geo_earth_wikidata.gt_lon
+ FROM geo_earth_wikidata
+ WHERE wikidata_places.item = geo_earth_wikidata.page_title
+ ;" | psqlcmd
+
+
+
+
+echo "====================================================================="
+echo "Process language pages"
+echo "====================================================================="
+
+
+echo "CREATE TABLE wikidata_pages (
+ item text,
+ instance_of text,
+ lat numeric(11,8),
+ lon numeric(11,8),
+ ips_site_page text,
+ language text
+ );" | psqlcmd
+
+for i in "${LANGUAGES[@]}"
+do
+ echo "CREATE TABLE wikidata_${i}_pages AS
+ SELECT wikidata_places.item,
+ wikidata_places.instance_of,
+ wikidata_places.lat,
+ wikidata_places.lon,
+ wb_items_per_site.ips_site_page
+ FROM wikidata_places
+ LEFT JOIN wb_items_per_site
+ ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id)
+ WHERE ips_site_id = '${i}wiki'
+ AND LEFT(wikidata_places.item,1) = 'Q'
+ ORDER BY wikidata_places.item
+ ;" | psqlcmd
+
+ echo "ALTER TABLE wikidata_${i}_pages
+ ADD COLUMN language text
+ ;" | psqlcmd
+
+ echo "UPDATE wikidata_${i}_pages
+ SET language = '${i}'
+ ;" | psqlcmd
+
+ echo "INSERT INTO wikidata_pages
+ SELECT item,
+ instance_of,
+ lat,
+ lon,
+ ips_site_page,
+ language
+ FROM wikidata_${i}_pages
+ ;" | psqlcmd
+done
-echo "CREATE TABLE geo_earth_primary AS SELECT gt_page_id, gt_lat, gt_lon FROM geo_tags WHERE gt_globe = 'earth' AND gt_primary = 1 AND NOT( gt_lat < -90 OR gt_lat > 90 OR gt_lon < -180 OR gt_lon > 180 OR gt_lat=0 OR gt_lon=0) ;" | psqlcmd
-echo "CREATE TABLE geo_earth_wikidata AS SELECT DISTINCT geo_earth_primary.gt_page_id, geo_earth_primary.gt_lat, geo_earth_primary.gt_lon, page.page_title, page.page_namespace FROM geo_earth_primary LEFT OUTER JOIN page ON (geo_earth_primary.gt_page_id = page.page_id) ORDER BY geo_earth_primary.gt_page_id;" | psqlcmd
+echo "ALTER TABLE wikidata_pages
+ ADD COLUMN wp_page_title text
+ ;" | psqlcmd
+echo "UPDATE wikidata_pages
+ SET wp_page_title = REPLACE(ips_site_page, ' ', '_')
+ ;" | psqlcmd
+echo "ALTER TABLE wikidata_pages
+ DROP COLUMN ips_site_page
+ ;" | psqlcmd
-echo "ALTER TABLE wikidata_place_dump ADD COLUMN ont_level integer, ADD COLUMN lat numeric(11,8), ADD COLUMN lon numeric(11,8);" | psqlcmd
-echo "UPDATE wikidata_place_dump SET ont_level = wikidata_place_type_levels.level FROM wikidata_place_type_levels WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type;" | psqlcmd
-echo "CREATE TABLE wikidata_places AS SELECT DISTINCT ON (item) item, instance_of, MAX(ont_level) AS ont_level, lat, lon FROM wikidata_place_dump GROUP BY item, instance_of, ont_level, lat, lon ORDER BY item;" | psqlcmd
-echo "UPDATE wikidata_places SET lat = geo_earth_wikidata.gt_lat, lon = geo_earth_wikidata.gt_lon FROM geo_earth_wikidata WHERE wikidata_places.item = geo_earth_wikidata.page_title" | psqlcmd
-# process language pages
+echo "====================================================================="
+echo "Add wikidata to wikipedia_article table"
+echo "====================================================================="
-echo "CREATE TABLE wikidata_pages (item text, instance_of text, lat numeric(11,8), lon numeric(11,8), ips_site_page text, language text );" | psqlcmd
+echo "UPDATE wikipedia_article
+ SET lat = wikidata_pages.lat,
+ lon = wikidata_pages.lon,
+ wd_page_title = wikidata_pages.item,
+ instance_of = wikidata_pages.instance_of
+ FROM wikidata_pages
+ WHERE wikipedia_article.language = wikidata_pages.language
+ AND wikipedia_article.title = wikidata_pages.wp_page_title
+ ;" | psqlcmd
-for i in "${language[@]}"
-do
- echo "CREATE TABLE wikidata_${i}_pages as select wikidata_places.item, wikidata_places.instance_of, wikidata_places.lat, wikidata_places.lon, wb_items_per_site.ips_site_page FROM wikidata_places LEFT JOIN wb_items_per_site ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id) WHERE ips_site_id = '${i}wiki' AND LEFT(wikidata_places.item,1) = 'Q' order by wikidata_places.item;" | psqlcmd
- echo "ALTER TABLE wikidata_${i}_pages ADD COLUMN language text;" | psqlcmd
- echo "UPDATE wikidata_${i}_pages SET language = '${i}';" | psqlcmd
- echo "INSERT INTO wikidata_pages SELECT item, instance_of, lat, lon, ips_site_page, language FROM wikidata_${i}_pages;" | psqlcmd
-done
+echo "CREATE TABLE wikipedia_article_slim
+ AS
+ SELECT * FROM wikipedia_article
+ WHERE wikidata_id IS NOT NULL
+ ;" | psqlcmd
-echo "ALTER TABLE wikidata_pages ADD COLUMN wp_page_title text;" | psqlcmd
-echo "UPDATE wikidata_pages SET wp_page_title = REPLACE(ips_site_page, ' ', '_');" | psqlcmd
-echo "ALTER TABLE wikidata_pages DROP COLUMN ips_site_page;" | psqlcmd
+echo "ALTER TABLE wikipedia_article
+ RENAME TO wikipedia_article_full
+ ;" | psqlcmd
+echo "ALTER TABLE wikipedia_article_slim
+ RENAME TO wikipedia_article
+ ;" | psqlcmd
-# add wikidata to wikipedia_article table
-echo "UPDATE wikipedia_article SET lat = wikidata_pages.lat, lon = wikidata_pages.lon, wd_page_title = wikidata_pages.item, instance_of = wikidata_pages.instance_of FROM wikidata_pages WHERE wikipedia_article.language = wikidata_pages.language AND wikipedia_article.title = wikidata_pages.wp_page_title;" | psqlcmd
-echo "CREATE TABLE wikipedia_article_slim AS SELECT * FROM wikipedia_article WHERE wikidata_id IS NOT NULL;" | psqlcmd
-echo "ALTER TABLE wikipedia_article RENAME TO wikipedia_article_full;" | psqlcmd
-echo "ALTER TABLE wikipedia_article_slim RENAME TO wikipedia_article;" | psqlcmd
-# clean up intermediate tables
+echo "====================================================================="
+echo "Dropping intermediate tables"
+echo "====================================================================="
echo "DROP TABLE wikidata_place_dump;" | psqlcmd
echo "DROP TABLE geo_earth_primary;" | psqlcmd
-for i in "${language[@]}"
+for i in "${LANGUAGES[@]}"
do
echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
done
#!/bin/bash
psqlcmd() {
- psql wikiprocessingdb
+ psql --quiet wikiprocessingdb
}
mysql2pgsqlcmd() {
./mysql2pgsql.perl /dev/stdin /dev/stdout
}
+download() {
+ echo "Downloading $1"
+    wget --quiet --no-clobber --tries 3 "$1"
+}
+
+
+# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+# requires Bash 4.0
+readarray -t LANGUAGES < languages.txt
+
+
+
+echo "====================================================================="
+echo "Create wikipedia calculation tables"
+echo "====================================================================="
+
+echo "CREATE TABLE linkcounts (
+ language text,
+ title text,
+ count integer,
+ sumcount integer,
+ lat double precision,
+ lon double precision
+ );" | psqlcmd
-# list the languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
+echo "CREATE TABLE wikipedia_article (
+ language text NOT NULL,
+ title text NOT NULL,
+ langcount integer,
+ othercount integer,
+ totalcount integer,
+ lat double precision,
+ lon double precision,
+ importance double precision,
+ title_en text,
+ osm_type character(1),
+ osm_id bigint
+ );" | psqlcmd
-language=( "ar" "bg" "ca" "cs" "da" "de" "en" "es" "eo" "eu" "fa" "fr" "ko" "hi" "hr" "id" "it" "he" "lt" "hu" "ms" "nl" "ja" "no" "pl" "pt" "kk" "ro" "ru" "sk" "sl" "sr" "fi" "sv" "tr" "uk" "vi" "vo" "war" "zh" )
+echo "CREATE TABLE wikipedia_redirect (
+ language text,
+ from_title text,
+ to_title text
+ );" | psqlcmd
-# create wikipedia calculation tables
-echo "CREATE TABLE linkcounts (language text, title text, count integer, sumcount integer, lat double precision, lon double precision);" | psqlcmd
-echo "CREATE TABLE wikipedia_article (language text NOT NULL, title text NOT NULL, langcount integer, othercount integer, totalcount integer, lat double precision, lon double precision, importance double precision, title_en text, osm_type character(1), osm_id bigint );" | psqlcmd
-echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title text );" | psqlcmd
-# download individual wikipedia language tables
+echo "====================================================================="
+echo "Download individual wikipedia language tables"
+echo "====================================================================="
-for i in "${language[@]}"
+
+for i in "${LANGUAGES[@]}"
do
- wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
- wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
- wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
- wget https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
+ echo "Language: $i"
+
+ # english is the largest
+ # 1.7G enwiki-latest-page.sql.gz
+ # 6.2G enwiki-latest-pagelinks.sql.gz
+ # 355M enwiki-latest-langlinks.sql.gz
+ # 128M enwiki-latest-redirect.sql.gz
+
+    # example of a smaller language, Turkish
+ # 53M trwiki-latest-page.sql.gz
+ # 176M trwiki-latest-pagelinks.sql.gz
+ # 106M trwiki-latest-langlinks.sql.gz
+ # 3.2M trwiki-latest-redirect.sql.gz
+
+ download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz
+ download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz
+ download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz
+ download https://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz
done
-# import individual wikipedia language tables
-for i in "${language[@]}"
+
+
+echo "====================================================================="
+echo "Import individual wikipedia language tables"
+echo "====================================================================="
+
+for i in "${LANGUAGES[@]}"
do
+ echo "Language: $i"
+
+ echo "Importing ${i}wiki-latest-pagelinks"
gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | mysql2pgsqlcmd | psqlcmd
- gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "Importing ${i}wiki-latest-page"
+ gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "Importing ${i}wiki-latest-langlinks"
gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | mysql2pgsqlcmd | psqlcmd
- gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
+
+ echo "Importing ${i}wiki-latest-redirect"
+ gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | mysql2pgsqlcmd | psqlcmd
done
-# process language tables and associated pagelink counts
-for i in "${language[@]}"
-do
- echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
- echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | psqlcmd
- echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | psqlcmd
- echo "alter table ${i}pagelinkcount add column othercount integer;" | psqlcmd
- echo "update ${i}pagelinkcount set othercount = 0;" | psqlcmd
-done
-for i in "${language[@]}"
+
+echo "====================================================================="
+echo "Process language tables and associated pagelink counts"
+echo "====================================================================="
+
+
+for i in "${LANGUAGES[@]}"
do
- for j in "${language[@]}"
+ echo "Language: $i"
+
+ echo "CREATE TABLE ${i}pagelinkcount
+ AS
+ SELECT pl_title AS title,
+ COUNT(*) AS count
+ FROM ${i}pagelinks
+ WHERE pl_namespace = 0
+ GROUP BY pl_title
+ ;" | psqlcmd
+
+ echo "INSERT INTO linkcounts
+ SELECT '${i}',
+ pl_title,
+ COUNT(*)
+ FROM ${i}pagelinks
+ WHERE pl_namespace = 0
+ GROUP BY pl_title
+ ;" | psqlcmd
+
+ echo "INSERT INTO wikipedia_redirect
+ SELECT '${i}',
+ page_title,
+ rd_title
+ FROM ${i}redirect
+ JOIN ${i}page ON (rd_from = page_id)
+ WHERE page_namespace = 0
+ AND rd_namespace = 0
+ ;" | psqlcmd
+
+ echo "ALTER TABLE ${i}pagelinkcount
+ ADD COLUMN othercount integer
+ ;" | psqlcmd
+
+ echo "UPDATE ${i}pagelinkcount
+ SET othercount = 0
+ ;" | psqlcmd
+
+ for j in "${LANGUAGES[@]}"
do
- echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | psqlcmd
+ echo "UPDATE ${i}pagelinkcount
+ SET othercount = ${i}pagelinkcount.othercount + x.count
+ FROM (
+ SELECT page_title AS title,
+ count
+ FROM ${i}langlinks
+ JOIN ${i}page ON (ll_from = page_id)
+ JOIN ${j}pagelinkcount ON (ll_lang = '${j}' AND ll_title = title)
+ ) AS x
+ WHERE x.title = ${i}pagelinkcount.title
+ ;" | psqlcmd
done
- echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | psqlcmd
+
+ echo "INSERT INTO wikipedia_article
+ SELECT '${i}',
+ title,
+ count,
+ othercount,
+ count + othercount
+ FROM ${i}pagelinkcount
+ ;" | psqlcmd
done
-# calculate importance score for each wikipedia page
-echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | psqlcmd
-# clean up intermediate tables to conserve space
+echo "====================================================================="
+echo "Calculate importance score for each wikipedia page"
+echo "====================================================================="
+
+echo "UPDATE wikipedia_article
+ SET importance = LOG(totalcount)/LOG((SELECT MAX(totalcount) FROM wikipedia_article))
+ ;" | psqlcmd
+
+
+
+
+
+echo "====================================================================="
+echo "Clean up intermediate tables to conserve space"
+echo "====================================================================="
-for i in "${language[@]}"
+for i in "${LANGUAGES[@]}"
do
- echo "DROP TABLE ${i}pagelinks;" | psqlcmd
- echo "DROP TABLE ${i}page;" | psqlcmd
- echo "DROP TABLE ${i}langlinks;" | psqlcmd
- echo "DROP TABLE ${i}redirect;" | psqlcmd
+ echo "DROP TABLE ${i}pagelinks;" | psqlcmd
+ echo "DROP TABLE ${i}page;" | psqlcmd
+ echo "DROP TABLE ${i}langlinks;" | psqlcmd
+ echo "DROP TABLE ${i}redirect;" | psqlcmd
echo "DROP TABLE ${i}pagelinkcount;" | psqlcmd
done
--- /dev/null
+ar
+bg
+ca
+cs
+da
+de
+en
+es
+eo
+eu
+fa
+fr
+ko
+hi
+hr
+id
+it
+he
+lt
+hu
+ms
+nl
+ja
+no
+pl
+pt
+kk
+ro
+ru
+sk
+sl
+sr
+fi
+sv
+tr
+uk
+vi
+war
+zh
\ No newline at end of file
and recompile (`cmake .. && make`).
-## I see the error "ERROR: permission denied for language c"
+### I see the error "ERROR: permission denied for language c"
`nominatim.so`, written in C, is required to be installed on the database
server. Some managed database (cloud) services like Amazon RDS do not allow
filesystem does not fully support 'mmap'. A notable candidate is virtualbox's
vboxfs.
+### I see the error: "clang: Command not found" on CentOS
+
+On CentOS 7 users reported `/opt/rh/llvm-toolset-7/root/usr/bin/clang: Command not found`.
+Double-check clang is installed. Instead of `make` try running `make CLANG=true`.
+
### nominatim UPDATE failed: ERROR: buffer 179261 is not owned by resource owner Portal
Several users [reported this](https://github.com/openstreetmap/Nominatim/issues/1168) during the initial import of the database. It's
Import all data necessary to compute addresses down to house number level.
* **settings/import-full.style**
Default style that also includes points of interest.
+* **settings/import-extratags.style**
+ Like the full style but also adds most of the OSM tags into the extratags
+ column.
The style can be changed with the configuration `CONST_Import_Style`.
street | 42h | 400 GB | 180 GB
address | 59h | 500 GB | 260 GB
full | 80h | 575 GB | 300 GB
+extratags | 80h | 585 GB | 310 GB
You can also customize the styles further. For a description of the
style format see [the development section](../develop/Import.md).
swapping or you are getting out-of-memory errors, reduce the cache size or
even consider using a flatnode file.
+### Verify import finished
+
+Run this script to verify that all required tables and indices were created successfully.
+
+```sh
+./utils/check_import_finished.php
+```
+
+
## Tuning the database
Accurate word frequency information for search terms helps PostgreSQL's query
./utils/update.php --import-osmosis-all
-(Note that even though the old name "import-osmosis-all" has been kept for compatibility reasons, Osmosis is not required to run this - it uses pyosmium behind the scenes.)
+(Note that even though the old name "import-osmosis-all" has been kept for
+compatibility reasons, Osmosis is not required to run this - it uses pyosmium
+behind the scenes.)
If you have imported multiple country extracts and want to keep them
up-to-date, have a look at the script in
return ($this->getOne($sSQL, array(':tablename' => $sTableName)) == 1);
}
+    /**
+     * Check if an index exists in the database. Optionally filtered by table name.
+     *
+     * @param string $sIndexName Name of the index to look for.
+     * @param string $sTableName Optional table name to restrict the search to.
+     *
+     * @return boolean
+     */
+ public function indexExists($sIndexName, $sTableName = null)
+ {
+ return in_array($sIndexName, $this->getListOfIndices($sTableName));
+ }
+
+    /**
+     * Returns a list of index names in the database, optionally filtered by table name.
+     *
+     * @param string $sTableName Optional table name to restrict the search to.
+     *
+     * @return array
+     */
+ public function getListOfIndices($sTableName = null)
+ {
+ // table_name | index_name | column_name
+ // -----------------------+---------------------------------+--------------
+ // country_name | idx_country_name_country_code | country_code
+ // country_osm_grid | idx_country_osm_grid_geometry | geometry
+ // import_polygon_delete | idx_import_polygon_delete_osmid | osm_id
+ // import_polygon_delete | idx_import_polygon_delete_osmid | osm_type
+ // import_polygon_error | idx_import_polygon_error_osmid | osm_id
+ // import_polygon_error | idx_import_polygon_error_osmid | osm_type
+ $sSql = <<< END
+SELECT
+ t.relname as table_name,
+ i.relname as index_name,
+ a.attname as column_name
+FROM
+ pg_class t,
+ pg_class i,
+ pg_index ix,
+ pg_attribute a
+WHERE
+ t.oid = ix.indrelid
+ and i.oid = ix.indexrelid
+ and a.attrelid = t.oid
+ and a.attnum = ANY(ix.indkey)
+ and t.relkind = 'r'
+ and i.relname NOT LIKE 'pg_%'
+ FILTERS
+ ORDER BY
+ t.relname,
+ i.relname,
+ a.attname
+END;
+
+ $aRows = null;
+ if ($sTableName) {
+ $sSql = str_replace('FILTERS', 'and t.relname = :tablename', $sSql);
+ $aRows = $this->getAll($sSql, array(':tablename' => $sTableName));
+ } else {
+ $sSql = str_replace('FILTERS', '', $sSql);
+ $aRows = $this->getAll($sSql);
+ }
+
+ $aIndexNames = array_unique(array_map(function ($aRow) {
+ return $aRow['index_name'];
+ }, $aRows));
+ sort($aIndexNames);
+
+ return $aIndexNames;
+ }
+
/**
* Since the DSN includes the database name, checks if the connection works.
*
}
- function format_distance($fDistance)
+ function format_distance($fDistance, $bInMeters = false)
{
- // $fDistance is in meters
- if ($fDistance < 1)
- {
- return '0';
- }
- elseif ($fDistance < 1000)
- {
- return'<abbr class="distance" title="'.$fDistance.'">~'.(round($fDistance,0)).' m</abbr>';
- }
- else
- {
- return'<abbr class="distance" title="'.$fDistance.'">~'.(round($fDistance/1000,1)).' km</abbr>';
+ if ($bInMeters) {
+ // $fDistance is in meters
+ if ($fDistance < 1) {
+ return '0';
+ }
+ elseif ($fDistance < 1000) {
+ return '<abbr class="distance" title="'.$fDistance.' meters">~'.(round($fDistance,0)).' m</abbr>';
+ }
+ else {
+ return '<abbr class="distance" title="'.$fDistance.' meters">~'.(round($fDistance/1000,1)).' km</abbr>';
+ }
+ } else {
+ if ($fDistance == 0) {
+ return '0';
+ } else {
+ return '<abbr class="distance" title="spheric distance '.$fDistance.'">'.(round($fDistance,4)).'</abbr>';
+ }
}
}
}
- function _one_row($aAddressLine){
+ function _one_row($aAddressLine, $bDistanceInMeters = false){
$bNotUsed = isset($aAddressLine['isaddress']) && !$aAddressLine['isaddress'];
echo '<tr class="' . ($bNotUsed?'notused':'') . '">'."\n";
echo ' <td>' . osmLink($aAddressLine) . "</td>\n";
echo ' <td>' . (isset($aAddressLine['rank_address']) ? $aAddressLine['rank_address'] : '') . "</td>\n";
echo ' <td>' . ($aAddressLine['admin_level'] < 15 ? $aAddressLine['admin_level'] : '') . "</td>\n";
- echo ' <td>' . format_distance($aAddressLine['distance'])."</td>\n";
+ echo ' <td>' . format_distance($aAddressLine['distance'], $bDistanceInMeters)."</td>\n";
echo ' <td>' . detailsLink($aAddressLine,'details >') . "</td>\n";
echo "</tr>\n";
}
{
headline('Linked Places');
foreach ($aLinkedLines as $aAddressLine) {
- _one_row($aAddressLine);
+ _one_row($aAddressLine, true);
}
}
headline3($sGroupHeading);
foreach ($aHierarchyLines as $aAddressLine) {
- _one_row($aAddressLine);
+ _one_row($aAddressLine, true);
}
}
if (count($aHierarchyLines) >= 500) {
-Subproject commit 3a045d7f15adc05d34a719da4266418e9663b322
+Subproject commit 5f3f736348e0e76b0fb0da22aa55c7aa75ee517e
},
{
"keys" : ["country_code", "ISO3166-1", "is_in:country_code", "is_in:country",
- "addr:country", "addr:country", "addr:country_code"],
+ "addr:country", "addr:country_code"],
"values" : {
"" : "country"
}
},
{
"keys" : ["country_code", "ISO3166-1", "is_in:country_code", "is_in:country",
- "addr:country", "addr:country", "addr:country_code"],
+ "addr:country", "addr:country_code"],
"values" : {
"" : "country"
}
--- /dev/null
+[
+{
+ "keys" : ["*source"],
+ "values" : {
+ "" : "skip"
+ }
+},
+{
+ "keys" : ["name:prefix", "name:suffix", "name:botanical", "wikidata",
+ "*:wikidata"],
+ "values" : {
+ "" : "extra"
+ }
+},
+{
+ "keys" : ["ref", "int_ref", "nat_ref", "reg_ref", "loc_ref", "old_ref",
+ "iata", "icao", "pcode", "pcode:*"],
+ "values" : {
+ "" : "ref"
+ }
+},
+{
+ "keys" : ["name", "name:*", "int_name", "int_name:*", "nat_name", "nat_name:*",
+ "reg_name", "reg_name:*", "loc_name", "loc_name:*",
+ "old_name", "old_name:*", "alt_name", "alt_name:*", "alt_name_*",
+ "official_name", "official_name:*", "place_name", "place_name:*",
+ "short_name", "short_name:*", "brand"],
+ "values" : {
+ "" : "name"
+ }
+},
+{
+ "keys" : ["addr:housename"],
+ "values" : {
+ "" : "name,house"
+ }
+},
+{
+ "keys" : ["emergency"],
+ "values" : {
+ "fire_hydrant" : "skip",
+ "yes" : "skip",
+ "no" : "skip",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["historic", "military"],
+ "values" : {
+ "no" : "skip",
+ "yes" : "skip",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["natural"],
+ "values" : {
+ "yes" : "skip",
+ "no" : "skip",
+ "coastline" : "skip",
+ "" : "main,with_name"
+ }
+},
+{
+ "keys" : ["landuse"],
+ "values" : {
+ "cemetry" : "main,with_name",
+ "" : "main,fallback,with_name"
+ }
+},
+{
+ "keys" : ["highway"],
+ "values" : {
+ "no" : "skip",
+ "turning_circle" : "skip",
+ "mini_roundabout" : "skip",
+ "noexit" : "skip",
+ "crossing" : "skip",
+ "traffic_signals" : "main,with_name",
+ "service" : "main,with_name",
+ "cycleway" : "main,with_name",
+ "path" : "main,with_name",
+ "footway" : "main,with_name",
+ "steps" : "main,with_name",
+ "bridleway" : "main,with_name",
+ "track" : "main,with_name",
+ "byway": "main,with_name",
+ "motorway_link" : "main,with_name",
+ "trunk_link" : "main,with_name",
+ "primary_link" : "main,with_name",
+ "secondary_link" : "main,with_name",
+ "tertiary_link" : "main,with_name",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["railway"],
+ "values" : {
+ "level_crossing" : "skip",
+ "no" : "skip",
+ "" : "main,with_name"
+ }
+},
+{
+ "keys" : ["man_made"],
+ "values" : {
+ "survey_point" : "skip",
+ "cutline" : "skip",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["aerialway"],
+ "values" : {
+ "pylon" : "skip",
+ "no" : "skip",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["boundary"],
+ "values" : {
+ "" : "main,with_name"
+ }
+},
+{
+ "keys" : ["amenity"],
+ "values" : {
+ "restaurant" : "main,operator",
+ "fuel" : "main,operator"
+ }
+},
+{
+ "keys" : ["aeroway", "amenity", "club", "craft", "leisure",
+ "office", "mountain_pass"],
+ "values" : {
+ "no" : "skip",
+ "" : "main"
+ }
+},
+{
+ "keys" : ["shop"],
+ "values" : {
+ "no" : "skip",
+ "" : "main,operator"
+ }
+},
+{
+ "keys" : ["tourism"],
+ "values" : {
+ "yes" : "skip",
+ "no" : "skip",
+ "" : "main,operator"
+ }
+},
+{
+ "keys" : ["bridge", "tunnel"],
+ "values" : {
+ "" : "main,with_name_key"
+ }
+},
+{
+ "keys" : ["waterway"],
+ "values" : {
+ "riverbank" : "skip",
+ "" : "main,with_name"
+ }
+},
+{
+ "keys" : ["place"],
+ "values" : {
+ "" : "main"
+ }
+},
+{
+ "keys" : ["junction"],
+ "values" : {
+ "" : "main,fallback,with_name"
+ }
+},
+{
+ "keys" : ["postal_code", "postcode", "addr:postcode",
+ "tiger:zip_left", "tiger:zip_right"],
+ "values" : {
+ "" : "postcode,fallback"
+ }
+},
+{
+ "keys" : ["country_code", "ISO3166-1", "is_in:country_code", "is_in:country",
+ "addr:country", "addr:country_code"],
+ "values" : {
+ "" : "country"
+ }
+},
+{
+ "keys" : ["addr:housenumber", "addr:conscriptionnumber", "addr:streetnumber"],
+ "values" : {
+ "" : "address,house"
+ }
+},
+{
+ "keys" : ["addr:interpolation"],
+ "values" : {
+ "" : "interpolation,address"
+ }
+},
+{
+ "keys" : ["addr:*", "is_in:*", "tiger:county", "is_in"],
+ "values" : {
+ "" : "address"
+ }
+},
+{
+ "keys" : ["building"],
+ "values" : {
+ "no" : "skip",
+ "" : "main,fallback,with_name"
+ }
+},
+{
+ "keys" : ["note", "note:*", "source", "source*", "attribution",
+ "comment", "fixme", "FIXME", "created_by", "tiger:*", "NHD:*",
+ "nhd:*", "gnis:*", "geobase:*", "KSJ2:*", "yh:*",
+ "osak:*", "naptan:*", "CLC:*", "import", "it:fvg:*",
+ "type", "lacounty:*", "ref:ruian:*", "building:ruian:type",
+ "ref:linz:*"],
+ "values" : {
+ "" : "skip"
+ }
+},
+{
+ "keys" : [""],
+ "values" : {
+ "" : "extra"
+ }
+}
+]
},
{
"keys" : ["country_code", "ISO3166-1", "is_in:country_code", "is_in:country",
- "addr:country", "addr:country", "addr:country_code"],
+ "addr:country", "addr:country_code"],
"values" : {
"" : "country"
}
},
{
"keys" : ["country_code", "ISO3166-1", "is_in:country_code", "is_in:country",
- "addr:country", "addr:country", "addr:country_code"],
+ "addr:country", "addr:country_code"],
"values" : {
"" : "country"
}
--- /dev/null
+<?php
+
+require_once(CONST_BasePath.'/lib/init-cmd.php');
+
+$term_colors = array(
+ 'green' => "\033[92m",
+ 'red' => "\x1B[31m",
+ 'normal' => "\033[0m"
+);
+
+$print_success = function ($message = 'OK') use ($term_colors) {
+ echo $term_colors['green'].$message.$term_colors['normal']."\n";
+};
+
+$print_fail = function ($message = 'Failed') use ($term_colors) {
+ echo $term_colors['red'].$message.$term_colors['normal']."\n";
+};
+
+
+$oDB = new Nominatim\DB;
+
+
+echo 'Checking database got created ... ';
+if ($oDB->databaseExists()) {
+ $print_success();
+} else {
+ $print_fail();
+ echo <<< END
+ Hints:
+ * Is the database server started?
+ * Check the CONST_Database_DSN variable in build/settings/local.php
+ * Try connecting to the database with the same settings
+
+END;
+ exit(1);
+}
+
+
+echo 'Checking nominatim.so module installed ... ';
+$sStandardWord = $oDB->getOne("SELECT make_standard_name('a')");
+if ($sStandardWord === 'a') {
+ $print_success();
+} else {
+ $print_fail();
+ echo <<< END
+ The Postgresql extension nominatim.so was not found in the database.
+ Hints:
+     * Check the output of the CMake/make installation step
+ * Does nominatim.so exist?
+ * Does nominatim.so exist on the database server?
+ * Can nominatim.so be accessed by the database user?
+
+END;
+ exit(1);
+}
+
+echo 'Checking place table ... ';
+if ($oDB->tableExists('place')) {
+ $print_success();
+} else {
+ $print_fail();
+ echo <<< END
+ * The import didn't finish.
+ Hints:
+ * Check the output of the utils/setup.php you ran.
+ Usually the osm2pgsql step failed. Check for errors related to
+ * the file you imported not containing any places
+ * harddrive full
+ * out of memory (RAM)
+            * osm2pgsql killed by other scripts, for consuming too much memory
+
+END;
+ exit(1);
+}
+
+
+
+echo 'Checking indexing status ... ';
+$iUnindexed = $oDB->getOne('SELECT count(*) FROM placex WHERE indexed_status > 0');
+if ($iUnindexed == 0) {
+ $print_success();
+} else {
+ $print_fail();
+ echo <<< END
+    The indexing didn't finish. There are still $iUnindexed places. See the
+ question 'Can a stopped/killed import process be resumed?' in the
+ troubleshooting guide.
+
+END;
+ exit(1);
+}
+
+echo "Search index creation\n";
+$aExpectedIndices = array(
+ // sql/indices.src.sql
+ 'idx_word_word_id',
+ 'idx_place_addressline_address_place_id',
+ 'idx_placex_rank_search',
+ 'idx_placex_rank_address',
+ 'idx_placex_pendingsector',
+ 'idx_placex_parent_place_id',
+ 'idx_placex_geometry_reverse_lookuppoint',
+ 'idx_placex_geometry_reverse_lookuppolygon',
+ 'idx_placex_geometry_reverse_placenode',
+ 'idx_location_area_country_place_id',
+ 'idx_osmline_parent_place_id',
+ 'idx_osmline_parent_osm_id',
+ 'idx_place_osm_unique',
+ 'idx_postcode_id',
+ 'idx_postcode_postcode',
+
+ // sql/indices_search.src.sql
+ 'idx_search_name_nameaddress_vector',
+ 'idx_search_name_name_vector',
+ 'idx_search_name_centroid'
+);
+
+foreach ($aExpectedIndices as $sExpectedIndex) {
+ echo "Checking index $sExpectedIndex ... ";
+ if ($oDB->indexExists($sExpectedIndex)) {
+ $print_success();
+ } else {
+ $print_fail();
+ echo <<< END
+ Hints:
+ * Rerun the setup.php --create-search-indices step
+
+END;
+ exit(1);
+ }
+}
+
+if (CONST_Use_US_Tiger_Data) {
+ echo 'Checking TIGER table exists ... ';
+ if ($oDB->tableExists('location_property_tiger')) {
+ $print_success();
+ } else {
+ $print_fail();
+ echo <<< END
+ Table 'location_property_tiger' does not exist. Run the TIGER data
+ import again.
+
+END;
+ exit(1);
+ }
+}
+
+
+
+
+exit(0);
+++ /dev/null
-<?php
-
-require_once(CONST_BasePath.'/lib/init-cmd.php');
-ini_set('memory_limit', '800M');
-
-$aCMDOptions
- = array(
- 'Create and setup nominatim search system',
- array('help', 'h', 0, 1, 0, 0, false, 'Show Help'),
- array('quiet', 'q', 0, 1, 0, 0, 'bool', 'Quiet output'),
- array('verbose', 'v', 0, 1, 0, 0, 'bool', 'Verbose output'),
-
- array('create-tables', '', 0, 1, 0, 0, 'bool', 'Create wikipedia tables'),
- array('parse-articles', '', 0, 1, 0, 0, 'bool', 'Parse wikipedia articles'),
- array('link', '', 0, 1, 0, 0, 'bool', 'Try to link to existing OSM ids'),
- );
-getCmdOpt($_SERVER['argv'], $aCMDOptions, $aCMDResult, true, true);
-
-/*
-$sTestPageText = <<<EOD
-{{Coord|47|N|2|E|type:country_region:FR|display=title}}
-{{ Infobox Amusement park
-| name = Six Flags Great Adventure
-| image = [[File:SixFlagsGreatAdventure logo.png]]
-| caption = Six Flags Great Adventure logo
-| location = [[Jackson, New Jersey|Jackson]]
-| location2 = New Jersey
-| location3 = United States
-| address = 1 Six Flags Boulevard<ref name="drivedir"/>
-| season = March/April through October/November
-| opening_date = July 1, 1974
-| previous_names = Great Adventure
-| area_acre = 2200
-| rides = 45 park admission rides
-| coasters = 12
-| water_rides = 2
-| owner = [[Six Flags]]
-| general_manager =
-| homepage = [http://www.sixflags.com/parks/greatadventure/ Six Flags Great Adventure]
-}}
-EOD;
-var_dump(_templatesToProperties(_parseWikipediaContent($sTestPageText)));
-exit;
-//| coordinates = {{Coord|40|08|16.65|N|74|26|26.69|W|region:US-NJ_type:landmark|display=inline,title}}
-*/
-/*
-
- $a = array();
- $a[] = 'test';
-
- $oDB = new Nominatim\DB();
- $oDB->connect();
-
- if ($aCMDResult['drop-tables'])
- {
- $oDB->query('DROP TABLE wikipedia_article');
- $oDB->query('DROP TABLE wikipedia_link');
- }
-*/
-
-if ($aCMDResult['create-tables']) {
- $sSQL = <<<'EOD'
-CREATE TABLE wikipedia_article (
- language text NOT NULL,
- title text NOT NULL,
- langcount integer,
- othercount integer,
- totalcount integer,
- lat double precision,
- lon double precision,
- importance double precision,
- title_en text,
- osm_type character(1),
- osm_id bigint,
- infobox_type text,
- population bigint,
- website text
-);
- $oDB->query($sSQL);
-
- $oDB->query("SELECT AddGeometryColumn('wikipedia_article', 'location', 4326, 'GEOMETRY', 2)");
-
- $sSQL = <<<'EOD'
-CREATE TABLE wikipedia_link (
- from_id INTEGER,
- to_name text
- );
-EOD;
- $oDB->query($sSQL);
-}
-
-
-function degreesAndMinutesToDecimal($iDegrees, $iMinutes = 0, $fSeconds = 0, $sNSEW = 'N')
-{
- $sNSEW = strtoupper($sNSEW);
- return ($sNSEW == 'S' || $sNSEW == 'W'?-1:1) * ((float)$iDegrees + (float)$iMinutes/60 + (float)$fSeconds/3600);
-}
-
-
-function _parseWikipediaContent($sPageText)
-{
- $sPageText = str_replace("\n", ' ', $sPageText);
- $sPageText = preg_replace('#<!--.*?-->#m', '', $sPageText);
- $sPageText = preg_replace('#<math>.*?<\\/math>#m', '', $sPageText);
-
- $aPageText = preg_split('#({{|}}|\\[\\[|\\]\\]|[|])#', $sPageText, -1, PREG_SPLIT_DELIM_CAPTURE);
-
- $aPageProperties = array();
- $sPageBody = '';
- $aTemplates = array();
- $aLinks = array();
-
- $aTemplateStack = array();
- $aState = array('body');
- foreach ($aPageText as $i => $sPart) {
- switch ($sPart) {
- case '{{':
- array_unshift($aTemplateStack, array('', array()));
- array_unshift($aState, 'template');
- break;
- case '}}':
- if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
- $aTemplate = array_shift($aTemplateStack);
- array_shift($aState);
-
- $aTemplates[] = $aTemplate;
- }
- break;
- case '[[':
- $sLinkPage = '';
- $sLinkSyn = '';
- array_unshift($aState, 'link');
- break;
- case ']]':
- if ($aState[0] == 'link' || $aState[0] == 'linksynonim') {
- if (!$sLinkSyn) $sLinkSyn = $sLinkPage;
- if (substr($sLinkPage, 0, 6) == 'Image:') $sLinkSyn = substr($sLinkPage, 6);
-
- $aLinks[] = array($sLinkPage, $sLinkSyn);
-
- array_shift($aState);
- switch ($aState[0]) {
- case 'template':
- $aTemplateStack[0][0] .= trim($sPart);
- break;
- case 'templateparam':
- $aTemplateStack[0][1][0] .= $sLinkSyn;
- break;
- case 'link':
- $sLinkPage .= trim($sPart);
- break;
- case 'linksynonim':
- $sLinkSyn .= $sPart;
- break;
- case 'body':
- $sPageBody .= $sLinkSyn;
- break;
- default:
- var_dump($aState, $sPageName, $aTemplateStack, $sPart, $aPageText);
- fail('unknown state');
- }
- }
- break;
- case '|':
- if ($aState[0] == 'template' || $aState[0] == 'templateparam') {
- // Create a new template paramater
- $aState[0] = 'templateparam';
- array_unshift($aTemplateStack[0][1], '');
- }
- if ($aState[0] == 'link') $aState[0] = 'linksynonim';
- break;
- default:
- switch ($aState[0]) {
- case 'template':
- $aTemplateStack[0][0] .= trim($sPart);
- break;
- case 'templateparam':
- $aTemplateStack[0][1][0] .= $sPart;
- break;
- case 'link':
- $sLinkPage .= trim($sPart);
- break;
- case 'linksynonim':
- $sLinkSyn .= $sPart;
- break;
- case 'body':
- $sPageBody .= $sPart;
- break;
- default:
- var_dump($aState, $aPageText);
- fail('unknown state');
- }
- break;
- }
- }
- return $aTemplates;
-}
-
-function _templatesToProperties($aTemplates)
-{
- $aPageProperties = array();
- foreach ($aTemplates as $iTemplate => $aTemplate) {
- $aParams = array();
- foreach (array_reverse($aTemplate[1]) as $iParam => $sParam) {
- if (($iPos = strpos($sParam, '=')) === false) {
- $aParams[] = trim($sParam);
- } else {
- $aParams[trim(substr($sParam, 0, $iPos))] = trim(substr($sParam, $iPos+1));
- }
- }
- $aTemplates[$iTemplate][1] = $aParams;
- if (!isset($aPageProperties['sOfficialName']) && isset($aParams['official_name']) && $aParams['official_name']) $aPageProperties['sOfficialName'] = $aParams['official_name'];
- if (!isset($aPageProperties['iPopulation']) && isset($aParams['population']) && $aParams['population'] && preg_match('#^[0-9.,]+#', $aParams['population'])) {
- $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population']);
- }
- if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_total']) && $aParams['population_total'] && preg_match('#^[0-9.,]+#', $aParams['population_total'])) {
- $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_total']);
- }
- if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_urban']) && $aParams['population_urban'] && preg_match('#^[0-9.,]+#', $aParams['population_urban'])) {
- $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_urban']);
- }
- if (!isset($aPageProperties['iPopulation']) && isset($aParams['population_estimate']) && $aParams['population_estimate'] && preg_match('#^[0-9.,]+#', $aParams['population_estimate'])) {
- $aPageProperties['iPopulation'] = (int)str_replace(array(',', '.'), '', $aParams['population_estimate']);
- }
- if (!isset($aPageProperties['sWebsite']) && isset($aParams['website']) && $aParams['website']) {
- if (preg_match('#^\\[?([^ \\]]+)[^\\]]*\\]?$#', $aParams['website'], $aMatch)) {
- $aPageProperties['sWebsite'] = $aMatch[1];
- if (strpos($aPageProperties['sWebsite'], ':/'.'/') === false) {
- $aPageProperties['sWebsite'] = 'http:/'.'/'.$aPageProperties['sWebsite'];
- }
- }
- }
- if (!isset($aPageProperties['sTopLevelDomain']) && isset($aParams['cctld']) && $aParams['cctld']) {
- $aPageProperties['sTopLevelDomain'] = str_replace(array('[', ']', '.'), '', $aParams['cctld']);
- }
-
- if (!isset($aPageProperties['sInfoboxType']) && strtolower(substr($aTemplate[0], 0, 7)) == 'infobox') {
- $aPageProperties['sInfoboxType'] = trim(substr($aTemplate[0], 8));
- // $aPageProperties['aInfoboxParams'] = $aParams;
- }
-
- // Assume the first template with lots of params is the type (fallback for infobox)
- if (!isset($aPageProperties['sPossibleInfoboxType']) && count($aParams) > 10) {
- $aPageProperties['sPossibleInfoboxType'] = trim($aTemplate[0]);
- // $aPageProperties['aInfoboxParams'] = $aParams;
- }
-
- // do we have a lat/lon
- if (!isset($aPageProperties['fLat'])) {
- if (isset($aParams['latd']) && isset($aParams['longd'])) {
- $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['latd'], @$aParams['latm'], @$aParams['lats'], @$aParams['latNS']);
- $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['longd'], @$aParams['longm'], @$aParams['longs'], @$aParams['longEW']);
- }
- if (isset($aParams['lat_degrees']) && isset($aParams['lat_degrees'])) {
- $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams['lat_degrees'], @$aParams['lat_minutes'], @$aParams['lat_seconds'], @$aParams['lat_direction']);
- $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams['long_degrees'], @$aParams['long_minutes'], @$aParams['long_seconds'], @$aParams['long_direction']);
- }
- if (isset($aParams['latitude']) && isset($aParams['longitude'])) {
- if (preg_match('#[0-9.]+#', $aParams['latitude']) && preg_match('#[0-9.]+#', $aParams['longitude'])) {
- $aPageProperties['fLat'] = (float)$aParams['latitude'];
- $aPageProperties['fLon'] = (float)$aParams['longitude'];
- }
- }
- if (strtolower($aTemplate[0]) == 'coord') {
- if (isset($aParams[3]) && (strtoupper($aParams[3]) == 'N' || strtoupper($aParams[3]) == 'S')) {
- $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], $aParams[2], $aParams[3]);
- $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[4], $aParams[5], $aParams[6], $aParams[7]);
- } elseif (isset($aParams[0]) && isset($aParams[1]) && isset($aParams[2]) && (strtoupper($aParams[2]) == 'N' || strtoupper($aParams[2]) == 'S')) {
- $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aParams[0], $aParams[1], 0, $aParams[2]);
- $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aParams[3], $aParams[4], 0, $aParams[5]);
- } elseif (isset($aParams[0]) && isset($aParams[1]) && (strtoupper($aParams[1]) == 'N' || strtoupper($aParams[1]) == 'S')) {
- $aPageProperties['fLat'] = (strtoupper($aParams[1]) == 'N'?1:-1) * (float)$aParams[0];
- $aPageProperties['fLon'] = (strtoupper($aParams[3]) == 'E'?1:-1) * (float)$aParams[2];
- } elseif (isset($aParams[0]) && is_numeric($aParams[0]) && isset($aParams[1]) && is_numeric($aParams[1])) {
- $aPageProperties['fLat'] = (float)$aParams[0];
- $aPageProperties['fLon'] = (float)$aParams[1];
- }
- }
- if (isset($aParams['Latitude']) && isset($aParams['Longitude'])) {
- $aParams['Latitude'] = str_replace(' ', ' ', $aParams['Latitude']);
- $aParams['Longitude'] = str_replace(' ', ' ', $aParams['Longitude']);
- if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS]) to ([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
- $aPageProperties['fLat'] =
- (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
- +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
- } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([NS])#', $aParams['Latitude'], $aMatch)) {
- $aPageProperties['fLat'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
- }
-
- if (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW]) to ([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
- $aPageProperties['fLon'] =
- (degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4])
- +degreesAndMinutesToDecimal($aMatch[5], $aMatch[7], 0, $aMatch[8])) / 2;
- } elseif (preg_match('#^([0-9]+)°( ([0-9]+)′)? ([EW])#', $aParams['Longitude'], $aMatch)) {
- $aPageProperties['fLon'] = degreesAndMinutesToDecimal($aMatch[1], $aMatch[3], 0, $aMatch[4]);
- }
- }
- }
- }
- if (isset($aPageProperties['sPossibleInfoboxType'])) {
- if (!isset($aPageProperties['sInfoboxType'])) $aPageProperties['sInfoboxType'] = '#'.$aPageProperties['sPossibleInfoboxType'];
- unset($aPageProperties['sPossibleInfoboxType']);
- }
- return $aPageProperties;
-}
-
-if (isset($aCMDResult['parse-wikipedia'])) {
- $oDB = new Nominatim\DB();
- $oDB->connect();
-
- $sSQL = 'select page_title from content where page_namespace = 0 and page_id %10 = ';
- $sSQL .= $aCMDResult['parse-wikipedia'];
- $sSQL .= ' and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\' and page_content ilike \'%lon%\'))';
- $aArticleNames = $oDB->getCol($sSQL);
- /* $aArticleNames = $oDB->getCol($sSQL = 'select page_title from content where page_namespace = 0
- and (page_content ilike \'%{{Coord%\' or (page_content ilike \'%lat%\'
- and page_content ilike \'%lon%\')) and page_title in (\'Virginia\')');
- */
- foreach ($aArticleNames as $sArticleName) {
- $sPageText = $oDB->getOne('select page_content from content where page_namespace = 0 and page_title = \''.pg_escape_string($sArticleName).'\'');
- $aP = _templatesToProperties(_parseWikipediaContent($sPageText));
-
- if (isset($aP['sInfoboxType'])) {
- $aP['sInfoboxType'] = preg_replace('#\\s+#', ' ', $aP['sInfoboxType']);
- $sSQL = 'update wikipedia_article set ';
- $sSQL .= 'infobox_type = \''.pg_escape_string($aP['sInfoboxType']).'\'';
- $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
- $oDB->query($sSQL);
- }
- if (isset($aP['iPopulation'])) {
- $sSQL = 'update wikipedia_article set ';
- $sSQL .= 'population = \''.pg_escape_string($aP['iPopulation']).'\'';
- $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
- $oDB->query($sSQL);
- }
- if (isset($aP['sWebsite'])) {
- $sSQL = 'update wikipedia_article set ';
- $sSQL .= 'website = \''.pg_escape_string($aP['sWebsite']).'\'';
- $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
- $oDB->query($sSQL);
- }
- if (isset($aP['fLat']) && ($aP['fLat']!='-0' || $aP['fLon']!='-0')) {
- if (!isset($aP['sInfoboxType'])) $aP['sInfoboxType'] = '';
- echo $sArticleName.'|'.$aP['sInfoboxType'].'|'.$aP['fLat'].'|'.$aP['fLon'] ."\n";
- $sSQL = 'update wikipedia_article set ';
- $sSQL .= 'lat = \''.pg_escape_string($aP['fLat']).'\',';
- $sSQL .= 'lon = \''.pg_escape_string($aP['fLon']).'\'';
- $sSQL .= ' where language = \'en\' and title = \''.pg_escape_string($sArticleName).'\';';
- $oDB->query($sSQL);
- }
- }
-}
-
-
-function nominatimXMLStart($hParser, $sName, $aAttr)
-{
- global $aNominatRecords;
- switch ($sName) {
- case 'PLACE':
- $aNominatRecords[] = $aAttr;
- break;
- }
-}
-
-
-function nominatimXMLEnd($hParser, $sName)
-{
-}
-
-
-if (isset($aCMDResult['link'])) {
- $oDB = new Nominatim\DB();
- $oDB->connect();
-
- $aWikiArticles = $oDB->getAll("select * from wikipedia_article where language = 'en' and lat is not null and osm_type is null and totalcount < 31 order by importance desc limit 200000");
-
- // If you point this script at production OSM you will be blocked
- $sNominatimBaseURL = 'http://SEVERNAME/search.php';
-
- foreach ($aWikiArticles as $aRecord) {
- $aRecord['name'] = str_replace('_', ' ', $aRecord['title']);
-
- $sURL = $sNominatimBaseURL.'?format=xml&accept-language=en';
-
- echo "\n-- ".$aRecord['name'].', '.$aRecord['infobox_type']."\n";
- $fMaxDist = 0.0000001;
- $bUnknown = false;
- switch (strtolower($aRecord['infobox_type'])) {
- case 'former country':
- continue 2;
- case 'sea':
- $fMaxDist = 60; // effectively turn it off
- $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
- break;
- case 'country':
- case 'island':
- case 'islands':
- case 'continent':
- $fMaxDist = 60; // effectively turn it off
- $sURL .= '&featuretype=country';
- $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
- break;
- case 'prefecture japan':
- $aRecord['name'] = trim(str_replace(' Prefecture', ' ', $aRecord['name']));
- // intentionally no break
- case 'state':
- case '#us state':
- case 'county':
- case 'u.s. state':
- case 'u.s. state symbols':
- case 'german state':
- case 'province or territory of canada':
- case 'indian jurisdiction':
- case 'province':
- case 'french region':
- case 'region of italy':
- case 'kommune':
- case '#australia state or territory':
- case 'russian federal subject':
- $fMaxDist = 4;
- $sURL .= '&featuretype=state';
- $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
- break;
- case 'protected area':
- $fMaxDist = 1;
- $sURL .= '&nearlat='.$aRecord['lat'];
- $sURL .= '&nearlon='.$aRecord['lon'];
- $sURL .= '&viewbox='.($aRecord['lon']-$fMaxDist).','.($aRecord['lat']+$fMaxDist).','.($aRecord['lon']+$fMaxDist).','.($aRecord['lat']-$fMaxDist);
- break;
- case 'settlement':
- $bUnknown = true;
- // intentionally no break
- case 'french commune':
- case 'italian comune':
- case 'uk place':
- case 'italian comune':
- case 'australian place':
- case 'german place':
- case '#geobox':
- case 'u.s. county':
- case 'municipality':
- case 'city japan':
- case 'russian inhabited locality':
- case 'finnish municipality/land area':
- case 'england county':
- case 'israel municipality':
- case 'russian city':
- case 'city':
- $fMaxDist = 0.2;
- $sURL .= '&featuretype=settlement';
- $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
- break;
- case 'mountain':
- case 'mountain pass':
- case 'river':
- case 'lake':
- case 'airport':
- $fMaxDist = 0.2;
- $sURL .= '&viewbox='.($aRecord['lon']-0.5).','.($aRecord['lat']+0.5).','.($aRecord['lon']+0.5).','.($aRecord['lat']-0.5);
- break;
- case 'ship begin':
- $fMaxDist = 0.1;
- $aTypes = array('wreck');
- $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
- $sURL .= '&nearlat='.$aRecord['lat'];
- $sURL .= '&nearlon='.$aRecord['lon'];
- break;
- case 'road':
- case 'university':
- case 'company':
- case 'department':
- $fMaxDist = 0.005;
- $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
- $sURL .= '&bounded=1';
- $sURL .= '&nearlat='.$aRecord['lat'];
- $sURL .= '&nearlon='.$aRecord['lon'];
- break;
- default:
- $bUnknown = true;
- $fMaxDist = 0.005;
- $sURL .= '&viewbox='.($aRecord['lon']-0.01).','.($aRecord['lat']+0.01).','.($aRecord['lon']+0.01).','.($aRecord['lat']-0.01);
- // $sURL .= "&bounded=1";
- $sURL .= '&nearlat='.$aRecord['lat'];
- $sURL .= '&nearlon='.$aRecord['lon'];
- echo '-- Unknown: '.$aRecord['infobox_type']."\n";
- break;
- }
- $sNameURL = $sURL.'&q='.urlencode($aRecord['name']);
-
- var_Dump($sNameURL);
- $sXML = file_get_contents($sNameURL);
-
- $aNominatRecords = array();
- $hXMLParser = xml_parser_create();
- xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
- xml_parse($hXMLParser, $sXML, true);
- xml_parser_free($hXMLParser);
-
- if (!isset($aNominatRecords[0])) {
- $aNameParts = preg_split('#[(,]#', $aRecord['name']);
- if (count($aNameParts) > 1) {
- $sNameURL = $sURL.'&q='.urlencode(trim($aNameParts[0]));
- var_Dump($sNameURL);
- $sXML = file_get_contents($sNameURL);
-
- $aNominatRecords = array();
- $hXMLParser = xml_parser_create();
- xml_set_element_handler($hXMLParser, 'nominatimXMLStart', 'nominatimXMLEnd');
- xml_parse($hXMLParser, $sXML, true);
- xml_parser_free($hXMLParser);
- }
- }
-
- // assume first is best/right
- for ($i = 0; $i < count($aNominatRecords); $i++) {
- $fDiff = ($aRecord['lat']-$aNominatRecords[$i]['LAT']) * ($aRecord['lat']-$aNominatRecords[$i]['LAT']);
- $fDiff += ($aRecord['lon']-$aNominatRecords[$i]['LON']) * ($aRecord['lon']-$aNominatRecords[$i]['LON']);
- $fDiff = sqrt($fDiff);
- if ($bUnknown) {
- // If it was an unknown type base it on the rank of the found result
- $iRank = (int)$aNominatRecords[$i]['PLACE_RANK'];
- if ($iRank <= 4) $fMaxDist = 2;
- elseif ($iRank <= 8) $fMaxDist = 1;
- elseif ($iRank <= 10) $fMaxDist = 0.8;
- elseif ($iRank <= 12) $fMaxDist = 0.6;
- elseif ($iRank <= 17) $fMaxDist = 0.2;
- elseif ($iRank <= 18) $fMaxDist = 0.1;
- elseif ($iRank <= 22) $fMaxDist = 0.02;
- elseif ($iRank <= 26) $fMaxDist = 0.001;
- else $fMaxDist = 0.001;
- }
- echo '-- FOUND "'.substr($aNominatRecords[$i]['DISPLAY_NAME'], 0, 50);
- echo '", '.$aNominatRecords[$i]['CLASS'].', '.$aNominatRecords[$i]['TYPE'];
- echo ', '.$aNominatRecords[$i]['PLACE_RANK'].', '.$aNominatRecords[$i]['OSM_TYPE'];
- echo " (dist:$fDiff, max:$fMaxDist)\n";
- if ($fDiff > $fMaxDist) {
- echo "-- Diff too big $fDiff (max: $fMaxDist)".$aRecord['lat'].','.$aNominatRecords[$i]['LAT'].' & '.$aRecord['lon'].','.$aNominatRecords[$i]['LON']." \n";
- } else {
- $sSQL = 'update wikipedia_article set osm_type=';
- switch ($aNominatRecords[$i]['OSM_TYPE']) {
- case 'relation':
- $sSQL .= "'R'";
- break;
- case 'way':
- $sSQL .= "'W'";
- break;
- case 'node':
- $sSQL .= "'N'";
- break;
- }
- $sSQL .= ', osm_id='.$aNominatRecords[$i]['OSM_ID']." where language = '".pg_escape_string($aRecord['language'])."' and title = '".pg_escape_string($aRecord['title'])."'";
- $oDB->query($sSQL);
- break;
- }
- }
- }
-}
+++ /dev/null
-DROP TABLE entity;
-DROP TABLE entity_label;
-DROP TABLE entity_description;
-DROP TABLE entity_alias;
-DROP TABLE entity_link;
-DROP TABLE entity_property;
-
-CREATE TABLE entity (
- entity_id bigint,
- title text,
- pid bigint,
- qid bigint,
- datatype text,
- CONSTRAINT pk_entity PRIMARY KEY(entity_id)
-);
-
-CREATE TABLE entity_label (
- entity_id bigint,
- language text,
- label text,
- CONSTRAINT pk_entity_label PRIMARY KEY(entity_id,language)
-);
-
-CREATE TABLE entity_description (
- entity_id bigint,
- language text,
- description text,
- CONSTRAINT pk_entity_description PRIMARY KEY(entity_id,language)
-);
-
-CREATE TABLE entity_alias (
- entity_id bigint,
- language text,
- alias text,
- CONSTRAINT pk_entity_alias PRIMARY KEY(entity_id,language,alias)
-);
-
-CREATE TABLE entity_link (
- entity_id bigint,
- target text,
- value text,
- CONSTRAINT pk_entity_link PRIMARY KEY(entity_id,target)
-);
-
-CREATE TABLE entity_link_hit (
- entity_id bigint,
- target text,
- value text,
- hits bigint,
- CONSTRAINT pk_entity_link_hit PRIMARY KEY(entity_id,target)
-);
-
-CREATE TABLE link_hit (
- target text,
- value text,
- hits bigint,
- CONSTRAINT pk_link_hit PRIMARY KEY(target,value)
-);
-
-CREATE TABLE entity_property (
- entity_id bigint,
- order_id bigint,
- pid bigint,
- string text,
- toqid bigint,
- location geometry,
- datetime timestamp with time zone,
- CONSTRAINT pk_entity_property PRIMARY KEY(entity_id, order_id)
-);
-
-CREATE TABLE import_link_hit (
- target text,
- value text,
- hits bigint
-);
+++ /dev/null
-PSQL=/usr/lib/postgresql/9.2/bin/psql -d wikidata
-
-cat create.sql | $PSQL
-
-cat entity.csv | $PSQL -c "COPY entity from STDIN WITH CSV"
-cat entity_label.csv | $PSQL -c "COPY entity_label from STDIN WITH CSV"
-cat entity_description.csv | $PSQL -c "COPY entity_description from STDIN WITH CSV"
-cat entity_alias.csv | $PSQL -c "COPY entity_alias from STDIN WITH CSV"
-cat entity_link.csv | $PSQL -c "COPY entity_link from STDIN WITH CSV"
-cat entity_property.csv | $PSQL -c "COPY entity_property from STDIN WITH CSV"
-
-$PSQL -c "create index idx_entity_link_target on entity_link using btree (target,value)"
-$PSQL -c "create index idx_entity_qid on entity using btree (qid)"
-$PSQL -c "create table property_label_en as select pid,null::text as label from entity where pid is not null"
-$PSQL -c "update property_label_en set label = x.label from (select pid,label,language from entity join entity_label using (entity_id) where pid is not null and language = 'en') as x where x.pid = property_label_en.pid"
-$PSQL -c "create unique index idx_property_label_en on property_label_en using btree (pid)"
-$PSQL -c "alter table entity add column label_en text"
-$PSQL -c "update entity set label_en = label from entity_label where entity.entity_id = entity_label.entity_id and language = 'en'"
-$PSQL -c "alter table entity add column description_en text"
-$PSQL -c "update entity set description_en = description from entity_description where entity.entity_id = entity_description.entity_id and language = 'en'"
-
-cat totals.txt | $PSQL -c "COPY import_link_hit from STDIN WITH CSV DELIMITER ' '"
-$PSQL -c "truncate link_hit"
-$PSQL -c "insert into link_hit select target||'wiki', replace(catch_decode_url_part(value), '_', ' '), sum(hits) from import_link_hit where replace(catch_decode_url_part(value), '_', ' ') is not null group by target||'wiki', replace(dcatch_decode_url_part(value), '_', ' ')"
-$PSQL -c "truncate entity_link_hit"
-$PSQL -c "insert into entity_link_hit select entity_id, target, value, coalesce(hits,0) from entity_link left outer join link_hit using (target, value)"
-$PSQL -c "create table entity_hit as select entity_id,sum(hits) as hits from entity_link_hit group by entity_id"
-$PSQL -c "create unique index idx_entity_hit on entity_hit using btree (entity_id)"
+++ /dev/null
-#!/usr/bin/php -Cq
-<?php
-
-$hFile = @fopen('wikidatawiki-20130623-pages-articles.xml', 'r');
-
-$hFileEntity = fopen('entity.csv', 'w');
-$hFileEntityLabel = fopen('entity_label.csv', 'w');
-$hFileEntityDescription = fopen('entity_description.csv', 'w');
-$hFileEntityAlias = fopen('entity_alias.csv', 'w');
-$hFileEntityLink = fopen('entity_link.csv', 'w');
-$hFileEntityProperty = fopen('entity_property.csv', 'w');
-
-$iCount = 0;
-
-$sTitle = '';
-$iNS = false;
-$iID = false;
-
-if ($hFile) {
- while (($sLine = fgets($hFile, 4000000)) !== false) {
- if (substr($sLine, 0, 11) == ' <title>') {
- $sTitle = substr($sLine, 11, -9);
- } elseif (substr($sLine, 0, 8) == ' <ns>') {
- $iNS = (int)substr($sLine, 8, -6);
- } elseif (substr($sLine, 0, 8) == ' <id>') {
- $iID = (int)substr($sLine, 8, -6);
- } elseif (substr($sLine, 0, 33) == ' <text xml:space="preserve">') {
- if ($iNS == -2) continue;
- if ($iNS == -1) continue;
- if ($iNS == 1) continue;
- if ($iNS == 2) continue;
- if ($iNS == 3) continue;
- if ($iNS == 4) continue;
- if ($iNS == 5) continue;
- if ($iNS == 6) continue;
- if ($iNS == 7) continue;
- if ($iNS == 8) continue;
- if ($iNS == 9) continue;
- if ($iNS == 10) continue;
- if ($iNS == 11) continue;
- if ($iNS == 12) continue;
- if ($iNS == 13) continue;
- if ($iNS == 14) continue;
- if ($iNS == 15) continue;
- if ($iNS == 121) continue;
- if ($iNS == 123) continue;
- if ($iNS == 829) continue;
- if ($iNS == 1198) continue;
- if ($iNS == 1199) continue;
- $sText = html_entity_decode(substr($sLine, 33, -8), ENT_COMPAT, 'UTF-8');
- $aArticle = json_decode($sText, true);
-
- if (array_diff(array_keys($aArticle), array('label', 'description', 'aliases', 'links', 'entity', 'claims', 'datatype')) != array()) {
- // DEBUG
- var_dump($sTitle);
- var_dump(array_keys($aArticle));
- var_dump($aArticle);
- exit;
- }
-
- $iPID = $iQID = null;
- if ($aArticle['entity'][0] == 'p') {
- $iPID = (int) substr($aArticle['entity'], 1);
- } elseif ($aArticle['entity'][0] == 'q') {
- $iQID = (int) substr($aArticle['entity'], 1);
- } else {
- continue;
- }
-
- echo '.';
-
- fputcsv($hFileEntity, array($iID, $sTitle, $iPID, $iQID, @$aArticle['datatype']));
-
- foreach ($aArticle['label'] as $sLang => $sLabel) {
- fputcsv($hFileEntityLabel, array($iID, $sLang, $sLabel));
- // echo "insert into entity_label values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
- }
-
- foreach ($aArticle['description'] as $sLang => $sLabel) {
- fputcsv($hFileEntityDescription, array($iID, $sLang, $sLabel));
- // echo "insert into entity_description values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
- }
-
- foreach ($aArticle['aliases'] as $sLang => $aLabels) {
- $aUniqueAlias = array();
- foreach ($aLabels as $sLabel) {
- if (!isset($aUniqueAlias[$sLabel]) && $sLabel) {
- fputcsv($hFileEntityAlias, array($iID, $sLang, $sLabel));
- // echo "insert into entity_alias values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
- $aUniqueAlias[$sLabel] = true;
- }
- }
- }
-
- foreach ($aArticle['links'] as $sLang => $sLabel) {
- fputcsv($hFileEntityLink, array($iID, $sLang, $sLabel));
- // echo "insert into entity_link values (".$iID.",'".pg_escape_string($sLang)."','".pg_escape_string($sLabel)."');\n";
- }
-
-
- if (isset($aArticle['claims'])) {
- //
- foreach ($aArticle['claims'] as $iClaim => $aClaim) {
- //
- $bFail = false;
- if ($aClaim['m'][0] == 'novalue') continue;
- if ($aClaim['m'][0] == 'somevalue') continue;
- $iPID = (int)$aClaim['m'][1];
- if ($aClaim['m'][0] != 'value') $bFail = true;
- if ($aClaim['m'][2]== 'wikibase-entityid') {
- //
- if ($aClaim['m'][3]['entity-type'] != 'item') $bFail = true;
- fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, null, $aClaim['m'][3]['numeric-id'], null, null));
- // echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",null,".$aClaim['m'][3]['numeric-id'].",null);\n";
- } elseif ($aClaim['m'][2] == 'globecoordinate') {
- //
- if ($aClaim['m'][3]['globe'] != 'http://www.wikidata.org/entity/Q2') $bFail = true;
- fputcsv(
- $hFileEntityProperty,
- array(
- $iID,
- $iClaim,
- $iPID,
- null,
- null,
- 'SRID=4326;POINT('.((float) $aClaim['m'][3]['longitude']).' '.((float)$aClaim['m'][3]['latitude']).')', null
- )
- );
- /* echo "insert into entity_property values (nextval('seq_entity_property'),";
- * echo $iID.",".$iPID.",null,null,ST_SetSRID(ST_MakePoint(".((float)$aClaim['m'][3]['longitude']);
- * echo ", ".((float)$aClaim['m'][3]['latitude'])."),4326));\n";
- */
- } elseif ($aClaim['m'][2] == 'time') {
- // TODO!
- /*
- if ($aClaim['m'][3]['calendarmodel'] == 'http://www.wikidata.org/entity/Q1985727') {
- // Gregorian
- if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) {
- if ((int)$aMatch[2] < 4700 && ) {
- $sDateString = $aMatch[2].'-'.$aMatch[3].($aClaim['m'][3]['timezone']>=0?'+':'').$aClaim['m'][3]['timezone'].($aMatch[1]=='-'?' bc':'');
- fputcsv($hFileEntityProperty, array($iID,$iClaim,$iPID,null,null,null,$sDateString));
- }
- } else {
- // $bFail = true;
- }
- } elseif ( $aClaim['m'][3]['calendarmodel'] != 'http://www.wikidata.org/entity/Q1985786') {
- // Julian
- if (preg_match('#(\\+|-)0*([0-9]{4})-([0-9]{2})-([0-9]{2})T([0-9]{2}:[0-9]{2}:[0-9]{2})Z#', $aClaim['m'][3]['time'], $aMatch)) {
- var_dump($aMatch);
- exit;
- $iDayCount = juliantojd(2, 11, 1732);
- var_dump($iDayCount, jdtogregorian($iDayCount));
- } else {
- $bFail = true;
- exit;
- }
- exit;
- } else {
- // $bFail = true;
- }
- */
- } elseif ($aClaim['m'][2] == 'string') {
- // echo "insert into entity_property values (nextval('seq_entity_property'),".$iID.",".$iPID.",'".pg_escape_string($aClaim['m'][3])."',null,null);\n";
- fputcsv($hFileEntityProperty, array($iID, $iClaim, $iPID, $aClaim['m'][3], null, null, null));
- } else {
- $bFail = true;
- }
-
- // Don't care about sources: if ($aClaim['refs'] != array()) $bFail = true;
-
- if ($bFail) {
- var_dump($sTitle);
- var_dump($aClaim);
- } else {
- // process
- }
- }
- }
- }
- }
- fclose($hFile);
- fclose($hFileEntity);
- fclose($hFileEntityLabel);
- fclose($hFileEntityDescription);
- fclose($hFileEntityAlias);
- fclose($hFileEntityLink);
- fclose($hFileEntityProperty);
-}
+++ /dev/null
-<?php
-
-for ($iTimestamp = mktime(0, 0, 0, 5, 1, 2013); $iTimestamp < mktime(0, 0, 0, 6, 15, 2013); $iTimestamp += 24*60*60) {
- $sYear = date('Y', $iTimestamp);
- $sMonth = date('Y-m', $iTimestamp);
- $sDay = date('Ymd', $iTimestamp);
-
- for ($iHour = 0; $iHour < 24; $iHour++) {
- $sFilename = sprintf('pagecounts-'.$sDay.'-%02d0000', $iHour);
- echo $sFilename."\n";
- if (!file_exists($sFilename.'.gz')) {
- exec('wget http://dumps.wikimedia.org/other/pagecounts-raw/'.$sYear.'/'.$sMonth.'/'.$sFilename.'.gz');
- }
-
- exec('gzip -dc '.$sFilename.'.gz'.' | grep -e "^[a-z]\{2\} [^ :]\+ [0-9]\+" > hour.txt');
-
- $hPrevTotals = @fopen('totals.txt', 'r');
- $hDayTotals = @fopen('hour.txt', 'r');
- $hNewTotals = @fopen('newtotals.txt', 'w');
-
- $sPrevKey = $sDayKey = true;
- $sPrevLine = true;
- $sDayLine = true;
-
- do {
- if ($sPrevKey === $sDayKey) {
- if ($sPrevLine !== true) fputs($hNewTotals, "$sPrevKey ".($iPrevValue+$iDayValue)."\n");
- $sPrevLine = true;
- $sDayLine = true;
- } elseif ($sDayKey !== false && ($sPrevKey > $sDayKey || $sPrevKey === false)) {
- fputs($hNewTotals, "$sDayKey ".($iDayValue)."\n");
- $sDayLine = true;
- } elseif ($sPrevKey !== false && ($sDayKey > $sPrevKey || $sDayKey === false)) {
- fputs($hNewTotals, "$sPrevKey ".($iPrevValue)."\n");
- $sPrevLine = true;
- }
-
- if ($sPrevLine === true) {
- $sPrevLine = $hPrevTotals?fgets($hPrevTotals, 4096):false;
- if ($sPrevLine !== false) {
- $aPrevLine = explode(' ', $sPrevLine);
- $sPrevKey = $aPrevLine[0].' '.$aPrevLine[1];
- $iPrevValue = (int)$aPrevLine[2];
- } else {
- $sPrevKey = false;
- $iPrevValue = 0;
- }
- }
-
- if ($sDayLine === true) {
- $sDayLine = $hDayTotals?fgets($hDayTotals, 4096):false;
- if ($sDayLine !== false) {
- preg_match('#^([a-z]{2}) ([^ :]+) ([0-9]+) [0-9]+$#', $sDayLine, $aMatch);
- $sDayKey = $aMatch[1].' '.$aMatch[2];
- $iDayValue = (int)$aMatch[3];
- } else {
- $sDayKey = false;
- $iDayValue = 0;
- }
- }
- } while ($sPrevLine !== false || $sDayLine !== false);
-
- @fclose($hPrevTotals);
- @fclose($hDayTotals);
- @fclose($hNewTotals);
-
- @unlink('totals.txt');
- rename('newtotals.txt', 'totals.txt');
- }
-}
-
-// Notes:
-/*
- gzip -dc $FILE.gz | grep -e "^en [^ :]\+ [0-9]\+" |
- sed "s#\(^[a-z]\{2\}\) \([^ :]\+\) \([0-9]\+\) [0-9]\+#update wikipedia_article set hit_count = coalesce(hit_count,0) + \3 where language = '\1'
- and title = catch_decode_url_part('\2');#g" | /opt/mapquest/stdbase-dev$
- cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = s,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
- cat totals.txt | sed "s#\(^[a-z]\{2\}\) \([^ ]\+\) \([0-9]\+\)\$#update entity_link set hits = coalesce(hits,0) + \3 where target = '\1wiki' and value = catch_decode_url_part('\2');#g"
-*/