X-Git-Url: https://git.openstreetmap.org/nominatim.git/blobdiff_plain/a8608e2b796fd4f7885baddea391add5e308389a..9e35e5c2b02887e361972ffbb20e65bbba0d02c7:/utils/import_wikipedia.sh diff --git a/utils/import_wikipedia.sh b/utils/import_wikipedia.sh index 0a15e2d0..f053499c 100755 --- a/utils/import_wikipedia.sh +++ b/utils/import_wikipedia.sh @@ -11,32 +11,32 @@ echo "CREATE TABLE wikipedia_redirect (language text, from_title text, to_title for i in "${language[@]}" do - wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz - wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz - wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz - wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-page.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-pagelinks.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-langlinks.sql.gz + wget http://dumps.wikimedia.org/${i}wiki/latest/${i}wiki-latest-redirect.sql.gz done for i in "${language[@]}" do - gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | $mysql2pgsqlcmd | $psqlcmd - gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | $mysql2pgsqlcmd | $psqlcmd - gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | $mysql2pgsqlcmd | $psqlcmd - gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-pagelinks.sql.gz | sed "s/\`pagelinks\`/\`${i}pagelinks\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-page.sql.gz | sed "s/\`page\`/\`${i}page\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-langlinks.sql.gz | sed "s/\`langlinks\`/\`${i}langlinks\`/g" | $mysql2pgsqlcmd | $psqlcmd + gzip -dc ${i}wiki-latest-redirect.sql.gz | sed "s/\`redirect\`/\`${i}redirect\`/g" | $mysql2pgsqlcmd | $psqlcmd done for i in "${language[@]}" do - echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd - echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd - echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | $psqlcmd - echo "alter table ${i}pagelinkcount add column othercount integer;" | $psqlcmd - echo "update ${i}pagelinkcount set othercount = 0;" | $psqlcmd - for j in "${language[@]}" - do - echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | $psqlcmd - done - echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | $psqlcmd + echo "create table ${i}pagelinkcount as select pl_title as title,count(*) as count from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd + echo "insert into linkcounts select '${i}',pl_title,count(*) from ${i}pagelinks where pl_namespace = 0 group by pl_title;" | $psqlcmd + echo "insert into wikipedia_redirect select '${i}',page_title,rd_title from ${i}redirect join ${i}page on (rd_from = page_id) where page_namespace = 0 and rd_namespace = 0;" | $psqlcmd + echo "alter table ${i}pagelinkcount add column othercount integer;" | $psqlcmd + echo "update ${i}pagelinkcount set othercount = 0;" | $psqlcmd + for j in "${language[@]}" + do + echo "update ${i}pagelinkcount set othercount = ${i}pagelinkcount.othercount + x.count from (select page_title as title,count from ${i}langlinks join ${i}page on (ll_from = page_id) join ${j}pagelinkcount on (ll_lang = '${j}' and ll_title = title)) as x where x.title = ${i}pagelinkcount.title;" | $psqlcmd + done + echo "insert into wikipedia_article select '${i}', title, count, othercount, count+othercount from ${i}pagelinkcount;" | $psqlcmd done echo "update wikipedia_article set importance = log(totalcount)/log((select max(totalcount) from wikipedia_article))" | $psqlcmd