#!/usr/bin/env perl use 5.010; use strict; use warnings; use Pod::Usage (); use Getopt::Long (); BEGIN { eval "require MediaWiki::API; require YAML::XS;" or do { print "You have to install some modules via CPAN to run this:\n"; print " sudo cpanp MediaWiki::API YAML::XS\n"; exit 1; }; } use MediaWiki::API; use YAML::XS qw(Dump); use Test::More 'no_plan'; =head1 NAME update-wiki-pages - Scrape the wiki for key/value wiki description pages =head1 SYNOPSIS perl script/misc/update-wiki-pages config/wiki_pages.yml Or with prove(1): prove -e 'perl script/misc/update-wiki-pages' config/wiki_pages.yml =cut # Get the command-line options Getopt::Long::Parser->new( config => [ qw< bundling no_ignore_case no_require_order pass_through > ], )->getoptions( 'h|help' => \my $help, ) or help(); # On --help help() if $help; my $out_file = $ARGV[0]; $out_file //= 'config/wiki_pages.yml'; help() unless -f $out_file; # Get a API interface my $mw = MediaWiki::API->new(); ok($mw, "Got a MediaWiki API"); $mw->{config}->{api_url} = 'http://wiki.openstreetmap.org/w/api.php'; # All our goodies my (%feature, %count); # This is what you get on: ## http://wiki.openstreetmap.org/w/index.php?search=Template:KeyDescription&fulltext=Search&fulltext=Search for my $lang ('', map { "${_}:" } qw[ Pt Fi De It HU Cz Fr RU Pl ]) { ok(1, " Templates for language '$lang'"); # Key pages ok(1, " Getting key pages"); my $cnt = stick_content_in_hash("key", "Template:${lang}KeyDescription", \%feature); $cnt += stick_content_in_hash("key", "Template:${lang}Feature", \%feature); ok(1, " Got $cnt key pages"); $count{key} += $cnt; # Value pages ok(1, " Getting value pages"); $cnt = stick_content_in_hash("tag", "Template:${lang}ValueDescription", \%feature); ok(1, " Got $cnt value pages"); $count{value} += $cnt; } ok(1, "Got a total of $count{$_} ${_}s") for qw[ key value ]; # Dump to .yml file open my $out, ">", $out_file or die "Can't open file '$out_file' supplied on the command line"; say $out "# THIS FILE IS AUTOGENERATED WITH THE script/misc/update-wiki-pages"; say $out "# PROGRAM DO NOT MANUALLY EDIT IT"; say $out ""; say $out Dump(\%feature); close $out; exit 0; sub stick_content_in_hash { my ($key, $title, $hash) = @_; my $ukey = ucfirst $key; my $space_to_underscore = sub { my $txt = shift; $txt =~ s/ /_/g; $txt; }; my $count = 0; my $process_link = sub { my $link = shift; $count++; ok(1, " ... got $count links") if $count % 200 == 0; my $title = $link->{title}; my $lang; my $key_name; if ($title =~ /^$ukey:(?.*?)$/) { # English by default $lang = "en"; $key_name = $space_to_underscore->($+{key_name}); } elsif ($title =~ /^(?[^:]+):$ukey:(?.*?)$/) { $lang = lc $+{lang}; $key_name = $space_to_underscore->($+{key_name}); } if ($lang && !exists($hash->{$lang}->{$key}->{$key_name})) { $hash->{$lang}->{$key}->{$key_name} = $title; } }; get_embeddedin( $title, sub { my $link = shift; $process_link->($link); get_redirects( $link->{title}, sub { my $link = shift; $process_link->($link) if exists($link->{redirect}); } ); } ); return $count; } sub process_list { my $callback = shift; my $links = shift; for my $link (@$links) { $callback->($link); } } sub get_embeddedin { my ($title, $callback) = @_; my $articles = $mw->list( { action => 'query', list => 'embeddedin', eititle => $title, eifilterredir => 'nonredirects', # Doesn't work for De:* and anything non-en. Odd. # einamespace => '0|8', eilimit => '200', }, { max => '0', hook => sub { process_list($callback, @_) }, skip_encoding => 1, } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } sub get_redirects { my ($title, $callback) = @_; my $articles = $mw->list( { action => 'query', list => 'backlinks', bltitle => $title, blfilterredir => 'redirects', # Doesn't work for De:* and anything non-en. Odd. # einamespace => '0|8', bllimit => '200', }, { max => '0', hook => sub { process_list($callback, @_) }, skip_encoding => 1, } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } sub help { my %arg = @_; Pod::Usage::pod2usage( -verbose => $arg{ verbose }, -exitval => $arg{ exitval } || 0, ); }