#!/usr/bin/env perl
use feature ':5.10';
use strict;
use warnings;
use File::Slurp qw(slurp);
use File::Basename qw(basename);
use YAML::Syck qw(Dump Load LoadFile DumpFile);
BEGIN {
    $YAML::Syck::Headless = 1;
    $YAML::Syck::SortKeys = 1;
}
use WWW::Mechanize;
use HTML::TableParser::Grid;
use Pod::Usage ();
use Getopt::Long ();
use Data::Dump 'dump';
use File::Spec::Functions qw(catfile);
use Storable;
use autodie;

=head1 NAME

merge-from-translatewiki - Get new translations from L<http://translatewiki.net> and selectively merge them with ours

=head1 SYNOPSIS

    # Run this normally, hopefully...
    merge-from-translatewiki --locales-dir=config/locales

    # Diff the existing files:
    config/locales$ for i in $(ls *yml | grep -v en.yml); do perl ../../script/locale/diff --dump-flat $i > $i.0; done

    # Merge and find out what changed:
    rails_port$ perl script/locale/merge-from-translatewiki --locales-dir config/locales

    # Or, more complexly:
    rails_port$ for i in $(svn st config/locales/ | egrep '^M|\?' | awk '{print $2}' | grep 'yml$'); do rm -v $i; done && svn up config/locales && perl script/locale/merge-from-translatewiki --locales-dir config/locales && svn st config/locales

    # Diff:
    config/locales$ for i in $(ls *yml | grep -v en.yml); do perl ../../script/locale/diff --dump-flat $i > $i.1; done && for i in $(ls *yml | grep -v en.yml); do diff -ru $i.*; done

=head1 DESCRIPTION

Translatewiki's export process has known bugs. This script imports new
messages from it while tiptoeing around those bugs.

=head1 OPTIONS

=over

=item -h, --help

Print this help message.

=item --locales-dir

The locales dir we'll merge stuff into. F<config/locales> by default.

=item --only-new

Only import translations that don't exist for us yet.

=item --cache

Write a L<Storable> cache for things downloaded from Translatewiki and use
it if it exists.

=back

=head1 AUTHOR

E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason

=cut

# Get the command-line options
Getopt::Long::Parser->new(
    config => [ qw< bundling no_ignore_case no_require_order pass_through > ],
)->getoptions(
    'h|help'        => \my $help,
    'locales-dir=s' => \(my $locales_dir = 'config/locales'),
    'only-new'      => \my $only_new,
    'cache'         => \my $cache,
) or help();

# On --help
help() if $help;

help() unless $locales_dir and -d $locales_dir;

###
### Main
###

### Get Translatewiki data
my %translatewiki_languages = translatewiki_languages();

# Don't process English from Translatewiki
delete $translatewiki_languages{en};

#say Dump \%translatewiki_languages;
my @translatewiki_languages_codes = keys %translatewiki_languages;
my %translatewiki_translations = get_translatewiki_translations(@translatewiki_languages_codes);
#say Dump \%translatewiki_translations;

### Get our existing data
my %my_translations;
my @my_yaml_files = glob catfile($locales_dir, '*.yml');
for my $my_yaml_file (@my_yaml_files) {
    # Strip the directory and the .yml suffix to get the language code,
    # e.g. config/locales/de.yml becomes de
    my $basename = basename($my_yaml_file, '.yml');
    my $tw_lang = lc $basename;

    say STDERR "Loading my translation $tw_lang ($my_yaml_file)";
    $my_translations{$tw_lang} = load_and_flatten_yaml(scalar slurp($my_yaml_file));
}

say "loaded my translations";

## Write out merged data
for my $translatewiki_lang (sort @translatewiki_languages_codes) {
    # Uppercase the region part of the code, e.g. zh-tw becomes zh-TW,
    # to match the naming of our locale files
    my $rails_lang = $translatewiki_lang;
    $rails_lang =~ s/(?<=-)(\w+)/\U$1\E/;

    my $out_file = catfile($locales_dir, $rails_lang . '.yml');

    unless (-f $out_file) {
        # No translation like this exists
        say STDERR "$rails_lang has no existing translation. Importing as-is from Translatewiki to $out_file";
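        # Translatewiki hands us a flat hash of "dot.separated.key" => message
        # pairs; expand_hash() rebuilds the nested structure the Rails locale
        # files use. Illustrative example (value made up):
        #     { 'site.edit.anon_edits' => '...' }
        # becomes
        #     { site => { edit => { anon_edits => '...' } } }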
        my $expanded = expand_hash($translatewiki_translations{$translatewiki_lang});
        my $out = +{ $rails_lang => $expanded };
        spit_out($out_file, $out);
    } elsif (ref $my_translations{$translatewiki_lang} eq 'HASH' and not $only_new) {
        say STDERR "$rails_lang has existing translations. Merging the old translation with the new Translatewiki one";

        # Get the data
        my %tw = %{ $translatewiki_translations{$translatewiki_lang} };
        my %me = %{ $my_translations{$translatewiki_lang} };
        my %en = %{ $my_translations{en} };

        # Use %tw to start with
        my %new = %tw;

        ### Merge stuff

        ## These keys shouldn't be removed but are due to
        ## Translatewiki fail (they were missing in the original
        ## import)
        my @url_keys = qw(
            browse.relation_member.entry
            changeset.changeset.id
            geocoder.search_osm_namefinder.suffix_suburb
            html.dir
            layouts.intro_3_bytemark
            layouts.intro_3_ucl
            layouts.project_name.h1
            layouts.project_name.title
            printable_name.with_version
            site.edit.anon_edits
            layouts.help_wiki_url
            layouts.shop_url
            notifier.gpx_notification.failure.import_failures_url
            notifier.signup_confirm_plain.the_wiki_url
            notifier.signup_confirm_plain.wiki_signup_url
            trace.edit.visibility_help_url
            trace.trace_form.help_url
            trace.trace_form.visibility_help_url
        );
        for my $key (@url_keys) {
            if (exists $me{$key} and not exists $new{$key}) {
                $new{$key} = $me{$key} if $me{$key} ne $en{$key};
            }
        }

        ## When foo exists in this file but only foo.one, foo.other
        ## etc. in English or the original file we don't want to throw
        ## away what we have
        my @plural_keys = qw( zero one many few other two );
        while (my ($me_k, $me_v) = each %me) {
            if (not exists $tw{ $me_k } and
                not exists $en{ $me_k } and
                (exists $en{ $me_k . '.zero' } or
                 exists $en{ $me_k . '.one' } or
                 exists $en{ $me_k . '.many' } or
                 exists $en{ $me_k . '.few' } or
                 exists $en{ $me_k . '.other' } or
                 exists $en{ $me_k . '.two' })) {
                #say STDERR "Bringing back nuked plural form '$me_k' Setting it to '$me{ $me_k }'";
                $new{ $me_k } = $me{ $me_k };
            }
        }

        # Both arrays and strings are supported in the site key. Avoid removing e.g.:
        #     -site.key.table.entry.school: 學校;大學
        # just because en.yml has site.key.table.entry.school.0 and site.key.table.entry.school.1
        while (my ($me_k, $me_v) = each %me) {
            next unless $me_k =~ /^site\.key\.table\.entry/;
            next if $me_k =~ /\.\d+$/;

            if (ref $en{ $me_k } eq 'ARRAY' and not ref $me{ $me_k }) {
                $new{ $me_k } = $me{ $me_k };
            }
        }

        # There are a bunch of keys on Translatewiki that are
        # equivalent to English for some reason. Probably because they
        # were there at import time. Nuke them.
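        # For example (hypothetical key and value): if %new still contains
        #     'layouts.edit' => 'Edit'
        # and en.yml carries exactly the same string for that key, the key is
        # deleted here rather than being kept as an untranslated copy of the
        # English message.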
        while (my ($new_k, $new_v) = each %new) {
            if (exists $en{ $new_k } and $en{ $new_k } eq $new_v) {
                #say "Purging dupe in $rails_lang: $new_k=$new_v";
                delete $new{ $new_k };
            }
        }

        my $expanded = expand_hash( \%new );
        my $out = +{ $rails_lang => $expanded };
        spit_out($out_file, $out);
    } elsif (not $only_new) {
        die "Internal error on $translatewiki_lang";
    }
}

sub spit_out {
    my ($file, $data) = @_;

    my $yaml_out = Dump $data;
    open my $fh, ">", $file;
    print $fh $yaml_out;
    close $fh;
}

#
# YAML stuff
#

sub mark_utf8 {
    my ($hash) = @_;

    # Mark as UTF-8
    map {
        if (ref $_ eq 'ARRAY') {
            map { utf8::decode($_) } @$_;
        } else {
            utf8::decode($_);
        }
    } values %$hash;
}

sub iterate {
    my ($hash, @path) = @_;
    my @ret;

    while (my ($k, $v) = each %$hash) {
        if (ref $v eq 'HASH') {
            push @ret => iterate($v, @path, $k);
        } else {
            push @ret => join(".", @path, $k), $v;
        }
    }

    return @ret;
}

sub expand_hash {
    my ($flat_hash) = @_;

    my %new_hash;
    while (my ($k, $v) = each %$flat_hash) {
        #say "Inserting $k=$v";
        insert_string_deep(\%new_hash, $k, $v);
    }

    \%new_hash;
}

# Fails under strict in certain cases:
## Inserting browse.start_rjs.object_list.history.type.way=Vía [[id]]
## Inserting activerecord.models.relation_tag=Etiqueta de la relación
## Inserting browse.changeset_details.has_nodes.one=Tiene el siguiente {{count}} nodo:
## Can't use string ("Tiene {{count}} nodos:") as a HASH ref while "strict refs" in use at script/locale/merge-from-translatewiki line 234.
# Line 234 = my $p = \$h; $p = \$$p->{$_} for split /\./, $ks;
# sub insert_string_deep_X {
#     my ($h, $ks, $v) = @_;
#
#     my $p = \$h; $p = \$$p->{$_} for split /\./, $ks;
#     $$p = $v;
# }

sub insert_string_deep {
    my ($hash, $key, $value) = @_;

    my @key = split /\./, $key;
    my $h = $hash;

    my $i = 0;
    for my $k (@key) {
        $i++;
        if ($i == @key) {
            $h->{$k} = $value;
        } else {
            if (ref $h->{$k}) {
                $h = $h->{$k};
            } else {
                $h = $h->{$k} = {};
            }
        }
    }
}

#
# Get language from Translatewiki
#

sub get_translatewiki_translations {
    my @languages = @_;

    my $cache_file = "/tmp/merge-from-translatewiki.storable";

    if ($cache) {
        if (-f $cache_file) {
            my $c = retrieve($cache_file);
            return %$c;
        }
    }

    my %translatewiki_languages;

    my $all_count = scalar @languages;
    say "Translatewiki has $all_count languages I'm about to get";
    my $count = 0;

    for my $lang (@languages) {
        $count++;
        say STDERR "Getting language $count/$all_count ($lang) from Translatewiki";
        my $yaml = get_language_from_translatewiki($lang);
        my $flat_data = load_and_flatten_yaml($yaml);

        $translatewiki_languages{$lang} = $flat_data;
    }

    if ($cache) {
        store \%translatewiki_languages, $cache_file;
    }

    return %translatewiki_languages;
}

sub get_language_from_translatewiki {
    my ($lang) = @_;
    my $mech = WWW::Mechanize->new;

    $mech->get("http://translatewiki.net/w/i.php?title=Special%3ATranslate&task=export-to-file&group=out-osm-site&language=$lang");

    die "Couldn't get language $lang from Translatewiki" unless $mech->success;

    return $mech->content;
}

#
# Get the language codes and names from the Translatewiki language list
#

sub translatewiki_languages {
    my $mech = WWW::Mechanize->new;

    $mech->get('http://translatewiki.net/wiki/Translating:OpenStreetMap/stats/trunk/site');

    die "Couldn't get translatewiki table" unless $mech->success;

    my $content = $mech->content;

    # Pull the sortable statistics table out of the page; the rest of the
    # HTML is of no interest to us
    my ($sortable) = $content =~ m[(<table class="[^"]*sortable.*?</table>)]s;

    my $parser = HTML::TableParser::Grid->new($sortable);

    # Assumption: the parser exposes the table through num_rows()/row(), and
    # the first two columns are the language code and the language name
    my %languages;
    for my $n (0 .. $parser->num_rows - 1) {
        my ($code, $name) = $parser->row($n);
        $languages{$code} = $name;
    }

    return %languages;
}

sub load_and_flatten_yaml {
    my ($yaml) = @_;

    my $data = Load($yaml);

    # The file has a single root key named after the language (e.g. "de:");
    # drop it so that only the messages themselves are left
    my @keys = keys %$data;
    die "Expected a single root key in the YAML data" unless @keys == 1;
    $data = $data->{$keys[0]};

    # Flatten it
    my $flat_data = { iterate($data) };

    mark_utf8($flat_data);

    $flat_data;
}

#
# Help
#

sub help {
    my %arg = @_;

    Pod::Usage::pod2usage(
        -verbose => $arg{ verbose },
        -exitval => $arg{ exitval } || 0,
    );
}
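# For reference, the merged files written by spit_out() are plain nested YAML
# with the Rails language code as the single root key, roughly like this
# (using one of the example messages quoted in the comments above; exact
# quoting and spacing are up to YAML::Syck):
#
#   es:
#     activerecord:
#       models:
#         relation_tag: Etiqueta de la relación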