#!/usr/bin/env perl
use feature ':5.10';
use strict;
use warnings;
use YAML::Syck qw(Dump LoadFile);
use Test::Differences;
use Pod::Usage ();
use Getopt::Long ();

=head1 NAME

locale-diff - Compare two YAML files and print how their datastructures differ

=head1 SYNOPSIS

    # --keys is the default
    diff en.yml is.yml
    diff --keys en.yml is.yml

    # --untranslated-values compares prints keys whose values don't differ
    diff --untranslated-values en.yml is.yml

    # --untranslated-values-all compares prints keys whose values
    # don't differ. Ignoring the blacklist which prunes things
    # unlikley to be translated
    diff --untranslated-values-all en.yml is.yml

    # Check that interpolated variables ({{var}} and [[var]]) are the same
    diff --validate-variables en.yml is.yml

=head1 DESCRIPTION

This utility prints the differences between two YAML files using
L<Test::Differences>. The purpose of it is to diff the files is
F<config/locales> to find out what keys need to be added to the
translated files when F<en.yml> changes.

=head1 OPTIONS

=over

=item -h, --help

Print this help message.

=item --keys

Show the hash keys that differ between the two files, useful merging
new entries from F<en.yml> to a local file.

=item --untranslated-values

Show keys that B<exist in both the compared files> and whose values
are exactly the same. Use C<--keys> to a list of values that hasn't
been merged.

The values are pruned according to global and language specific
blacklists found in the C<__DATA__> section of this script.

This helps to find untranslated values.

=item --untranslated-values-all

Like C<--untranslated-values> but ignores blacklists.

=item --validate-variables

Check that interpolated Ruby i18n variables (C<{{foo}}> and
C<[[foo]]>) are equivalent in the two provided files.

=item --dump-flat

Dump a flat version of the translation hash in YAML format,
i.e. "foo.bar" instead of "{foo}->{bar}".

=back

=head1 AUTHOR

E<AElig>var ArnfjE<ouml>rE<eth> Bjarmason <avarab@gmail.com>

=cut

# Get the command-line options
Getopt::Long::Parser->new(
    config => [ qw< bundling no_ignore_case no_require_order pass_through > ],
)->getoptions(
    'h|help' => \my $help,
    'keys' => \my $keys,
    'dump-flat' => \my $dump_flat,
    'untranslated-values' => \my $untranslated_values,
    'untranslated-values-all' => \my $untranslated_values_all,
    'validate-variables' => \my $validate_variables,
    'reconstruct' => \my $reconstruct,
) or help();

# --keys is the default
$keys = 1 if not $untranslated_values_all and not $untranslated_values and not $validate_variables and not $dump_flat;

# On --help
help() if $help;

# If we're not given two .yml files
help() if (@ARGV != 2 or (!-f $ARGV[0] or !-f $ARGV[1])) and not $dump_flat || $reconstruct;

my ($from, $to) = @ARGV;

my $from_data = LoadFile($from);
my $from_parsed = { iterate($from_data->{basename($from)}) };

if ($dump_flat)
{
    mark_utf8($from_parsed);

    print Dump $from_parsed;

    exit 0;
}

if ($reconstruct) {
    mark_utf8($from_parsed);

    my %out;
    while (my ($k, $v) = each %$from_parsed) {
        insert_string_deep(\%out, $k, $v);
    }

    print Dump { basename($from) => \%out };

    exit 0;
}

my $to_data   = LoadFile($to);
my $to_parsed = { iterate($to_data->{basename($to)}) };

if ($keys)
{
    print_key_differences($from_parsed, $to_parsed);
}
elsif ($untranslated_values or $untranslated_values_all)
{
    my @untranslated = untranslated_keys($from_parsed, $to_parsed);

    # Prune according to blacklist
    if ($untranslated_values) {
        @untranslated = prune_untranslated_with_blacklist(basename($to), @untranslated);
    }

    say for @untranslated;
} elsif ($validate_variables)
{
    print_validate_variables($from_parsed, $to_parsed);
}

exit 0;

sub print_key_differences
{
    my ($f, $t) = @_;

    # Hack around Test::Differences wanting a Test::* module loaded
    $INC{"Test.pm"} = 1;
    sub Test::ok { print shift }

    # Diff the tree
    eq_or_diff([ sort keys %$f ], [ sort keys %$t ]);
}

sub untranslated_keys
{
    my ($from_parsed, $to_parsed) = @_;
    sort grep { exists $to_parsed->{$_} and $from_parsed->{$_} eq $to_parsed->{$_} } keys %$from_parsed;
}

sub prune_untranslated_with_blacklist
{
    my ($language, @keys) = @_;
    my %keys;
    @keys{@keys} = ();

    my $end_yaml = LoadFile(*DATA);
    my $untranslated_values = $end_yaml->{untranslated_values};
    my $default = $untranslated_values->{default};
    my $this_language = $untranslated_values->{$language} || {};

    my %bw_list = (%$default, %$this_language);
    
    while (my ($key, $blacklisted) = each %bw_list)
    {
        # FIXME: Does syck actually support true/false booleans in yaml?
        delete $keys{$key} if $blacklisted eq 'true'
    }

    sort keys %keys;
}

sub print_validate_variables
{
    my ($f, $t) = @_;

    while (my ($key, $val) = each %$f)
    {
        next if exists $f->{$key} and not exists $t->{$key};

        my @from_var = parse_variables_from_string($f->{$key});
        my @to_var   = parse_variables_from_string($t->{$key});

        unless (@from_var ~~ @to_var) {
            say "$key in $from has (@from_var) and $to has (@to_var)";
        }

    }
}

sub parse_variables_from_string
{
    my ($string) = @_;

    # This probably matches most of the variables
    my $var = qr/ [a-z0-9_]+? /xs;

    if (my @var = $string =~ m/ \{\{ ($var) \}\} | \[\[ ($var) \]\] /gsx) {
        return sort grep { defined } @var;
    } else {
        return;
    }
}

sub iterate
{
    my ($hash, @path) = @_;
    my @ret;
        
    while (my ($k, $v) = each %$hash)
    {
        if (ref $v eq 'HASH')
        {
             push @ret => iterate($v, @path, $k);
        }
        else
        {
            push @ret => join(".",@path, $k), $v;
        }
    }

    return @ret;
}

# $s = 'foo.bar.baz.spam.eggs.ham'; $h = \%h; $h = $h->{$_} = {} for split /\./, $s; \%h
# ==> {foo => {bar => {baz => {spam => {eggs => {ham => {}}}}}}}
sub insert_string_deep {
    my ($h, $ks, $v) = @_;
    my $p = \$h; $p = \$$p->{$_} for split /\./, $ks;
    $$p = $v;
}

# sub insert_string_deep
# {
#     my ($hash, $key, $value) = @_;
#
#     my @key = split /\./, $key;
#     my $h = $hash;
#
#     my $i = 0;
#     for my $k (@key) {
#         $i ++;
#         if ($i == @key) {
#             $h->{$k} = $value;
#         } else {
#             if (ref $h->{$k}) {
#                 $h = $h->{$k};
#             } else {
#                 $h = $h->{$k} = {};
#             }
#         }
#     }
# }

sub basename
{
    my $name = shift;
    $name =~ s[\..*?$][];
    $name =~ s[.*/][];
    $name;
}

sub mark_utf8
{
    my ($hash) = @_;

    # Mark as UTF-8
    map { if (ref $_ eq 'ARRAY') { map { utf8::decode($_) } @$_ } else {  utf8::decode($_) } } values %$hash;
}

sub help
{
    my %arg = @_;

    Pod::Usage::pod2usage(
        -verbose => $arg{ verbose },
        -exitval => $arg{ exitval } || 0,
    );
}

__DATA__
untranslated_values:

  # Default/Per language blacklist/whitelist for the
  # --untranslated-values switch. "true" as a value indicates that the
  # key is to be blacklisted, and "false" that it's to be
  # whitelisted. "false" is only required to whitelist a key
  # blacklisted by default on a per-language basis.

  default:
    html.dir: true
    layouts.intro_3_bytemark: true
    layouts.intro_3_ucl: true
    layouts.project_name.h1: true
    layouts.project_name.title: true
    site.index.license.project_url: true
    browse.relation_member.entry: true

    # #{{id}}
    changeset.changeset.id: true

  de:
    activerecord.attributes.message.sender: true
    activerecord.attributes.trace.name: true
    activerecord.models.changeset: true
    activerecord.models.relation: true
    browse.changeset.changeset: true
    browse.changeset.changesetxml: true
    browse.changeset.osmchangexml: true
    browse.changeset.title: true
    browse.common_details.version: true
    browse.containing_relation.relation: true
    browse.relation.relation: true
    browse.relation.relation_title: true
    browse.start_rjs.details: true
    browse.start_rjs.object_list.details: true
    browse.tag_details.tags: true
    changeset.changesets.id: true
    export.start.export_button: true
    export.start.format: true
    export.start.output: true
    export.start.zoom: true
    export.start_rjs.export: true
    layouts.export: true
    layouts.shop: true
    site.edit.anon_edits: true
    site.index.license.license_name: true
    site.index.permalink: true
    site.key.table.entry.park: true
    site.search.submit_text: true
    trace.edit.tags: true
    trace.trace.in: true
    trace.trace_form.tags: true
    trace.trace_optionals.tags: true
    trace.view.tags: true
    user.account.public editing.enabled link: true

  is:
    # ({{link}})
    site.edit.anon_edits: true

    # Creative Commons Attribution-Share Alike 2.0
    site.index.license.license_name: true

    # http://creativecommons.org/licenses/by-sa/2.0/
    site.index.license.license_url: true

    # {{id}}
    printable_name.with_id: true
    
    # {{name}} ({{id}})
    printable_name.with_name: true

    # {{type}} 
    geocoder.search_osm_namefinder.prefix: true

    # {{suffix}}, {{parentname}}
    geocoder.search_osm_namefinder.suffix_suburb: true