#!/usr/bin/perl

use File::Find;
use URI::Encode 'uri_decode';
use File::Slurp 'edit_file_lines';
use Digest::MD5 'md5_hex';
use Encode      'encode_utf8';


# Switch the process to the Portuguese locale. setlocale() and LC_ALL live in
# POSIX and must be imported explicitly; without the import the call dies with
# "Undefined subroutine &main::setlocale" before any file is processed.
use POSIX qw(setlocale LC_ALL);
defined setlocale(LC_ALL, "pt")
    or warn "could not set locale 'pt'; continuing with the default locale\n";

# From here to the end of the file, locale-sensitive operations (lc and the
# [:alpha:] character class used in title2path) follow the active locale.
use locale;


# Root directory to scan: first command-line argument, or "pt" when absent.
my $path = shift || "pt";

# Counter incremented by process(); it paces the progress dots on STDERR.
our $i = 0;

print STDERR "processing [$path].";
# Walk the tree, following symbolic links, and hand every entry to process().
find { wanted => \&process, follow => 1} => $path;
print STDERR "done\n";

# File::Find "wanted" callback: rewrites the Wikipedia links inside one file.
#
# Reads the current entry's name from $_ (as set by File::Find), skips
# everything that is not a regular file named *.xml, bumps the global
# counter $i, prints a progress dot on STDERR every 1000 files, and edits
# the file in place, line by line, through edit_line().
#
# Returns nothing; File::Find ignores the callback's return value.
sub process {
    return unless /\.xml$/;

    # The walk follows symlinks and visits directories too; a directory (or a
    # link to one) whose name ends in ".xml" cannot be edited as a file, so
    # skip anything that is not a regular file instead of aborting the run.
    return unless -f $_;

    $i++;
    print STDERR "." unless $i % 1000;

    my $file = $_;
    edit_file_lines \&edit_line => $file;

    return;
}

# Per-line callback for edit_file_lines: rewrites, in place in $_, every
# quoted absolute link to pt.wikipedia.org into a quoted relative path
# computed by path().
#
# The opening quote may be ' or " (the same character must close the link),
# but the replacement is always wrapped in double quotes.
#
# Returns the result of the last substitution attempted.
sub edit_line {
    # Order matters: each pattern only sees what the previous ones left behind.
    my @link_patterns = (
        # .../anything?title=TITLE   (query-string form)
        qr{(['"])http://pt\.wikipedia\.org/[^=]+title=(.+?)\1},
        # .../wiki/TITLE
        qr{(['"])http://pt\.wikipedia\.org/wiki/(.+?)\1},
        # .../TITLE                  (no further slash in TITLE)
        qr{(['"])http://pt\.wikipedia\.org/([^/]+?)\1},
    );

    my $replaced;
    for my $link_re (@link_patterns) {
        $replaced = s{$link_re}{'"'.path($2).'"'}ge;
    }

    return $replaced;
}

# Converts a URI-encoded page title, as captured from a link, into the
# relative path "DIR/FILE" built from the two values title2path() returns.
sub path {
    my ($encoded_title) = @_;

    my $decoded_title = uri_decode($encoded_title);
    my @parts         = title2path($decoded_title);

    return "$parts[0]/$parts[1]";
}


# Maps a page title to the (directory, file name) pair used for its XML file.
#
# Directory: '../../../A/B/C', where A, B and C are the first three characters
# of the lower-cased title after every character that is neither alphabetic
# nor a digit has been replaced by '_'; my_pos() supplies '_' for positions
# past the end of a short title.
#
# File name: the title with spaces and a fixed set of punctuation characters
# replaced by '_', cut to at most 240 characters, followed by the first six
# hex digits of the MD5 of the (uncut) sanitized name and the ".xml" suffix.
#
# Returns the list ($dir, $file).
sub title2path {
    my ($title) = @_;

    chomp $title;

    # Sanitized, lower-cased copy used only to pick the directory levels.
    (my $dir_key = lc $title) =~ s/[^[:alpha:]0-9]/_/g;

    # Sanitized copy that becomes the file's base name.
    (my $base = $title) =~ s/[ :!?'"\(\\\/\)\[\]\{\}\&]/_/g;

    my $dir = sprintf('../../../%s/%s/%s',
                      map { my_pos($dir_key, $_) } 0 .. 2);

    # The digest is taken before truncation, so long names that share their
    # first 240 characters still receive different suffixes.
    my $digest = substr(md5_hex(encode_utf8($base)), 0, 6);

    # A no-op for names shorter than 240 characters.
    $base = substr($base, 0, 240);

    return ($dir, sprintf("%s.%s.xml", $base, $digest));
}


# Returns the single character of $str at zero-based index $pos, or '_' when
# the string is too short to have a character at that index.
sub my_pos {
   my ($str, $pos) = @_;

   return length($str) > $pos ? substr($str, $pos, 1) : '_';
}
