#!/usr/bin/perl

use MediaWiki::DumpFile::Pages;

use File::Path  qw.make_path.;
use POSIX       qw.setlocale.;
use Encode      qw.encode_utf8.;
use Digest::MD5 qw.md5_hex.;

# Ask for Portuguese locale rules; "use locale" below makes lc() and the
# [:alpha:] class in the rest of this file follow the current locale.
# setlocale() returns a false value when the requested locale is not
# available, in which case the script carries on under the existing locale,
# so report that instead of failing silently.
setlocale(POSIX::LC_ALL(), "pt")
    or warn "could not set locale \"pt\"; keeping the current locale\n";
use locale;


my $i = 0;    # page counter, incremented once per page by the main loop

# The dump file to read is the first command-line argument.
my $input = shift || die "No file specified.";

# $id prefixes every per-run file ("$id.redirects.lst", "$id.log",
# "$id.xhtml"): the first run of digits in the input name, or the whole
# name when it contains no digits.  It is a package variable because log_()
# reads it.
our $id = ($input =~ /(\d+)/) ? $1 : $input;

my $pages = MediaWiki::DumpFile::Pages->new(
    input     => $input,
    fast_mode => 0,
);

# Tab-separated "redirect path <TAB> target path" list, written by the main
# loop through this bareword handle.  Without the list the run is of little
# use, so stop here if it cannot be created.
open REDIRECTS, ">:utf8", "$id.redirects.lst"
    or die "cannot write $id.redirects.lst: $!";

# Run a command without a shell and return everything it wrote to stdout
# ('' if it wrote nothing), or undef if it could not be started.  The list
# form of a piping open hands each argument to the program verbatim, so page
# titles and file names need no shell quoting and cannot be interpreted as
# shell syntax.
my $capture_stdout = sub {
    my @cmd = @_;
    open(my $pipe, '-|', @cmd) or return;
    local $/;    # slurp mode
    my $output = <$pipe>;
    close $pipe;
    return defined $output ? $output : '';
};

# Walk every page of the dump.  A redirect page becomes a one-line HTML stub
# plus an entry in the redirect list; every other page is rendered to XHTML
# by mw-render and pretty-printed by xmllint into its output file.
while (defined(my $page = $pages->next)) {

    $i++;
    my $title = $page->title;
    my ($dir, $file) = title2path($title);

    make_path($dir);

    if ($page->redirect) {
        print STDERR "+++ $title [redirect]...\n";

        my $text = $page->revision->text;

        # The first [[...]] link is taken as the redirect target.  It is
        # captured into a variable at once: after a failed match $1 would
        # still hold whatever an earlier successful match left behind.
        my ($target) = $text =~ m/\[\[([^]]+)\]\]/;
        if (!defined $target) {
            log_("no redirect target found for $title [$file]");
            next;
        }

        # NOTE(review): $target and $text are written unescaped; a target
        # containing ' ends the href attribute early.  Confirm what the
        # consumers of these stubs expect before changing the format.
        if (open my $out, ">:utf8", "$dir/$file") {
            print {$out} "<a href='http://pt.wikipedia.org/wiki/$target'>$target</a>\n<!-- $text -->";
            close $out
                or log_("failed to write redirect for $title [$file]: $!");
            my ($dir2, $file2) = title2path($target);
            print REDIRECTS "$dir/$file\t$dir2/$file2\n";
        }
        else {
            log_("failed to write redirect for $title [$file]: $!");
        }
    }
    else {
        print STDERR "*** $title... [$file]\n";

        my $xhtml = "$id.xhtml";    # scratch file, removed after each page

        # mw-render's own stdout is not wanted; it is captured and dropped.
        $capture_stdout->('mw-render', '-c', 'cdb/wikiconf.txt', '-x',
                          '--writer=xhtml', "--output=$xhtml", $title);
        if (!-f $xhtml) {
            log_("mw-render failed for $title [$file]");
            next;    # nothing to format, so do not leave an empty output file
        }

        my $formatted = $capture_stdout->('xmllint', '--format', $xhtml);
        if (!defined $formatted) {
            log_("xmllint could not be run for $title [$file]: $!");
        }
        elsif (open my $out, '>', "$dir/$file") {
            # No I/O layer on either side: xmllint's bytes are copied as-is.
            print {$out} $formatted;
            close $out
                or log_("failed to write $title [$file]: $!");
        }
        else {
            log_("failed to write $title [$file]: $!");
        }

        unlink $xhtml;
    }
}

# Buffered write errors only surface when the handle is closed.
close REDIRECTS
    or warn "closing the redirect list failed: $!\n";

# Translate a page title into the (directory, file name) pair used for its
# output file.
#
# Directory: "OUT/x/y/z", where x, y and z are the first three characters of
# the lower-cased title after every character matching neither [:alpha:] nor
# 0-9 has been replaced by "_".  my_pos() supplies "_" for positions past the
# end of a short title.
#
# File name: the title with spaces and the characters : ! ? ' " ( ) [ ] { }
# \ / & replaced by "_", cut to at most 240 characters, followed by "." plus
# the first six hex digits of the MD5 of the UTF-8 encoded sanitised name
# (taken before it is cut) and ".xml".
#
# Returns the list ($dir, $file).
sub title2path {
    my ($raw) = @_;
    chomp $raw;

    # Directory part: three single-character levels.
    my $key = lc $raw;
    $key =~ s/[^[:alpha:]0-9]/_/g;
    my $dir = sprintf('OUT/%s/%s/%s', map { my_pos($key, $_) } 0 .. 2);

    # File part: sanitise, fingerprint the full name, then shorten.
    my $name = $raw;
    $name =~ s/[ :!?'"\(\\\/\)\[\]\{\}\&]/_/g;

    my $digest = substr(md5_hex(encode_utf8($name)), 0, 6);

    if (length($name) >= 240) {
        $name = substr($name, 0, 240);
    }

    return ($dir, sprintf("%s.%s.xml", $name, $digest));
}

# Return the single character of $str at zero-based offset $pos, or '_'
# when $str is not longer than $pos and so has no character there.
sub my_pos {
   my $str = shift;
   my $pos = shift;

   return '_' unless length($str) > $pos;
   return substr($str, $pos, 1);
}

# Append one line to the run's log file, "$id.log" ($id is the file-level
# package variable).
#
#   $msg - text to record; a newline is appended.
#
# Dies if the log file cannot be opened; a failed close is only warned
# about.  Returns nothing useful.
sub log_ {
  my ($msg) = @_;
  my $logfile = "$id.log";

  # Three-argument open with a lexical handle: the file name can no longer
  # be parsed as part of the mode, and the handle is not a shared global.
  # The :utf8 layer matches the script's other output handles, so titles
  # with non-ASCII characters are logged as UTF-8 like everywhere else.
  open my $log, '>>:utf8', $logfile or die "$!";
  print {$log} $msg, "\n";
  close $log or warn "closing $logfile failed: $!\n";
  return;
}
