#!/usr/bin/perl 

#########################################################
# Script developed by Eckhard Bick, eckhard.bick@mail.dk
#
# Modified by Rui Vilela, ruivilela@di.uminho.pt
#
# Further fixes, cleanup by Ben Wing, ben@666.com, 10/05
#
#########################################################

use strict;

use Getopt::Long;

my $validate = 1; # whether to do validation checks on the source file

&GetOptions ('validate!' => \$validate); # -validate or -novalidate

my $predepth =0; # depth of = signs at previous line
my $depth =0; # depth of = signs at this line
my $prevline_word = 0; # whether the previous line had a lexical item
my $prevline = ""; # previous line, for error reporting
my $sentek =0; # whether we are in the middle of a sentence
my $sent =1; # number of current sentence
my $lema="";
my $proc="";
my $morf="";
my @procs=();

my $dia =(gmtime)[3];
my $mes =((gmtime)[4])+1;
my $ano =1900+(gmtime)[5];
my $descr;

if (defined ($ARGV[0]) && $ARGV[0]=~/Floresta_(\d\.\d)/) {
    $descr="Floresta";
} elsif (defined ($ARGV[0]) && $ARGV[0]=~/(C[PF])_(\d\.\d)/) {
    my $versao=$2;
    $descr=$1;
    $descr=($descr=~/CF/ ? "Bosque : CETEMPúblico" : "Bosque : CETENFolha");
    $descr.=" ".$versao;
} else {
    $descr = "Unknown";
}

print "######################################################\n";
print '#Floresta Sintá(c)tica'."\n";
print '#Linguateca, pólo de Braga - http://linguateca.di.uminho.pt'."\n";
print "#Data: $dia/$mes/$ano\n";
print "#$descr \(Importado do formato árvores deitadas\)\n";
print '#Contactos: Rui Vilela <ruivilela@di.uminho.pt>'."\n";
print "######################################################\n";

while (<>) {
    chomp;
    next if (/^\*/);
	s#\\\(#{#g;
	s#\\\)#}#g;
	s#<sam->(.+)#<sam->$1\+#;
    s/[ \t]+$//;
    s/^<.*> *$//;
    s/^(=*) *\(/$1\{/;
    s/^(=*) *\)/$1\}/;
    s/^(=*):/$1\;/;
    if (s/^(C[FP]\d+\-\d+\s*.*|AM-\d+|(LIT|FAL|CIE)-\S+)/\n\#$sent $1/) {print ; $sent++} #comments
    elsif (s/^SOURCE.*//g) {
	if ($sentek) {print "\n"; $sentek=0;} # sentence end
	print "\n";
    }
    elsif (s/&&//g) {
	if ($sentek) {
	    print "\)" x ($depth+1);
	    print "\n";
	    $sentek=0;
	}
    }
    elsif (s/^(A[0-9]+\/[0-9]+|A[123]) *$/ \#$1/g) {
        print "\n"; #print "\n$_\n";
	$sentek =1;
	$predepth=0;
	# Set depth to -1 originally to catch errors with a missing top-level
	# node; we correct this to 0 before using it.
	$depth = -1;
	$prevline_word = 0;
	$prevline = "";
    }
    elsif ($sentek) {
	$predepth = $depth;
	# count depth in = signs
	$depth = /^=*/ && length ($&);
#	warn "+++Excessive indentation increase, line $.: $_\n"
#	  if $validate && $depth - $predepth > 1;
#	warn "+++Branch incorrectly inserted within lexical item, line $.: $prevline\n"
#	  if $validate && $prevline_word && $depth > $predepth;
	$predepth = 0 if $predepth < 0;
	# if ($depth) {print "---depth $depth\n";}
	# The strange logic here is necessary only for the old VISL format.
	# Once we are sure we have discarded this format, we can simplify
	# this to if ($depth == $predepth).
	if ($depth == $predepth && $sentek == 3 && $predepth > 0) {print "\)";} # cave double empty lines between trees
	elsif ($depth < $predepth) {
	    my $closure = $predepth - $depth + 1; # how many right parentheses
	    print "\)" x $closure;
	}
	
	$prevline = $_;
	$_=~s#NER:#NER_#g;
	($lema)=($_=~/\'(.+?)\'/);
	$lema=~s#:#DOISPONTOS#;
	@procs=($_=~/<([\w\-]+?)>/g);
	$_=~s#<[\w-]+?>##g;
	($morf)=($_=~/\' +(.+?)\)/);
	$morf=~s# #_#g;
	$proc=":".(join ":", grep ! /sam-/, grep !/-sam/, grep !/poss/, sort @procs).":";
	s/^=*//; # erase indentation
	s/[\t ]+/ /g; # collapse tabs and other spacing
	# FIXME! Handle hyphens at EOL and directly after ===; this happens in
	# the CP and the hyphens should be removed.
	$prevline_word = s/(:[^ \(]+) *\(.*?\)( *(\S+)?)/$1:$lema:$morf:$proc$2/ && $3; # no morph etc.? space* is necessary because Kim e.g. has space-isolated some morphs
	s/\#.*//; # no comments (has to come after morph, Kim has used #K INSIDE morf!
	s/\(/{/g; # parens (e.g. in phone numbers) cannot remain
	s/\)/}/g;
#	s/:/+/g;
	if ($sentek ==1) {print "\($_"; $sentek =2;}
	elsif (! /^ *$/) {print " \($_"; $sentek =3;}
	if (/^$/) {$sentek=0;}
    }
}
if ($depth >0) {print "\)\n";} # only if corpus doesn't end in double newline
