#!/usr/bin/perl
$sent=0; # sentence-id

#################################################################################
#Florest conversion to tigerXML format
#
#Originly written by Eckhard Bick, eckhard.bick@mail.dk, http://beta.visl.sdu.dk
#
#Modified by Rui Vilela, ruivilela@di.uminho.pt, http://linguateca.di.uminho.pt
#################################################################################

sub to_html{
    $_=$_[0];
    s/^\s+//;s/\s+$//;
    s/&/&amp;/g;s/</&lt;/;s/>/&gt;/;
    return $_;
}

sub tags_errors{
    $_=$_[0];
    s/^\s+//;s/\s+$//;
    s/\\\'/\'/g;
    s/\\\(/\(/g;
    s/\\\)/\)/g;
    s/[><]//g;s/&/&amp;/g;s/\"/&quot;/g;
    return $_;
}

sub calc_extra{
    $_=$_[0];
    if ( /\([^<>]*<([^<>]+)>(.*)\)/ ){$a=$1;$v=$2;
        if ( $v=~/[^<>]*<([^<>]+)>(.*)/){$b=$1;$v=$2;
            if ( $v=~/[^<>]*<([^<>]+)>.*/ ){$c=$1;
                return "extra=\"".$a."\" extra2=\"".$b."\" extra3=\"".$c."\"";
            }else{return "extra=\"".$a."\" extra2=\"".$b."\" extra3=\"--\"";}
        }else{return "extra=\"".$a."\" extra2=\"--\" extra3=\"--\"";}
    }else{return "extra=\"--\" extra2=\"--\" extra3=\"--\"";}
}
    

print "<\?xml version=\"1.0\" encoding=\"ISO-8859-1\"\?>\n";
print "<corpus id=\"FlorestaSintactica\">\n";
print "<head>\nQWERTYUIOP\n</head>\n"; ###String para separação
print "<body>\n";

while (<>) {
    next if (/^(\s)?<.*>(\s)*$/);
    s/^&&$/ /;
    if (/^SOURCE: (.*)$/) {  #Fonte
	$source =$1; $sent++; $sentslut=0; $source =~ s/\"//g;
    }
    elsif (/^(C[PF]\d+\-\d+)\s+(.*)$/) {   #Identificação
	$sentref=$1; $text=$2;
	$text=~s/\"/&quot;/g;
	$text=~s/>/&gt;/g;
	$text=~s/</&lt;/g;
	$text=~s/&/&amp;/g;
    }
    elsif (/^A([1-9](\/[1-9])?)\s*$/)  {   #Número de Ambiguidade global
	$forest=$1;
	if ($forest>1){
	    $sent++;$sentslut=0; #$source é igual ; ambiguidade global alternativa da árvore
	}
	print '<s id="s' . $sent . '" ref="' . $sentref . '" source="' . $source . '" forest="' . $forest . '" text="' . $text . '">' . "\n";
	print '<graph root="s' . $sent . '_500">' . "\n";
	print '<terminals>' . "\n"; $t=0; $nt=0; $terminals++;
	#if ($terminals > $terminalsend +1) {$difference =$terminals-$terminalsend; print "\n!!! $difference\n";}
    }
    elsif (/^\s*$/ && $t > 0 && $sentslut==0) {  #linha em branco, imprime código XML ou ignora a linha
	$sentslut=1; # don't print twice if more than one empty line, eller "rubbish" in between sentence and next SOURCE
	&printxml;
    }
    else {
	m/[^=]/g; $depth = pos;
	if ($t==0 && $nt==0) { # top node
	    $depth =1;  #alterar para 0 para suporte de ad.visl
	    $mother=0;
	}else{
	    $mother=$spawning{$depth-1};
	}
	if ( (/\)\t([^ \n]+)/) && ($_!~/\(<\w+>\)/) ){ # Extrair palavra , pode conter etiqueta secundária,(CJT:? (<predicate>))
	    $t++;
	    $word=$1;
	    m/^=*([A-Za-z\-<>\?\&\/]+(?:\[[\+-]\d\])?):([^\(\t]+)\(\'([^ \t]+)\'[ \)]/;  #[-+n] e / Ambiguidade local e de função
	    $func=$1; $pos=$2; $lemma=$3;
	    $func=~s/\/[A-Z<>]+//; #Ignorar ambiguidade local
	    $func=~s/\[[\+-]\d\]//; #Ignorar ambiguidade de função
	    if ($func =~ s/^-//) {$disright=1; $disright{$depth}=1;} #Descontinuidade em nós terminais
	    if ($pos =~ s/-$//) {$disleft=1; $left{$func}=$nt+1; $secmother=0;} #descontinuidades #?

	    #if (! $pos || ! $lemma) {print STDERR "$1\t$2\t$3\n";}
	    if (! $pos)  {$pos ="xx";} 
	    if (! $lemma)  {$lemma ="xx";}
	    if (/\'( +<[^>]+>)* +([A-Za-z&][^\)\t]+)\)/) {$morph=$2;}
	    else {$morph ="--";}
	    ###############  Multipla informação para o atributo extra
	    $extra=&calc_extra($_);
	    print '<t id="s' . $sent . '_' . $t . '" word="' . tags_errors($word) . '"';
	    print ' lemma="' . tags_errors($lemma) . '" pos="' . $pos . '"';
	    print ' morph="' . $morph . '" ' . $extra . '/>' . "\n";
	    $size{$mother}++;
	    $id ="s$sent\_$t";
	    $daughter{$mother,$size{$mother}} =$id;
	    $func{$id} =$func;
	    if ($disright{$depth-1} && $secmother) { # secondary edges til far left mother
		$size{$secmother}++;
		$daughter{$secmother,$size{$secmother}} ="*$id";    
	    }
	}
	elsif ( (/^=*([^A-Z0-9=\n\&]+)\s*$/) && !(/\?\:/) ) { #punctuation, but not ?:
	        $t++ ;$ponto=$1;
		$size{$mother}++;
		$id="s$sent\_$t";
		$func{$id} ="--";
		$daughter{$mother,$size{$mother}} =$id;
		print '<t id="s' . $sent . '_' . $t . '" word="' . tags_errors($ponto) . '" lemma="--" pos="pu" morph="--" extra="--" extra2="--" extra3="--"/>' . "\n";
	}

	elsif (/^(=*)([A-Za-z\-<>\?\&\/]+(?:\[[\+-]\d\])?):([a-zA-Z\-\?\&]+)\s?(\(.*\))?[ \t]*$/) {      #Nós terminais
	    $func =$2; $form =$3; #$more =$4; 
	    $func=~s/\/[A-Z<>]+//; #Ignorar ambiguidade local
	    $func=~s/\[[\+-]\d\]//; #Ignorar ambiguidade de função
	    if ($form =~ s/-$//) {$disleft=1; $left{$func}=$nt+1; $secmother=0;} #descontinuidades
	    if ($func =~ s/^-//) {$disright=1; $disright{$depth}=1;}
	    else {$disright=0;}
	    if ($disright && $mother < $left{$func}) { # if there is no higher level par node in between halves, i.e. not in the case of coordination of -P:vp
		$disright{$depth}=1;
		$spawning{$depth} =$left{$func}; # evt. secondary mother
	    }
	    else {
		$nt500 =$mother + 500; $motherid ="s$sent\_$nt500";
		if ($disright && $left{$func} && $mother > $left{$func} && ($func{$motherid} =~ /CJT/)) {
		    $secmother =$left{$func};
		}
		else {$disright{$depth}=0;}
		$nt++;
		$form{$nt} =$form;
		#$depth{$nt} =$depth; ??
		$spawning{$depth} =$nt;
		$size{$mother}++;
		$nt500 =$nt + 500;
		$id ="s$sent\_$nt500";
		$daughter{$mother,$size{$mother}} =$id;
		$func{$id} =$func;
		#if ($more) {$more{$id} =$more;print STDERR "$more\n"}
		#else {$more{$id} = "--";}
		if ($disright{$depth-1} && $secmother) { # secondary edges til far left mother
		    $size{$secmother}++;
		    $daughter{$secmother,$size{$secmother}} ="*$id";    
		}
	    }
	}
    }


}
if ($t >0 || $nt>0) {&printxml;} # last sentence possibly without empty line after it

print "<\/body>\n";
print "<\/corpus>\n";

sub printxml {
    print "<\/terminals>\n"; $terminalsend++;
    print "<nonterminals>\n";
    print '<nt id="s' . $sent . '_500" cat="s">';
    if ($nt) {$id="s$sent\_501";}
    else {$id="s$sent\_1";}
    print '<edge label="' . to_html($func{$id}) . '" idref="' . $id . '"/>';
    print '</nt>';
    for ($i =1; $i <=$nt; $i++) {
	$nt500 =$i + 500;
	$id ="s$sent\_$nt500";
	print '<nt id="' . $id . '" cat="' . $form{$i} . '">'; 
	for ($j =1; $j <= $size{$i}; $j++) {
	    $daughter =$daughter{$i,$j} ;
	    if ($daughter =~ s/^\*//) {
		print '<secedge label="*" idref="' . $daughter . '"/>';
	    }
	    else {
		print '<edge label="' . to_html($func{$daughter}) . '" idref="' . $daughter . '"/>';
	    }
	}
	print '</nt>';	
	$size{$i} =0; # nonterminal node sizes reset for next sentence    
    }
    print '</nonterminals>';
    print '</graph>';
    print '</s>'."\n\n";
    $nt =0; $t =0; $sentslut =1;
    for ($k =0; $k <= 20; $k++) {
	$spawning{$k} =0; # reset depth dependent spawning nodes
    }
}

