#!/usr/bin/perl ######################################################### # Script developed by Eckhard Bick, eckhard.bick@mail.dk # # Modified by Rui Vilela, ruivilela@di.uminho.pt # # Further fixes, cleanup by Ben Wing, ben@666.com, 10/05 # ######################################################### use strict; use Getopt::Long; my $validate = 1; # whether to do validation checks on the source file &GetOptions ('validate!' => \$validate); # -validate or -novalidate my $predepth =0; # depth of = signs at previous line my $depth =0; # depth of = signs at this line my $prevline_word = 0; # whether the previous line had a lexical item my $prevline = ""; # previous line, for error reporting my $sentek =0; # whether we are in the middle of a sentence my $sent =1; # number of current sentence my $lema=""; my $proc=""; my $morf=""; my @procs=(); my $dia =(gmtime)[3]; my $mes =((gmtime)[4])+1; my $ano =1900+(gmtime)[5]; my $descr; if (defined ($ARGV[0]) && $ARGV[0]=~/Floresta_(\d\.\d)/) { $descr="Floresta"; } elsif (defined ($ARGV[0]) && $ARGV[0]=~/(C[PF])_(\d\.\d)/) { my $versao=$2; $descr=$1; $descr=($descr=~/CF/ ? "Bosque : CETEMPúblico" : "Bosque : CETENFolha"); $descr.=" ".$versao; } else { $descr = "Unknown"; } print "######################################################\n"; print '#Floresta Sintá(c)tica'."\n"; print '#Linguateca, pólo de Braga - http://linguateca.di.uminho.pt'."\n"; print "#Data: $dia/$mes/$ano\n"; print "#$descr $Importado do formato árvores deitadas$\n"; print '#Contactos: Rui Vilela '."\n"; print "######################################################\n"; while (<>) { chomp; next if (/^\*/); s#\\$#{#g; s#\\$#}#g; s#(.+)#$1\+#; s/[ \t]+$//; s/^<.*> *$//; s/^(=*) *$/$1\{/; s/^(=*) *$/$1\}/; s/^(=*):/$1\;/; if (s/^(C[FP]\d+\-\d+\s*.*|AM-\d+|(LIT|FAL|CIE)-\S+)/\n\#$sent $1/) {print ; $sent++} #comments elsif (s/^SOURCE.*//g) { if ($sentek) {print "\n"; $sentek=0;} # sentence end print "\n"; } elsif (s/&&//g) { if ($sentek) { print "\)" x ($depth+1); print "\n"; $sentek=0; } } elsif (s/^(A[0-9]+\/[0-9]+|A[123]) *$/ \#$1/g) { print "\n"; #print "\n$_\n"; $sentek =1; $predepth=0; # Set depth to -1 originally to catch errors with a missing top-level # node; we correct this to 0 before using it. $depth = -1; $prevline_word = 0; $prevline = ""; } elsif ($sentek) { $predepth = $depth; # count depth in = signs $depth = /^=*/ && length ($&); # warn "+++Excessive indentation increase, line $.: $_\n" # if $validate && $depth - $predepth > 1; # warn "+++Branch incorrectly inserted within lexical item, line $.: $prevline\n" # if $validate && $prevline_word && $depth > $predepth; $predepth = 0 if $predepth < 0; # if ($depth) {print "---depth $depth\n";} # The strange logic here is necessary only for the old VISL format. # Once we are sure we have discarded this format, we can simplify # this to if ($depth == $predepth). if ($depth == $predepth && $sentek == 3 && $predepth > 0) {print "\)";} # cave double empty lines between trees elsif ($depth < $predepth) { my $closure = $predepth - $depth + 1; # how many right parentheses print "\)" x $closure; } $prevline = $_; $_=~s#NER:#NER_#g; ($lema)=($_=~/\'(.+?)\'/); $lema=~s#:#DOISPONTOS#; @procs=($_=~/<([\w\-]+?)>/g); $_=~s#<[\w-]+?>##g; ($morf)=($_=~/\' +(.+?)\)/); $morf=~s# #_#g; $proc=":".(join ":", grep ! /sam-/, grep !/-sam/, grep !/poss/, sort @procs).":"; s/^=*//; # erase indentation s/[\t ]+/ /g; # collapse tabs and other spacing # FIXME! Handle hyphens at EOL and directly after ===; this happens in # the CP and the hyphens should be removed. $prevline_word = s/(:[^ $]+) *\(.*?$( *(\S+)?)/$1:$lema:$morf:$proc$2/ && $3; # no morph etc.? space* is necessary because Kim e.g. has space-isolated some morphs s/\#.*//; # no comments (has to come after morph, Kim has used #K INSIDE morf! s/$/{/g; # parens (e.g. in phone numbers) cannot remain s/$/}/g; # s/:/+/g; if ($sentek ==1) {print "$$_"; $sentek =2;} elsif (! /^ *$/) {print " \($_"; $sentek =3;} if (/^$/) {$sentek=0;} } } if ($depth >0) {print "$\n";} # only if corpus doesn't end in double newline