#!/usr/bin/perl -w

#qrna2col.pl

use strict;
use vars qw ($opt_c $opt_d $opt_f $opt_g $opt_i $opt_l $opt_q $opt_r $opt_s $opt_u);  # required if strict used
use Getopt::Std;
use constant GNUPLOT => '/usr/bin/gnuplot';

getopts ('i:c:d:fg:l:q:rs:u:');     # ('aci:p:o:') means 'ac' are flags, 'i:p:o:' gets following scalar.


# Print a helpful message if the user provides no input file.
if (!@ARGV) { 
        print "usage:  qrna2col.pl [options] file.qrna\n\n";
	print "options:\n";
        print "-c <case>          :  cases (default is case = 1)\n";
        print "                        possible cases are:\n";
        print "                        0=GLOBAL\n";  
        print "                        1=LOCAL_DIAG_VITERBI 2=LOCAL_DIAG_FORWARD\n";
        print "                        3=LOCAL_SEMI_VITERBI 4=LOCAL_SEMI_FORWARD\n";
        print "                        5=LOCAL_FULL_VITERBI 6=LOCAL_FULL_FORWARD\n";
	print "-d <qfile_dir>     : location of the .q file. (default is assumes ../qfile)\n";
	print "-f                 :  create qfile-type file with the sequences\n";
	print "-g <typetarget>    :  which type of loci you want to analyze (default is all)\n";
        print "                        possible types of loci are:\n";
        print "                        OTH | COD | RNA \n";
	print "-i <id_max>        :  max identity of alignments analysed (default is id_max = 100)\n";
	print "-l <id_max>        :  min fraction of the window that has to score for it to be taken (default all)\n";
	print "-q <qfile>         : name of the qfile  (default given by the qrna name)\n";
        print "-r                 : remove non-canonical pairs\n";
        print "-s <type_of_score> : type of score (sigmoidal | simple)       [default = sigmoidal]\n";
	print "-u <cutoff>        : default is cutoff = 0\n";
       exit;
}
my $verbose;
undef $verbose;

my $file  = shift;
my $tag;
my $type;

my $dir;
my $filename;

if ($file =~ /^(\S+)\/([^\/]+)$/) {
    $dir  = $1;
    $filename = $2;
}
else {
    $dir  = "";
    $filename = $file;
}
print "qrna_file: $filename\n";
print "dir:  $dir/\n";

my $qfile;
if ($filename =~ /^(\S+\.q)\./ ) { $qfile = $1; }
elsif ($opt_q) { $qfile = $opt_q; } 
else           { print "You need to provide the name of the qfile\n"; die;  }

my $qfile_dir;
if ($opt_d) { $qfile_dir = $opt_d; } 
else        { $qfile_dir = "../qfile";}

$qfile = "$qfile_dir/$qfile";

print "q_file: $qfile\n"; 

my $output   = "$dir";

my $n_in_ali = 0; 

my $typetarget; 
if ($opt_g) { $typetarget = $opt_g; } 
else        { $typetarget = "all";  }
$verbose = $typetarget;

my $cutoff;
if (defined($opt_u)) { $cutoff = $opt_u; }
else                 { $cutoff = 0;     }

my $id_max;
if (defined($opt_i)) { $id_max = $opt_i; }
else                 { $id_max = 100;    }

my $type_of_score;
if ($opt_s) { $type_of_score = $opt_s;      }
else        { $type_of_score = "sigmoidal"; }
if ($type_of_score =~ /^simple$/ || $type_of_score =~ /^sigmoidal$/) { ;}
else { print "wrong type of score. options are: 'simple' or 'sigmoidal'"; die; }

my $case;
if ($opt_c) { $case = $opt_c; }
else        { $case = 1;      }

if   ($case==0) { $tag = "GLOBAL";        }
elsif($case==1) { $tag = "LOCAL_DIAG_VITERBI"; }
elsif($case==2) { $tag = "LOCAL_DIAG_FORWARD"; }
elsif($case==3) { $tag = "LOCAL_SEMI_VITERBI"; }
elsif($case==4) { $tag = "LOCAL_SEMI_FORWARD"; }
elsif($case==5) { $tag = "LOCAL_FULL_VITERBI"; }
elsif($case==6) { $tag = "LOCAL_FULL_FORWARD"; }

my $len_percen_cutoff;
if (defined($opt_l)) { $len_percen_cutoff = $opt_l; }
else                 { $len_percen_cutoff = 0.0;    }

my $sqrt2 = sqrt(2.0);

my $col_file;
if ($opt_r) { $col_file = "$file.$typetarget.CUTOFF$cutoff.L$len_percen_cutoff.canpairs.col"; }
else        { $col_file = "$file.$typetarget.CUTOFF$cutoff.L$len_percen_cutoff.allpairs.col"; }

my $fasta_file = "$file.$typetarget.CUTOFF$cutoff.L$len_percen_cutoff.col.q";

my $num = 0;

my $version;

my $coor1;
my $coor2;

my $othsc;
my $rnasc;
my $codsc;

my $rnalod;
my $codlod;

my $othlodsigm;
my $rnalodsigm;
my $codlodsigm;

my $rna;
my $cod;

my $qfile_name1;
my $qfile_name2;

my $name1;
my $name2;
my $seq1;
my $seq2;

my @name1;
my @name2;
my $rest;

my @lloci1;
my @lloci2;

my @rloci1;
my @rloci2;

my @type1;
my @type2;

my @codsc1;
my @codsc2;

my @rnasc1;
my @rnasc2;

my $startblast1;
my $startblast2;

my $seq = 0;

my $idx = 0;

my $len;
my $len_new;
my $id;
my $id_new;

my $score;

my $shuffle = 0;

my $time;

my $coor1l;
my $coor1r;
my $coor2l;
my $coor2r;
my $startwin1;
my $startwin2;
my $endwin1;
my $endwin2;
my $new = 1;

my $nnamesseq = 0;

my $qrna;

my $hasrnass = 0;
my $rnass = 0;
my $len_motif;
my $len_cutoff;
my $ssisrev = 0;
my $ss;
my $ali_num = 0;

open_col_file   ($col_file);
if ($opt_f) { open_fasta_file ($fasta_file); }

open (FILE,"$file") || die;
while (<FILE>) {
    
    if (/^\#.+(qrna.+)/) {
	$version = $1;
	#print "version: $version\n"; 
    }

    elsif (/^\#\s(\d+)/) { $ali_num ++; }
    
    elsif (/^Divergence time \(\S+\):\s+(\S+)/) {
	$time = $1;
    }
    elsif (/^\#.+shuffled/) { 
	$shuffle = 1; 
    }
    elsif (/^>(\S+[\/\-\:]\d+[><]\d+\-)(.+)$/ && $seq == 0) { 
	$qfile_name1 = $1;
	$rest = $2;
	
	$qfile_name1 =~ s/\\//g;
	if ($qfile_name1 =~ /(\S+)[\/\-\:](\d+[><]\d+)/) {
	    $name1 = $1;
	    $startblast1 = $2; 
	}
       
	$seq = 1; 
	$nnamesseq++; 
    }
    
    elsif (/^>(\S+[\/\-\:]\d+[><]\d+\-)(.+)$/ && $seq == 1) { 
	$qfile_name2 = $1;
	$rest = $2;

	$qfile_name2 =~ s/\\//g;
	if ($qfile_name2 =~ /(\S+)[\/\-\:](\d+[><]\d+)/) {
	    $name2 = $1;
	    $startblast2 = $2; 
	}
	
	# regex metacharacters: \ | ( ) [ { ^ $ * + ? .
	#
	#$qfile_name2_quote = quotemeta $qfile_name2;
	
	$seq = 0; 
	$nnamesseq++; 
    } 
    
    elsif (/^length alignment:\s+(\S+) \(id=(\d+\.\d+)\)/) { 
	$len = $1;
	$id  = $2;
	$ssisrev = 0;
	$ss      = "";

	$len_cutoff = $len_percen_cutoff * $len;

    }
    
    elsif (/^posX: (.+)$/ ) { 
	$coor1 = $1;
	
 	if ($coor1 =~ /^(\d+)-(\d+)\s+\[\d+-\d+\]\((\d+)\)/) {
	    
	    #remember conventions for qrna output:
	    #
	    #      posX: 0-62 [0-59](60) 
	    #
	    # is an alignment of 63 positions with 3 gaps. 
	    # So the actual positions are from 0 to 59 not to 62.
	    #
	    #
	    $startwin1 = $1;
	    $endwin1   = $2;
	}

	#paranoia
	if ( ($startwin1 > $endwin1) || $startwin1 < 0 || $endwin1 < 0 ) { print "got ends of the window wrong\n"; die; }
	

    }
    
    elsif (/^posY: (.+)$/) { 
	$coor2 = $1;
	
	if ($coor2 =~ /^(\d+)-(\d+)\s+\[\d+-\d+\]\((\d+)\)/) {
	    
	    #remember conventions for qrna output:
	    #
	    #      posX: 0-62 [0-59](60) 
	    #
	    # is an alignment of 63 positions with 3 gaps. 
	    # So the actual positions are from 0 to 59 not to 62.
	    #
	    #
	    $startwin2 = $1;
	    $endwin2 = $2;
	}
	#paranoia
	if ( ($startwin2 > $endwin2) || $startwin2 < 0 || $endwin2 < 0 ) { print "got ends of the window wrong\n"; die; }
	if ( ($startwin1 != $startwin2) || $endwin1 != $endwin2) { print "got ends of the two window wrong\n"; die; }
		
    } 
    
    elsif (/\s+SS/        && $rnass == 0) { $hasrnass = 1; $rnass = 1; }

    elsif (/\s+SS\s+(.+)/ && $rnass == 1) { $ss .= $1;     $rnass = 0; }

    elsif (/^$tag/) { $num = 1; }
    
    elsif (/^RNA ends\s+\*\(([\+\-])\)\s+=\s+\(\d+\.\.\[(\d+)\]/) { 

	$len_motif = $2;

	if ($1 =~ /^\+$/) { $ssisrev = 0; }
	else              { $ssisrev = 1; }

    }

    elsif (/winner = (\S+)/) { $type = $1; }
    
    elsif (/^\s+ OTH = \s+(\S+)\s+ COD = \s+(\S+)\s+ RNA = \s+(\S+)/ && $num == 1) { 
	$othsc = $1; 
	$codsc = $2; 
	$rnasc = $3; 
	
	$codlod = $codsc - $othsc;
	$rnalod = $rnasc - $othsc;
	
	$othlodsigm = - log(exp(log(2.0)*($codsc-$othsc)) + exp(log(2.0)*($rnasc-$othsc)))/log(2.0);
	$codlodsigm = - log(exp(log(2.0)*($othsc-$codsc)) + exp(log(2.0)*($rnasc-$codsc)))/log(2.0);
	$rnalodsigm = - log(exp(log(2.0)*($othsc-$rnasc)) + exp(log(2.0)*($codsc-$rnasc)))/log(2.0);
	
	undef($score);
	if    ($type =~/^RNA$/) { $score = $rnalodsigm; }
	elsif ($type =~/^COD$/) { $score = $codlodsigm; }
	elsif ($type =~/^OTH$/) { $score = $othlodsigm; }
	else                    { print "wrong type ($type)\n"; die; }

	get_ali_qfile("$qfile", $ali_num, \$seq1, \$seq2, $qfile_name1, $startwin1, $endwin1, $qfile_name2, $startwin2, $endwin2);
	
	$qrna = "QRNA";
	if ($shuffle == 1) { $type = "sh$type"; $qrna = "shQRNA"; }
	
	$num = 0;
	
	if ($score) {

	    if ($typetarget =~ /^all$/ || $type =~ /$typetarget$/) 
	    {
		if ($score >= $cutoff && $len_motif >= $len_cutoff) 
		{ 
		    write_to_col("$col_file", $version, $type, $cutoff, $startwin1, $endwin1, $qfile_name1, $seq1, 
				 $hasrnass, $ss, $ssisrev, $id, $score); 
		    write_to_col("$col_file", $version, $type, $cutoff, $startwin2, $endwin2, $qfile_name2, $seq2, 
				 $hasrnass, $ss, $ssisrev, $id, $score); 

		    if ($opt_f) { write_to_fasta("$fasta_file", $startwin1, $endwin1, $qfile_name1, $seq1, $qfile_name2, $seq2); }
		}
	    }
  
	}
    }
    
    else  { next; }
    
}
close (FILE);
close (COL);
close (FASTA);



#########################################################################################
sub get_ali_qfile {

    my ($qfile, $ali_num, $seq1_ref, $seq2_ref, $name1, $startwin1, $endwin1, $name2, $startwin2, $endwin2) = @_;

    my $seq1;
    my $seq2;

    my $flag1 = 0;
    my $flag2 = 0;    
    my $name;

    my $n_ali = 0;

    my $num = 0;

    open (QFILE,"$qfile") || die;
    while (<QFILE>) {
	
	if (/^\>(\S+)/ && $num == 0) {
	    $n_ali ++;

	    $name = $1; $name =~ s/\\//g;
	    $flag1 = 0;

	    if ($n_ali == $ali_num && $name1 =~ /^$name$/) {
		$flag1 = 1;
		$seq1 = "";
	    }
	    $num = 1;

	   if ($n_ali > $ali_num) { last; }	
	}
	elsif (/^\>(\S+)/ && $num == 1 && $n_ali <= $ali_num) {
	    $name = $1; $name =~ s/\\//g;
	    $flag2 = 0;

	    if ($n_ali == $ali_num && $name2 =~ /^$name$/) {
		$flag2 = 1;
		$seq2 = "";
	    }

	    $num = 0;
	} 
	elsif ($flag1 == 1 && $num == 1) { $seq1 .= $_; }
	elsif ($flag2 == 1 && $num == 0) { $seq2 .= $_; }

    }

    close (QFILE);

    if (!$seq1 || !$seq2) { print "did not find this alignment in qfile:\n$name1\n$name2\n"; die; }

    $seq1 =~ s/ //g; $seq1 =~ s/\n//g; $seq1 =~ s/\t//g;
    $seq2 =~ s/ //g; $seq2 =~ s/\n//g; $seq2 =~ s/\t//g;

    $$seq1_ref = $seq1;
    $$seq2_ref = $seq2;
}

sub iscan {
    
    my ($x, $y) = @_;
    my $is_wc = 0;

    if ($x =~ /A/ && $y =~ /U/) { $is_wc = 1; }
    if ($x =~ /A/ && $y =~ /T/) { $is_wc = 1; }
    if ($x =~ /U/ && $y =~ /A/) { $is_wc = 1; }
    if ($x =~ /T/ && $y =~ /A/) { $is_wc = 1; }
    if ($x =~ /C/ && $y =~ /G/) { $is_wc = 1; }
    if ($x =~ /G/ && $y =~ /C/) { $is_wc = 1; }
    if ($x =~ /U/ && $y =~ /G/) { $is_wc = 1; }
    if ($x =~ /T/ && $y =~ /G/) { $is_wc = 1; }
    if ($x =~ /G/ && $y =~ /U/) { $is_wc = 1; }
    if ($x =~ /G/ && $y =~ /T/) { $is_wc = 1; }

    return $is_wc;
}

sub isnt {

    my ($char) = @_;
    
    my $isnt = 1;

    if ($char =~ /^\.$/ || $char =~ /^\-$/ ) { $isnt = 0; }

    return $isnt;
}


sub open_col_file {
    my ($col_file) = @_;

    open (COL,">$col_file") || die;
    print COL "; generated by qrna2col.pl\n";
    print COL "; ================================================================\n";

}

sub open_fasta_file {
    my ($fasta_file) = @_;

    open (FASTA,">$fasta_file") || die;

}

sub ss_to_ct {

    my ($ssisrev, $ss, $ct_ref) = @_;

    my $sscp = $ss;
    my @ct;
    my $ct;
    my $char;
    my @stack;
    my $n_pairs = 0;
    my $posi = 0;
    my $posj;

    while ($sscp) {
	$sscp =~ s/^(.)//; $char = $1;
	
	if    ($char =~ ">") { push(@stack, $posi); $n_pairs++; }
	elsif ($char =~ "<") { if (@stack) { $posj = pop(@stack); $ct[$posi] = $posj; $ct[$posj] = $posi;} else { $ct[$posi] = -1; } } 
	else                 { $ct[$posi] = -1; }

	$posi ++;
    }

    # more left brackets than right brackets
    while (@stack) {
	$posi = pop(@stack); $ct[$posi] = -1;
    }

    my $len = length($ss);

   if ($ssisrev == 1) { 
       @ct = reverse(@ct); 

       for (my $i = 0; $i < $len; $i++) { if ($ct[$i] > -1) { $ct[$i] = $len - 1 - $ct[$i]; }}
   }
    
    $ct = join("*", @ct);

    #print "Len $posi\nNpairs $n_pairs\n$ss\n$ct\n";
    
    $$ct_ref = $ct;
}

sub write_to_col {

    my ($col_file, $version, $type, $cutoff, $startwin, $endwin, $name, $seq, $rnass, $ss, $ssisrev, $id, $score) = @_;

    my $seqchar;
    my $label;
    my $word;

    my $ct;
    my @ct;
    my $char = "";
    
    my $pos_ali = 0;
    my $pos = 0;

    my $start = $startwin + 1;
    my $end   = $endwin + 1;

    my $len = $end - $start + 1;

    print COL "; QRNA\t$version\n";
    print COL "; TYPE\t$type (cutoff=$cutoff)\n";
    print COL "; COL1 \tlabel\n";
    print COL "; COL2 \tresidue\n";
    print COL "; COL3 \tseqpos\n";
    print COL "; COL4 \talignpos\n";
    print COL "; COL5 \talign_bp\n";    
    print COL "; ENTRY\t$name\n";
    print COL "; LEN_A\t$len\n";
    print COL "; START\t$start\n";
    print COL "; ID\t$id\n";
    print COL "; SCORE\t$score\n";
    print COL "; ISREV\t$ssisrev\n";
    print COL "; ----------\n";

    if ($rnass == 1) {
	ss_to_ct($ssisrev, $ss, \$ct);
	
	@ct = split(/\*/, $ct);
    }
    my $i = 0;

    my @seq = split(//, $seq);

    while ($seq) {
    
	$seq =~ s/^(.)//;   $seqchar = $1;
	
	$pos_ali ++;
	
	if (isnt($seqchar) == 1) { $label = "N"; $pos ++; $word = $pos; }
	else                     { $label = "G"; $word = ".";           }
	
	if ($pos_ali >= $start && $pos_ali <= $end) {
	    
	    if ($rnass == 1) {
		$char = $ct[$i];
		
		if ($char == -1) { $char = "."; }
		else             { 
		    my $sq  = $seq[$i]; if ($sq ne $seqchar) { print "write_to_col(): bad characters\n"; die; }
		    my $sqc = $seq[$char];

		    if ($opt_r && iscan($sq, $sqc) == 0) { $ct[$char] = -1; $char  = ".";    } # remove non canonical pairs
		    else                                 {                  $char += $start; } 
		}

		$i ++;
	    }

	    print COL "$label\t$seqchar\t$word\t$pos_ali\t$char\n";
	}
	
	if ($pos_ali == $end) { last; }
	
    }
    
    if ($pos_ali != $start+$len-1) { print " bad aligment start=$start end=$end len=$len ($pos_ali)\n"; die; }
    print COL "; **********\n";

    
}

sub write_to_fasta {

    my ($fasta_file, $startwin, $endwin, $name1, $seq1, $name2, $seq2) = @_;

    my $start = $startwin + 1;
    my $end   = $endwin + 1;

    $seq1 =~ s/(.{50})/$1\n/g;
    $seq2 =~ s/(.{50})/$1\n/g;

    print FASTA ">$name1$start-$end\n";
    print FASTA "$seq1\n";
    print FASTA ">$name2$start-$end\n";
    print FASTA "$seq2\n";

}
