#!/usr/bin/perl -w

#loci_from_gff.pl

use strict;
use vars qw ($opt_f $opt_g $opt_l $opt_o);  # required if strict used
use Getopt::Std;
use constant GNUPLOT => '/usr/bin/gnuplot';

getopts ('f:g:l:o:');     # ('aci:p:o:') means 'ac' are flags, 'i:p:o:' gets following scalar.


# Print a helpful message if the user provides no input file.
if (!@ARGV) { 
        print "usage:  loci_from_gff.pl [options] gff1 gff2\n\n";
	print "options:\n";
	print "-f <feature1>     :  which type of feature from gff1    [ default is all]\n";
	print "-g <feature2>     :  which type of feature from gff2    [ default is all]\n";
        print "-l <len>          :  until which coordinate    [ default is all]\n";
	print "-o <output>       :  output file [default = gff2.loci.gff]\n";
       exit;
}

my $gff1 = shift;
my $gff2 = shift;

my $typetarget1;
if ($opt_f) { $typetarget1 = $opt_f; }
else        { $typetarget1 = "all";  }

my $typetarget2;
if ($opt_g) { $typetarget2 = $opt_g; }
else        { $typetarget2 = "all";  }

my $output;
if ($opt_o) { $output = $opt_o; }
else        { $output = $gff2.".".$typetarget2."loci.gff"; }

my $len;
if ($opt_l) { if ($opt_l < 0) { print "bad len\n"; die; } else { $len = $opt_l; } }
else        { $len = -1;  }


loci_from_gff ("$output", $len, $gff1, $typetarget1, $gff2, $typetarget2);


sub loci_from_gff {
    
    my ($outgff, $len,  $gff1, $typetarget1, $gff2, $typetarget2) = @_;
    
    my $line;
    
    my $seq;
    my $source;
    my $feature;
    my $lend;
    my $rend;
    
    my $n_win       = 0;
    my $n_win_FN    = 0;
    my $n_win_type  = 0;
    my $n_win_total = 0;


    my $n_win1_total = 0;
    my $n_win1_type  = 0;
    
    my @seq1;
    my $source1;
    my @feature1;
    my @lend1;
    my @rend1;
    my @bitstring;

    parse_gff ($gff1, $typetarget1, $len, \$n_win1_total, \$n_win1_type, \@seq1, \$source1, \@feature1, \@lend1, \@rend1, \@bitstring);
    
    open (OUT,">$outgff") || die;

    open (GFF2,"$gff2") || die;
    while (<GFF2>) {
	
	if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\d+)\s+(\d+)\s+/) {
	    $line = $_;
	    
	    $seq     = $1;
	    $source  = $2;
	    $feature = $3;
	    $lend    = $4;
	    $rend    = $5;
	    
	    if ($rend < $lend) { print "loci_from_gff():parsing error feature $feature lend = $lend rend = $rend\n"; die; }
	    
	    if ($len == -1 || ($len > 0 && $rend <= $len)) {
		$n_win_total ++;
		
		if ($typetarget2 =~ /^all$/ || $feature =~ /$typetarget2/) {
		    $n_win_type ++;
		    
		    # yeast interconversion of names
		    yeast_name_interconversion(\$seq);
		    
		    substract($seq, $lend, $rend, $n_win1_type, \@seq1, \@lend1, \@rend1, \@bitstring);
		}
	    }
	}
    }
    close (GFF2);
    
    my $n_win_FP     = $n_win_type - $n_win;
    my $n_win_notype = $n_win_total - $n_win_type;
    
    print "gff2:\t$gff2\n";
    if ($len == -1) {
        print "\t\tTotal $source regions:\t$n_win_total\n";
    }
    else {
        print "\t\tTotal $source regions [<= $len ] :\t$n_win_total\n";
    }
    print "\t\t $typetarget2   $source regions:\t$n_win_type\n";
    print "\t\t!$typetarget2   $source regions:\t$n_win_notype\n\n";

    my $covered;
    my $uncovered;
    my $total;
    my $total_covered   = 0;
    my $total_uncovered = 0;
    my $total_total     = 0;

    for (my $x = 0; $x < $n_win1_type; $x++) {
	$covered   = $bitstring[$x] =~ s/1/1/g; $total_covered   += $covered;
	$uncovered = $bitstring[$x] =~ s/0/0/g; $total_uncovered += $uncovered;
	$total = $covered + $uncovered;         $total_total += $total;

	#print "SE $seq1[$x] M $covered U $uncovered T $total\n";
    }    
    print "Total masked $total_covered Total unmasked $total_uncovered Total $total_total\n";

    my $string;
    my $pose;
    my $posf;
    my $type = "$typetarget2\_loci";
    my $n_loci = 0;

    for (my $x = 0; $x < $n_win1_type; $x++) {

	$string = $bitstring[$x];

	if    ($string =~ /^1/) { $pose = $lend1[$x]; }
	elsif ($string =~ /^0/) { 
	    if ($string =~ /01/) { $pose = $lend1[$x]+length($`)+1; $string = "1$'"; } 
	    else                 { undef ($string);                                  } #it is all masked
	}
	
	while ($string) {
	    if ($string =~ /10/) { 
		$posf = $pose + length($`);
			
		print OUT "$seq1[$x]\t$source1\t$type\t$pose\t$posf\t.\t.\t\.\t\n";
		$n_loci ++;

		$string = "0$'";
		if ($string =~ /01/) { 
		    $pose = $posf + length($`) + 2;
		    
		    $string = "1$'";
		}
		elsif($string =~ /0/) {#ends in zeros
		    undef ($string);			   
		}
	    }
	    elsif ($string =~ /^(1+)$/) { #ends in ones
		$posf = $pose + length($1) - 1;
		print OUT "$seq1[$x]\t$source1\t$type\t$pose\t$posf\t.\t.\t\.\t\n";
		$n_loci ++;
		undef ($string);
	    }
	}
	
    }   

    close (OUT);

    print "number of loci = $n_loci\n";
}

sub parse_gff {

    my ($gff, $typetarget, $last_len, $n_win_ref, $n_win_type_ref, $seq_ref, $source_ref, $feature_ref, $lend_ref, $rend_ref, $bitstring_ref) = @_;

    my $seq;
    my $source;
    my $feature;
    my $lend;
    my $rend;

    my $len;

    my $idx = 0;

    open (GFF,"$gff") || die;
    while (<GFF>) {
	
	if (/^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+/) {
	    $seq     = $1;
	    $source  = $2;
	    $feature = $3;
	    $lend    = $4;
	    $rend    = $5;

	    if ($rend < $lend) { print "parse_gff():parsing error feature $feature lend = $lend rend = $rend\n"; die; }

            if ($last_len == -1 || ($last_len > 0 && $rend < $last_len)) {
		$$n_win_ref ++;
		
		if ($typetarget =~ /^all$/ || $feature =~ /$typetarget/) {
		    
		    # yeast interconversion of names
		    yeast_name_interconversion(\$seq);
		    
		    $seq_ref->[$idx]     = $seq;
		    $feature_ref->[$idx] = $feature;
		    $lend_ref->[$idx]    = $lend;
		    $rend_ref->[$idx]    = $rend;
		    
		    $len = $rend - $lend + 1;
		    $bitstring_ref->[$idx] = "";
		    for (my $i = 0; $i < $len; $i++) { $bitstring_ref->[$idx] .= "0"; }
		
		    $idx ++;
		}
	    }
	}
    }
    close (GFF);
    
    $$source_ref = $source;  # we assume that all elements in a given gff file have the same source

    $$n_win_type_ref = $idx;

    my $n_win_notype = $$n_win_ref - $$n_win_type_ref;

    print "gff:\t$gff\n";
    if ($last_len == -1) {
        print "\t\tTotal $source regions:\t$$n_win_ref\n";
    }
    else {
        print "\t\tTotal $source regions [<= $last_len ] :\t$$n_win_ref\n";
    }
    print "\t\t $typetarget   $source regions:\t$$n_win_type_ref\n";
    print "\t\t!$typetarget   $source regions:\t$n_win_notype\n\n";

}


sub overlap {
    my ($lend_loci, $rend_loci, $lend, $rend, $overlap)  = @_;

    my $is_same_loci = 0;

    if ($lend >= $lend_loci && $rend <= $rend_loci) { $is_same_loci = 1; } #is included
    if ($lend <= $lend_loci && $rend >= $rend_loci) { $is_same_loci = 1; } #extends over

    if ($rend < $rend_loci && $rend >= $lend_loci+$overlap)  { $is_same_loci = 1; } #left-end overlap
    if ($lend > $lend_loci && $lend <= $rend_loci-$overlap)  { $is_same_loci = 1; } #right-end overlap


    return $is_same_loci;

}

sub substract {

    my ($seq, $lend, $rend, $idx, $seq_ref, $lend_ref, $rend_ref, $bitstring_ref) = @_;

    my $lend_gff;
    my $rend_gff;
    my $len_gff;

    my $len = $rend - $lend + 1;
    my $start;
    my $end;

    for (my $x = 0; $x < $idx; $x++) {

	if ($seq =~ /^$seq_ref->[$x]$/ &&
	    overlap($lend, $rend, $lend_ref->[$x], $rend_ref->[$x], 0) == 1) { 

	    $start = 0;
	    $end   = -1;

	    $lend_gff = $lend_ref->[$x];
	    $rend_gff = $rend_ref->[$x];
	    $len_gff  = $rend_gff - $lend_gff + 1;
	    
	    if ($lend >= $lend_gff && $rend <= $rend_gff) { $start = $lend-$lend_gff; $end = $rend-$lend_gff; } #is included
	    if ($lend <= $lend_gff && $rend >= $rend_gff) { $start = 0;               $end = $len_gff-1;      } #extends over
	    
	    if ($rend < $rend_gff && $rend >= $lend_gff)  { $start = ($lend<$lend_gff)? 0:$lend-$lend_gff; $end = $rend-$lend_gff;                               } #left-end overlap
	    if ($lend > $lend_gff && $lend <= $rend_gff)  { $start = $lend-$lend_gff;                      $end = ($rend>$rend_gff)? $len_gff-1:$rend-$lend_gff; } #right-end overlap
	    
	    for (my $i = $start; $i <= $end; $i++) { substr($bitstring_ref->[$x], $i, 1) = "1"; }
	    
	    
	}
    }
    
    
}

sub yeast_name_interconversion {
    my ($seq_ref) = @_;
    
    my $seq = "chr";

    # yeast interconversion of names 
    if ($$seq_ref =~ /NC\_00(\d+)/) { 

	my $num = $1;

	if    ($num == 1133) { $seq .= "I";    } 
	elsif ($num == 1134) { $seq .= "II";   } 
	elsif ($num == 1135) { $seq .= "III";  }
	elsif ($num == 1136) { $seq .= "IV";   }
	elsif ($num == 1137) { $seq .= "V";    }
	elsif ($num == 1138) { $seq .= "VI";   }
	elsif ($num == 1139) { $seq .= "VII";  }
	elsif ($num == 1140) { $seq .= "VIII"; }
	elsif ($num == 1141) { $seq .= "IX";   }
	elsif ($num == 1142) { $seq .= "X";    }
	elsif ($num == 1143) { $seq .= "XI";   }
	elsif ($num == 1144) { $seq .= "XII";  } 
	elsif ($num == 1145) { $seq .= "XIII"; }
	elsif ($num == 1146) { $seq .= "XIV";  }
	elsif ($num == 1147) { $seq .= "XV";   }
	elsif ($num == 1148) { $seq .= "XVI";  }
	elsif ($num == 1224) { $seq .= "Mito";  }
	else                 { print "wrong SC chromosome\n"; die; }

	$$seq_ref = $seq;
   }
}
