#!/usr/bin/perl ### ZONAL PHYLOGENY SOFTWARE (written by SUJAY CHATTOPADHYAY, supervised by EVGENI V. SOKURENKO, Microbiology, UW, Oct,2006) # GIVEN A PAUP-OUTPUT ML TREE FILE (*.ml.tre), # & A FASTA ALIGNMENT FILE, # ISOLATES THE SEQUENCES TO BE COMPARED SEQUENTIALLY # STARTING FROM TERMINAL NODES TOWARD THE ANCESTRAL ONES # & STORES THE INFO IN A FILE "seq_comp-out1.txt" use strict; print "\nEnter Tree Filename: "; my $file1 = ; chomp $file1; print "\nEnter Alignment (Fasta) Filename: "; my $file2 = ; chomp $file2; my $out1 = "seq_comp-out1.txt"; open(FOUT1, ">$out1"); my $out2 = "seq_comp-out2.txt"; open(FOUT2, ">$out2"); my $out3 = "pairwise-variation.txt"; open(FOUT3, ">$out3"); my $out4 = "analysis-results.txt"; open(FOUT4, ">$out4"); my $out6 = "color-zp_tree.txt"; open(FOUT6, ">$out6"); #my $out7 = "test-out.txt"; #open(FOUT7, ">$out7"); my $out8 = "zp_tree.dnd"; open(FOUT8, ">$out8"); # GIVEN A FASTA ALIGNMENT FILE # DEVELOPS A 2-D ARRAY WITH SERIAL ARRANGEMENT OF SEQUENCES AS ROWS # & BASE POSITIONS AS COLUMNS my @orifastafile = (); # declare array variable # get the fastafile data into an array from $file2 @orifastafile = get_file_data($file2); ##################################################### ## representing each sequence by a single line ## & copying each of them as a single scalar variable of array @fastafile my @seqinfo = (); my $seqinfoline=0; foreach my $line (@orifastafile) { if($line =~ /^\s*$/) { last; #break out of the foreach loop } elsif($line =~ />/) { $seqinfo[$seqinfoline] = $line; $seqinfo[$seqinfoline] =~ s/>//; $seqinfoline++; $line =~ s/>.*/>/; } } #print @orifastafile, "\n\n"; #for(my $c=0; $c<$seqinfoline; ++$c) #{ # print "seq ", ($c+1), ": ", $seqinfo[$c], "\n"; #} my @fastafile = (); my $linecount=0; my $newline; my $seqno=1; my $seqlength=1000000000; my $seqlength1=0; my $orifastafile = join('', @orifastafile); chomp $orifastafile; $orifastafile =~ s/\s*//g; #print $orifastafile, "\n\n"; my $lenorifastafile = length $orifastafile; @orifastafile = split('', $orifastafile); for(my $i=1; $i<$lenorifastafile; ++$i) { if($orifastafile[$i] eq '>') { $seqno++; $newline=''; $linecount++; } else { $newline = $newline.$orifastafile[$i]; $fastafile[$linecount] = $newline; #print $fastafile[$linecount], "\n"; } } #print "\n", $fastafile[0], "\n"; #print "\n", $fastafile[1], "\n"; #print "\n", @fastafile, "\n"; #print "\n", @seqinfo, "\n"; ##################################### ## calculating sequence length (taking the length of the shortest sequence in the dataset) my $lenfastafile = scalar @fastafile; #print "\n", $lenfastafile, "\n"; for(my $i=0; $i<$lenfastafile; $i++) { $seqlength1 = length $fastafile[$i]; if($seqlength<$seqlength1) {$seqlength = $seqlength;} else {$seqlength = $seqlength1;} } #print "\nseqlength: ", $seqlength, "\n"; my $seqremainder = $seqlength % 3; if($seqremainder == 0) { $seqlength = $seqlength; } else { $seqlength = $seqlength - $seqremainder; } ################################### ##### REMOVING CODONS WITH GAPS FROM THE FASTAFILE BASED ON READING FRAME 1 ##### my @fastafilenew = (); my $gapnumber = 0; for(my $pos=0; $pos<($seqlength-2); ) { for(my $i=0; $i<$lenfastafile; $i++) { my $gap1 = substr($fastafile[$i], $pos, 3); if($gap1 =~ /-/) { $gapnumber++; } } if($gapnumber == 0) { for(my $i=0; $i<$lenfastafile; $i++) { my $cod = substr($fastafile[$i], $pos, 3); $fastafilenew[$i] = $fastafilenew[$i].$cod; } } $gapnumber=0; $pos = $pos + 3; } @fastafile = @fastafilenew; $lenfastafile = scalar @fastafile; for(my $i=0; $i<$lenfastafile; $i++) { $seqlength1 = length $fastafile[$i]; if($seqlength<$seqlength1) {$seqlength = $seqlength;} else {$seqlength = $seqlength1;} } #print "fastafilenew: \n", @fastafilenew, "\n"; #print "\nfastafile: \n", @fastafile, "\n"; #print "\nseqlength: ", $seqlength, "\n"; ##### END OF REMOVING CODONS WITH GAPS FROM THE FASTAFILE BASED ON READING FRAME 1 ##### ####### creating a 2-d array to store the sequence-data: ####### each row for each seq, & each column for each base-position ####### through a reference ($seqdataref) to the array seqdata ####### while the first position of each row carries the sequence number my $seqdataref = []; $seqlength = $seqlength + 1; my $i=0; my $linenumber=1; for(my $k=0; $k<$seqno; ++$k) { $seqdataref -> [$k][0] = $linenumber; $linenumber++; } foreach my $line1 (@fastafile) { if($line1 =~ /[ACGTacgt]/) { chomp $line1; my @line2 = split('', $line1); # print "\nline2: ", @line2, "\n"; # print "\nseqlength: ", $seqlength, "\n"; for(my $j=1; $j<$seqlength; $j++) { my $k = $j-1; $seqdataref -> [$i][$j] = $line2[$k]; } ++$i; } } #for(my $row=0; $row<$seqno; $row++) #{ # print "\n", $row, ":\n"; # for(my $col=0; $col<$seqlength; $col++) # { # print $seqdataref->[$row][$col]; # } #print "\n"; #} #print $seqdataref->[3][35]; #print "\n"; ####### creating another 2-d array to store the AMINO ACID sequence-data: ####### each row for each seq, & each column for each AMINO ACID-position ####### through a reference ($protseqdataref) to the array seqdata ####### while the first position of each row carries the sequence number my $protseqdataref = []; my $protlinenumber=1; my $protlen; for(my $k=0; $k<$seqno; ++$k) { $protseqdataref -> [$k][0] = $protlinenumber; $protlinenumber++; } for(my $k=0; $k<$seqno; ++$k) { my $nucseq; for(my $l=1; $l<$seqlength; ++$l) { $nucseq = $nucseq.$seqdataref->[$k][$l]; } my $nuclen = length $nucseq; # print "\n", $nucseq, "\n"; my $codonseq; my $prot1; my $prot; for(my $pos=0; $pos<($nuclen-2); ) { $codonseq = substr($nucseq, $pos, 3); $prot1 = codon2aa($codonseq); # print "\n", $prot1, "\n"; $prot = $prot.$prot1; $pos = $pos + 3; } # print "\n", $prot, "\n"; my @prot = split('', $prot); # print "\n", @prot, "\n"; $protlen = scalar @prot; $protlen = $protlen + 1; for(my $m=1; $m<$protlen; $m++) { my $n = $m-1; $protseqdataref -> [$k][$m] = $prot[$n]; } } #for(my $row=0; $row<$seqno; $row++) #{ # print "\n", $row, ":\n"; # for(my $col=0; $col<$seqlength; $col++) # { # print $protseqdataref->[$row][0]; # } #print "\n"; #} #print "\n\n"; ####### END of creating a 2-d array to store the AMINO ACID sequence-data: ...####### #print @seqinfo, "\n\n\n"; my @seqinfo2 = (); @seqinfo2 = @seqinfo; my $lengthseqinfo2 = scalar @seqinfo2; for(my $ll=0; $ll<$lengthseqinfo2; $ll++) { $seqinfo2[$ll] =~ s/\n//; } #print @seqinfo2, "\n"; ####### REPLACING SEQ NO. BY SEQ NAME IN AMINO ACID 2-D ARRAY ######## for(my $row=0; $row<$seqno; $row++) { $protseqdataref -> [$row][0] = $seqinfo2[$row]; } #for(my $row=0; $row<$seqno; $row++) #{ # print "\n", $row, ":\n"; # for(my $col=0; $col<$protlen; $col++) # { # print $protseqdataref->[$row][0]; # } #print "\n"; #} ####### END OF REPLACING SEQ NO. BY SEQ NAME IN AMINO ACID 2-D ARRAY ######## my $oriseqno = $seqno; ######## substitution rate calculation part ########### my $acount=0; my $ccount=0; my $gcount=0; my $tcount=0; my $max=0; my $subac=0; my $subag=0; my $subat=0; my $subca=0; my $subcg=0; my $subct=0; my $subga=0; my $subgc=0; my $subgt=0; my $subta=0; my $subtc=0; my $subtg=0; #### counting the # of a,c,g,t in each position in the dataset for(my $col=1; $col<$seqlength; ++$col) { my $acount=0; my $ccount=0; my $gcount=0; my $tcount=0; for(my $row=0; $row<$seqno; ++$row) { if(($seqdataref -> [$row][$col]) eq 'A' || ($seqdataref -> [$row][$col]) eq 'a') { $acount++; } if(($seqdataref -> [$row][$col]) eq 'C' || ($seqdataref -> [$row][$col]) eq 'c') { $ccount++; } if(($seqdataref -> [$row][$col]) eq 'G' || ($seqdataref -> [$row][$col]) eq 'g') { $gcount++; } if(($seqdataref -> [$row][$col]) eq 'T' || ($seqdataref -> [$row][$col]) eq 't') { $tcount++; } } #### determining the most frequent base to set it as consensus for that position if($acount>$ccount) {$max = $acount;} else {$max = $ccount;} if($gcount>$max) {$max = $gcount;} if($tcount>$max) {$max = $tcount;} #### counting the # of each type of substitution (where a->g is different from g->a) if($max eq $acount) { if($ccount>0) {$subac++;} if($gcount>0) {$subag++;} if($tcount>0) {$subat++;} } if($max eq $ccount) { if($acount>0) {$subca++;} if($gcount>0) {$subcg++;} if($tcount>0) {$subct++;} } if($max eq $gcount) { if($acount>0) {$subga++;} if($ccount>0) {$subgc++;} if($tcount>0) {$subgt++;} } if($max eq $tcount) { if($acount>0) {$subta++;} if($ccount>0) {$subtc++;} if($gcount>0) {$subtg++;} } } #print "\nsubac : ", $subac, "\n"; #print "\nsubag : ", $subag, "\n"; #print "\nsubat : ", $subat, "\n"; #print "\nsubca : ", $subca, "\n"; #print "\nsubcg : ", $subcg, "\n"; #print "\nsubct : ", $subct, "\n"; #print "\nsubga : ", $subga, "\n"; #print "\nsubgc : ", $subgc, "\n"; #print "\nsubgt : ", $subgt, "\n"; #print "\nsubta : ", $subta, "\n"; #print "\nsubtc : ", $subtc, "\n"; #print "\nsubtg : ", $subtg, "\n"; ########### end of substitution rate calculation part ########### my @treefile = (); # declare array variable # get the treefile data into an array from $file1 @treefile = get_file_data($file1); #print @treefile; # read through all the lines of @treefile to extract the tree topology my $topology; foreach my $line (@treefile) { if($line =~ /End;\s*$/) { last; # break out of the foreach loop } elsif($line =~ /tree PAUP_1 =/) { $line =~ s/tree PAUP_1 = \[&U\] //g; # print "\ninitial topology: ", $line, "\n"; $topology = $line; } } chomp $topology; my @topology = (); my @topo = (); @topology = split('', $topology); my $gr1; my $gr2; my $len = scalar @topology; #print "\n", @topology, "\n"; #print $len, "\n"; #### MODIFYING @TOPOLOGY BY PLACING EACH SEQUENCE IN SINGLE CELL IN A NEW @NEWTOPLOGY #### #### FOR LATER USE DURING INCORPORATING BRANCH-LENGTHS IN TOPOLOGY #### my @newtopology = (); my $topochar = 0; my @bltopo = (); # TO BE USED LATER for(my $i=0; $i<$len; $i++) { if($topology[$i] =~ /[0-9]/) { if($topology[$i-1] =~ /[0-9]/) { $newtopology[$topochar] = $newtopology[$topochar].$topology[$i]; } else { $topochar++; $newtopology[$topochar] = $newtopology[$topochar].$topology[$i]; } } else { if($i == 0) {$topochar = 0;} else {$topochar++;} $newtopology[$topochar] = $topology[$i]; } } my $lennewtopology = scalar @newtopology; for(my $i=0; $i<$lennewtopology; $i++) { # print FOUT5 $newtopology[$i]; } #print FOUT5 "\n\n\n\n"; ######################################################################### my $char; my $topo; my @gr2 = (); ### filters out each block like (*,*...) sequentially from the topology ### & transform them to */*... & look for the next (*, */*...) blocks do{ for($char=0; $char<$len; $char++) { $gr1 = $gr1.$topology[$char]; $gr2 = $gr2.$topology[$char]; if($gr1 =~ /;/) { $gr1 = ''; } if($gr1 =~ /,\(/) { $gr1 = $topology[$char]; } elsif($gr1 =~ /\(\(/) { $gr1 = $topology[$char]; } elsif($gr1 =~ /\)\)/) { $gr1 = ''; } elsif($gr1 =~ /^\({1}[0-9,\/]*\){1}$/) # for cases: (n+/n+,n+,..) { # print $gr1, "\n"; my $l = length $gr1; @gr2 = split('', $gr2); for(my $c=0; $c<$l; $c++) { my $removechar = pop @gr2; } ############# ## storing the sequence names to be compared pairwise in a file "seq_comp-out1.txt" $gr1 =~ s/\(//g; $gr1 =~ s/\)//g; my @gr1 = (); my @seqcomp = (); my $count=0; my $seqname; @gr1 = split('', $gr1); my $gr1len = scalar @gr1; my $commanumber=0; my $slashno=0; for(my $i=0; $i<$gr1len; ++$i) { if($gr1[$i] eq ',') { $commanumber++; $seqname=''; $count++; } else { $seqname = $seqname.$gr1[$i]; $seqcomp[$count] = $seqname; } } my $seqnumber = $commanumber + 1; for(my $j=0; $j<$commanumber; ++$j) { for(my $i=($j+1); $i<$seqnumber; ++$i) {print FOUT1 $seqcomp[$j], ",", $seqcomp[$i], " ";} } print FOUT1 "\n"; $gr1 = join('', @gr1); ############# $gr2 = join('', @gr2); $gr1 =~ s/,/\//g; $gr2 = $gr2.$gr1; # print $gr2, "\n"; $gr1=''; } } @gr2 = split('', $gr2); @topology = @gr2; $len = scalar @topology; #print $len, "\n"; $topo = join('', @topology); #print "topology: ", $topo, "\n"; $gr2 = ''; } until ($topo =~ /^[1-9].*[0-9];$/); close(FOUT1); ###### END OF STORing THE INFO IN A FILE "seq_comp-out1.txt ########### $topo =~ s/;//g; #print "final topology: ", $topo, "\n"; ### reading "seq_comp-out1.txt" file to create another file "seq_comp-out2.txt" ### having the exact pairwise list that needs to be compared my $seqcompfile = 'seq_comp-out1.txt'; my @seqpaircomp = (); @seqpaircomp = get_file_data($seqcompfile); #print "\n", @seqpaircomp, "\n"; foreach my $line (@seqpaircomp) { my $seq; my @newlinearray = (); my $newlinearraylength; if($line =~ /End;\s*$/) { last; # break out of the foreach loop } else { my $fileline = $line; chomp $fileline; $fileline =~ s/\t$//; my @fileline = (); @fileline = split('', $fileline); my $seqa; my $filelinelength = scalar @fileline; ### getting the unique seqs from each line for the pairwise comparison list my @linearray = (); my $n=0; # print "\n", $filelinelength, "\n"; for(my $i=0; $i<$filelinelength; ++$i) { if($fileline[$i] eq ',' || $fileline[$i] eq ' ') { $linearray[$n] = $seqa; $n++; $seqa = ''; } else { $seqa = $seqa.$fileline[$i];} if($i eq ($filelinelength-1)) { $linearray[$n] = $seqa;} } # print "\narray1= ", @linearray, "\n"; my $linearraylength = scalar @linearray; $linearraylength = $linearraylength-1; $seq = $linearray[0]; my $slash = '/'; $newlinearray[0] = $seq; $seq = $seq.$slash; $newlinearray[1] = $linearray[1]; $seq = $seq.$newlinearray[1]; # print "\nseq: ", $seq, "\n"; # print "\n", $linearraylength, "\n"; my $m=2; if($linearraylength>=3) { for(my $k=2; $k<$linearraylength; ++$k) { if($linearray[$k] eq $newlinearray[1]) { last; } elsif($linearray[$k] ne $linearray[0]) { $seq = $seq.$slash.$linearray[$k]; $newlinearray[$m] = $linearray[$k]; $m++; } } } # print "\narray2= ", @newlinearray, "\n"; # print "\nseq: ", $seq, "\n"; $newlinearraylength = scalar @newlinearray; if($seq ne $topo) { for(my $k=0; $k<$newlinearraylength; ++$k) { print FOUT2 $newlinearray[$k], "vs.", $seq, "\n"; } } else { for(my $k=0; $k<($newlinearraylength-1); ++$k) { for(my $j=($k+1); $j<$newlinearraylength; ++$j) { print FOUT2 $newlinearray[$k], "vs.", $newlinearray[$j], "\n"; } } } } ################ ### comparing the seqs linewise & add the ancestral seqs to the 2-d array my $r=0; my @seqone = (); #print "\narray2= ", @newlinearray, "\n"; #print "\nseq: ", $seq, "\n"; my $s1; my $s2; for(my $j=($r+1); $j<$newlinearraylength; ++$j) { $s1 = $newlinearray[$r]; #assigning variable for the 1st seq to compare $s2 = $newlinearray[$j]; #assigning variable for the 2nd seq to compare my @sequence1 = (); my @sequence2 = (); my $base1=0; my $base2=0; for(my $row=0; $row<$seqno; $row++) { if($seqdataref->[$row][0] eq $s1) { for(my $k=1; $k<$seqlength; ++$k) { $sequence1[$base1] = $seqdataref->[$row][$k]; $base1++; } } if($seqdataref->[$row][0] eq $s2) { for(my $k=1; $k<$seqlength; ++$k) { $sequence2[$base2] = $seqdataref->[$row][$k]; $base2++; } } } my $l1 = scalar @sequence1; my $l2 = scalar @sequence2; #print "\nseq1 name: ", $s1, "\n"; #print "\nseq2 name: ", $s2, "\n"; #print "\nseq1: ", @sequence1, "\n"; #print "\nseq2: ", @sequence2, "\n"; my $l; if($l1<$l2) {$l=$l1;} else {$l=$l2;} #print "\n", $l, "\n"; ###### find out if any or both the sequences are */*..., rectify any */*, & get the ancestral ones ####### if($s1 =~ /^[0-9]*$/ && $s2 =~ /^[0-9]*$/) # case 1: *,* { my @seqtwo = (); for(my $p=0; $p<$l; ++$p) { if($sequence1[$p] eq $sequence2[$p]) { $seqtwo[$p] = $sequence1[$p]; } else { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } } for(my $p=0; $p<$l; ++$p) { if($seqone[$p] eq $seqtwo[$p]) { $seqone[$p] = $seqone[$p]; } else { $seqone[$p] = $seqone[$p].$seqtwo[$p]; } } #print "\nseqone (case 1):", @seqone, "\n"; #print "\nsequence1 (case 1):", @sequence1, "\n"; #print "\nsequence2 (case 1):", @sequence2, "\n"; #print "\nseqone (case 1):", @seqone, "\n"; } ################### end of case 1 ###################### elsif($s1 =~ /\// && $s2 =~ /^[0-9]*$/) # case 2: */*,* { my @seqtwo = (); for(my $p=0; $p<$l; ++$p) { if($sequence1[$p] eq $sequence2[$p]) { $seqtwo[$p] = $sequence1[$p]; } else { my $char1 = $sequence1[$p]; my @char1 = (); @char1 = split('', $char1); my $lenchar1 = scalar @char1; for(my $d=0; $d<$lenchar1; ++$d) { if($char1[$d] eq $sequence2[$p]) { $sequence1[$p] = $sequence2[$p]; $seqtwo[$p] = $sequence2[$p]; $d = $lenchar1; } elsif($char1[$d] =~ /[aA]/ && $sequence2[$p] =~ /[gG]/) { $sequence1[$p] = $char1[$d]; $seqtwo[$p] = $char1[$d].$sequence2[$p]; } elsif($char1[$d] =~ /[gG]/ && $sequence2[$p] =~ /[aA]/) { $sequence1[$p] = $char1[$d]; $seqtwo[$p] = $char1[$d].$sequence2[$p]; } elsif($char1[$d] =~ /[cC]/ && $sequence2[$p] =~ /[tT]/) { $sequence1[$p] = $char1[$d]; $seqtwo[$p] = $char1[$d].$sequence2[$p]; } elsif($char1[$d] =~ /[tT]/ && $sequence2[$p] =~ /[cC]/) { $sequence1[$p] = $char1[$d]; $seqtwo[$p] = $char1[$d].$sequence2[$p]; } } ########### situations like a or g vs. c : just need to include both in $seqtwo ####### if($sequence1[$p] =~ /^[aAgG]$/ && $sequence2[$p] =~ /^[cC]$/) # case: a or g & c { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[aAgG]$/ && $sequence2[$p] =~ /^[tT]$/) # case: a or g & t { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[cCtT]$/ && $sequence2[$p] =~ /^[aA]$/) # case: c or t & a { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[cCtT]$/ && $sequence2[$p] =~ /^[gG]$/) # case: c or t & g { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } ########## end of situations like c vs. a or g ###### ########### situations like a/g vs. c : which one to take ###### if($sequence1[$p] =~ /^[aA][gG]$/ && $sequence2[$p] =~ /^[cC]$/) # case: a/g & c { if($subca>$subcg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subca==$subcg) { if($subac>$subgc) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[aA][gG]$/ && $sequence2[$p] =~ /^[tT]$/) # case: a/g & t { if($subta>$subtg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subta==$subtg) { if($subat>$subgt) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[gG][aA]$/ && $sequence2[$p] =~ /^[cC]$/) # case: g/a & c { if($subcg>$subca) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subcg==$subca) { if($subgc>$subac) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[gG][aA]$/ && $sequence2[$p] =~ /^[tT]$/) # case: g/a & t { if($subtg>$subta) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subtg==$subta) { if($subgt>$subat) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[cC][tT]$/ && $sequence2[$p] =~ /^[aA]$/) # case: c/t & a { if($subac>$subat) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subac==$subat) { if($subca>$subta) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[tT][cC]$/ && $sequence2[$p] =~ /^[aA]$/) # case: t/c & a { if($subat>$subac) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subat==$subac) { if($subta>$subca) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[cC][tT]$/ && $sequence2[$p] =~ /^[gG]$/) # case: c/t & g { if($subgc>$subgt) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subgc==$subgt) { if($subcg>$subtg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[tT][cC]$/ && $sequence2[$p] =~ /^[gG]$/) # case: t/c & g { if($subgt>$subgc) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subgt==$subgc) { if($subtg>$subcg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } ####### end of situations like a/g vs. c ############### } } for(my $p=0; $p<$l; ++$p) { if($seqone[$p] eq $seqtwo[$p]) { $seqone[$p] = $seqone[$p]; } else { $seqone[$p] = $seqone[$p].$seqtwo[$p]; } } #print "\nseqone (case 2):", @seqone, "\n"; #print "\nsequence1 (case 2):", @sequence1, "\n"; #print "\nsequence2 (case 2):", @sequence2, "\n"; #print "\nseqone (case 2):", @seqone, "\n"; } ################## end of case 2 ###################### elsif($s1 =~ /^[0-9]*$/ && $s2 =~ /\//) # case 3: *, */* [the reverse of case 2] { my @seqtwo = (); for(my $p=0; $p<$l; ++$p) { if($sequence1[$p] eq $sequence2[$p]) { $seqtwo[$p] = $sequence1[$p]; } else { my $char2 = $sequence2[$p]; my @char2 = (); @char2 = split('', $char2); my $lenchar2 = scalar @char2; for(my $d=0; $d<$lenchar2; ++$d) { if($char2[$d] eq $sequence1[$p]) { $sequence2[$p] = $sequence1[$p]; $seqtwo[$p] = $sequence1[$p]; $d = $lenchar2; } elsif($char2[$d] =~ /[aA]/ && $sequence1[$p] =~ /[gG]/) { $sequence2[$p] = $char2[$d]; $seqtwo[$p] = $char2[$d].$sequence1[$p]; } elsif($char2[$d] =~ /[gG]/ && $sequence1[$p] =~ /[aA]/) { $sequence2[$p] = $char2[$d]; $seqtwo[$p] = $char2[$d].$sequence1[$p]; } elsif($char2[$d] =~ /[cC]/ && $sequence1[$p] =~ /[tT]/) { $sequence2[$p] = $char2[$d]; $seqtwo[$p] = $char2[$d].$sequence1[$p]; } elsif($char2[$d] =~ /[tT]/ && $sequence1[$p] =~ /[cC]/) { $sequence2[$p] = $char2[$d]; $seqtwo[$p] = $char2[$d].$sequence1[$p]; } } ########### situations like c vs. a or g : just need to include both in $seqtwo ####### if($sequence2[$p] =~ /^[aAgG]$/ && $sequence1[$p] =~ /^[cC]$/) # case: c & a or g { $seqtwo[$p] = $sequence2[$p].$sequence1[$p]; } if($sequence2[$p] =~ /^[aAgG]$/ && $sequence1[$p] =~ /^[tT]$/) # case: t & a or g { $seqtwo[$p] = $sequence2[$p].$sequence1[$p]; } if($sequence2[$p] =~ /^[cCtT]$/ && $sequence1[$p] =~ /^[aA]$/) # case: a & c or t { $seqtwo[$p] = $sequence2[$p].$sequence1[$p]; } if($sequence2[$p] =~ /^[cCtT]$/ && $sequence1[$p] =~ /^[gG]$/) # case: g & c or t { $seqtwo[$p] = $sequence2[$p].$sequence1[$p]; } ########## end of situations like c vs. a or g ###### ########### situations like c vs. a/g : which one to take ###### if($sequence2[$p] =~ /^[aA][gG]$/ && $sequence1[$p] =~ /^[cC]$/) # case: c & a/g { if($subca>$subcg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subca==$subcg) { if($subac>$subgc) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[aA][gG]$/ && $sequence1[$p] =~ /^[tT]$/) # case: a/g & t { if($subta>$subtg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subta==$subtg) { if($subat>$subgt) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[gG][aA]$/ && $sequence1[$p] =~ /^[cC]$/) # case: g/a & c { if($subcg>$subca) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subcg==$subca) { if($subgc>$subac) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[gG][aA]$/ && $sequence1[$p] =~ /^[tT]$/) # case: g/a & t { if($subtg>$subta) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subtg==$subta) { if($subgt>$subat) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[cC][tT]$/ && $sequence1[$p] =~ /^[aA]$/) # case: c/t & a { if($subac>$subat) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subac==$subat) { if($subca>$subta) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[tT][cC]$/ && $sequence1[$p] =~ /^[aA]$/) # case: t/c & a { if($subat>$subac) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subat==$subac) { if($subta>$subca) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[cC][tT]$/ && $sequence1[$p] =~ /^[gG]$/) # case: c/t & g { if($subgc>$subgt) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subgc==$subgt) { if($subcg>$subtg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } if($sequence2[$p] =~ /^[tT][cC]$/ && $sequence1[$p] =~ /^[gG]$/) # case: t/c & g { if($subgt>$subgc) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } elsif($subgt==$subgc) { if($subtg>$subcg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $char2[0].$sequence1[$p]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $char2[1].$sequence1[$p]; } } ####### end of situations like a/g vs. c ############### } } for(my $p=0; $p<$l; ++$p) { if($seqone[$p] eq $seqtwo[$p]) { $seqone[$p] = $seqone[$p]; } else { $seqone[$p] = $seqone[$p].$seqtwo[$p]; } } #print "\nseqone (case 3):", @seqone, "\n"; #print "\nsequence1 (case 3):", @sequence1, "\n"; #print "\nsequence2 (case 3):", @sequence2, "\n"; #print "\nseqone (case 3):", @seqone, "\n"; } ################### end of case 3 ###################### elsif($s1 =~ /\// && $s2 =~ /\//) # case 4: */*, */* { my @seqtwo = (); for(my $p=0; $p<$l; ++$p) { if($sequence1[$p] eq $sequence2[$p]) { my $seqchar1 = $sequence1[$p]; my @seqchar1 = (); @seqchar1 = split('', $seqchar1); my $lenseqchar1 = scalar @seqchar1; if($lenseqchar1>1) { my $finalchar = $lenseqchar1 - 1; $sequence1[$p] = $seqchar1[$finalchar]; $sequence2[$p] = $seqchar1[$finalchar]; $seqtwo[$p] = $seqchar1[$finalchar]; } else { $seqtwo[$p] = $sequence1[$p]; } } else { ### situations like a/g vs. g OR g/t OR g/t/c my $char1 = $sequence1[$p]; my @char1 = (); @char1 = split('', $char1); my $lenchar1 = scalar @char1; my $char2 = $sequence2[$p]; my @char2 = (); @char2 = split('', $char2); my $lenchar2 = scalar @char2; for(my $d1=0; $d1<$lenchar1; ++$d1) { for(my $d2=0; $d2<$lenchar2; ++$d2) { if($char1[$d1] eq $char2[$d2]) { $sequence1[$p] = $char1[$d1]; $sequence2[$p] = $char2[$d2]; $seqtwo[$p] = $char2[$d2]; $d1 = $lenchar1; $d2 = $lenchar2; } elsif($char1[$d1] =~ /[aA]/ && $char2[$d2] =~ /[gG]/) { $sequence1[$p] = $char1[$d1]; $sequence2[$p] = $char2[$d2]; $seqtwo[$p] = $char1[$d1].$char2[$d2]; } elsif($char1[$d1] =~ /[gG]/ && $char2[$d2] =~ /[aA]/) { $sequence1[$p] = $char1[$d1]; $sequence2[$p] = $char2[$d2]; $seqtwo[$p] = $char1[$d1].$char2[$d2]; } elsif($char1[$d1] =~ /[cC]/ && $char2[$d2] =~ /[tT]/) { $sequence1[$p] = $char1[$d1]; $sequence2[$p] = $char2[$d2]; $seqtwo[$p] = $char1[$d1].$char2[$d2]; } elsif($char1[$d1] =~ /[tT]/ && $char2[$d2] =~ /[cC]/) { $sequence1[$p] = $char1[$d1]; $sequence2[$p] = $char2[$d2]; $seqtwo[$p] = $char1[$d1].$char2[$d2]; } } } ########### situations like a or g vs. c : just need to include both in $seqtwo ####### if($sequence1[$p] =~ /^[aAgG]$/ && $sequence2[$p] =~ /^[cC]$/) # case: a or g & c { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[aAgG]$/ && $sequence2[$p] =~ /^[tT]$/) # case: a or g & t { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[cCtT]$/ && $sequence2[$p] =~ /^[aA]$/) # case: c or t & a { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence1[$p] =~ /^[cCtT]$/ && $sequence2[$p] =~ /^[gG]$/) # case: c or t & g { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } ########## end of situations like c vs. a or g ###### ########### situations like c vs. a or g : just need to include both in $seqtwo ####### if($sequence2[$p] =~ /^[aAgG]$/ && $sequence1[$p] =~ /^[cC]$/) # case: c & a or g { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence2[$p] =~ /^[aAgG]$/ && $sequence1[$p] =~ /^[tT]$/) # case: t & a or g { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence2[$p] =~ /^[cCtT]$/ && $sequence1[$p] =~ /^[aA]$/) # case: a & c or t { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($sequence2[$p] =~ /^[cCtT]$/ && $sequence1[$p] =~ /^[gG]$/) # case: g & c or t { $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } ########## end of situations like c vs. a or g ###### ########### situations like a/g vs. c : which one to take ###### if($sequence1[$p] =~ /^[aA][gG]$/ && $sequence2[$p] =~ /^[cC]$/) # case: a/g & c { if($subca>$subcg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subca==$subcg) { if($subac>$subgc) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[aA][gG]$/ && $sequence2[$p] =~ /^[tT]$/) # case: a/g & t { if($subta>$subtg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subta==$subtg) { if($subat>$subgt) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[gG][aA]$/ && $sequence2[$p] =~ /^[cC]$/) # case: g/a & c { if($subcg>$subca) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subcg==$subca) { if($subgc>$subac) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[gG][aA]$/ && $sequence2[$p] =~ /^[tT]$/) # case: g/a & t { if($subtg>$subta) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subtg==$subta) { if($subgt>$subat) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[cC][tT]$/ && $sequence2[$p] =~ /^[aA]$/) # case: c/t & a { if($subac>$subat) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subac==$subat) { if($subca>$subta) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[tT][cC]$/ && $sequence2[$p] =~ /^[aA]$/) # case: t/c & a { if($subat>$subac) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subat==$subac) { if($subta>$subca) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[cC][tT]$/ && $sequence2[$p] =~ /^[gG]$/) # case: c/t & g { if($subgc>$subgt) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subgc==$subgt) { if($subcg>$subtg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } if($sequence1[$p] =~ /^[tT][cC]$/ && $sequence2[$p] =~ /^[gG]$/) # case: t/c & g { if($subgt>$subgc) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } elsif($subgt==$subgc) { if($subtg>$subcg) { $sequence1[$p] = $char1[0]; $seqtwo[$p] = $char1[0].$sequence2[$p]; } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } else { $sequence1[$p] = $char1[1]; $seqtwo[$p] = $char1[1].$sequence2[$p]; } } ####### end of situations like a/g vs. c ############### ########### situations like c vs. a/g : which one to take ###### if($sequence2[$p] =~ /^[aA][gG]$/ && $sequence1[$p] =~ /^[cC]$/) # case: c & a/g { if($subca>$subcg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subca==$subcg) { if($subac>$subgc) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[aA][gG]$/ && $sequence1[$p] =~ /^[tT]$/) # case: a/g & t { if($subta>$subtg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subta==$subtg) { if($subat>$subgt) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[gG][aA]$/ && $sequence1[$p] =~ /^[cC]$/) # case: g/a & c { if($subcg>$subca) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subcg==$subca) { if($subgc>$subac) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[gG][aA]$/ && $sequence1[$p] =~ /^[tT]$/) # case: g/a & t { if($subtg>$subta) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subtg==$subta) { if($subgt>$subat) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[cC][tT]$/ && $sequence1[$p] =~ /^[aA]$/) # case: c/t & a { if($subac>$subat) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subac==$subat) { if($subca>$subta) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[tT][cC]$/ && $sequence1[$p] =~ /^[aA]$/) # case: t/c & a { if($subat>$subac) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subat==$subac) { if($subta>$subca) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[cC][tT]$/ && $sequence1[$p] =~ /^[gG]$/) # case: c/t & g { if($subgc>$subgt) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subgc==$subgt) { if($subcg>$subtg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } if($sequence2[$p] =~ /^[tT][cC]$/ && $sequence1[$p] =~ /^[gG]$/) # case: t/c & g { if($subgt>$subgc) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } elsif($subgt==$subgc) { if($subtg>$subcg) { $sequence2[$p] = $char2[0]; $seqtwo[$p] = $sequence1[$p].$char2[0]; } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } else { $sequence2[$p] = $char2[1]; $seqtwo[$p] = $sequence1[$p].$char2[1]; } } ####### end of situations like c VS. a/g ############### ########### situations like c/t vs. a/g or the other way round: which one to take ###### my $max2=0; if($subac>$subgt || $subca>$subtg) { $max2 = $subac; } else { $max2 = $subgt; } if($subgc>$max2 || $subcg>$max2) { $max2 = $subgc; } if($subat>$max2 || $subta>$max2) { $max2 = $subat; } if($sequence1[$p] =~ /^[cCtT][tTcC]$/ && $sequence2[$p] =~ /^[aAgG][gGaA]$/) { if($max2 = $subac) { $sequence1[$p] = 'C'; $sequence2[$p] = 'A'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subgt) { $sequence1[$p] = 'T'; $sequence2[$p] = 'G'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subgc) { $sequence1[$p] = 'C'; $sequence2[$p] = 'G'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subat) { $sequence1[$p] = 'T'; $sequence2[$p] = 'A'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } } if($sequence2[$p] =~ /^[cCtT][tTcC]$/ && $sequence1[$p] =~ /^[aAgG][gGaA]$/) { if($max2 = $subac) { $sequence2[$p] = 'C'; $sequence1[$p] = 'A'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subgt) { $sequence2[$p] = 'T'; $sequence1[$p] = 'G'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subgc) { $sequence2[$p] = 'C'; $sequence1[$p] = 'G'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } if($max2 = $subat) { $sequence2[$p] = 'T'; $sequence1[$p] = 'A'; $seqtwo[$p] = $sequence1[$p].$sequence2[$p]; } } ########### end of situations like c/t vs. a/g or the other way round ############# } } for(my $p=0; $p<$l; ++$p) { if($seqone[$p] eq $seqtwo[$p]) { $seqone[$p] = $seqone[$p]; } else { $seqone[$p] = $seqone[$p].$seqtwo[$p]; } } #print "\nseqone (case 4):", @seqone, "\n"; #print FOUT3 "\nS1:", $s1, " (case 4):", @sequence1, "\n"; #print FOUT3 "\nS2:", $s2, " (case 4):", @sequence2, "\n"; #print FOUT3 "\nSEQONE:", $s1, "/", $s2, " (case 4):", @seqone, "\n"; } ################### end of case 4 ###################### ## putting new sequences for $sequence1 &/or $sequence2 in the 2-d array if($s1 =~ /\//) { my $p=0; for(my $row=0; $row<$seqno; $row++) { if($seqdataref->[$row][0] eq $s1) { for(my $k=1; $k<$seqlength; ++$k) { $seqdataref->[$row][$k] = $sequence1[$p]; #print $seqdataref->[$row][$k]; $p++; } #print "\n"; } } } if($s2 =~ /\//) { my $p=0; for(my $row=0; $row<$seqno; $row++) { if($seqdataref->[$row][0] eq $s2) { for(my $k=1; $k<$seqlength; ++$k) { $seqdataref->[$row][$k] = $sequence2[$p]; #print $seqdataref->[$row][$k]; $p++; } #print "\n"; } } } ################## #print "\nseq1 name: ", $s1, "\n"; #print "\nseq2 name: ", $s2, "\n"; #print "\nseq1: ", @sequence1, "\n"; #print "\nseq2: ", @sequence2, "\n"; } ################## adding ancestral node sequences to the 2-d array #$seqno++; my $q=0; $seqdataref->[$seqno][0] = $seq; for(my $p=1; $p<$seqlength; ++$p) { $seqdataref->[$seqno][$p] = $seqone[$q]; #print $seqdataref->[$seqno][$p]; $q++; } #print "\n"; $seqno++; } close(FOUT2); ######## END OF comparing the seqs linewise & add the ancestral seqs to the 2-d array ######### ## printing the final 2-d array #for(my $row=0; $row<$seqno; $row++) #{ # print FOUT7 "\n>", $seqdataref->[$row][0], "\n"; # for(my $col=1; $col<$seqlength; $col++) # { # print FOUT7 $seqdataref->[$row][$col]; # } # print FOUT7 "\n"; #} #close(FOUT7); ###### PRIMARY & EXTERNAL ZONE VARIABLE DECLARATIONS ###### my @prizone = (); my @extzone = (); my $lenprizone = 0; my $lenextzone = 0; ########################################################### ######################################################################### ######################################################################### # FINAL PAIRWISE COMPARISON TO CALCULATE SYN & NONSYN DIFFERENCES # ######################################################################### ######################################################################### my $finalcompfile = 'seq_comp-out2.txt'; my @finalpaircomp = (); @finalpaircomp = get_file_data($finalcompfile); #print "\n", @finalpaircomp, "\n"; #my $totalsyn = 0; #my $totalnonsyn = 0; my $synnumber=0; my $nonsynnumber=0; foreach my $line1 (@finalpaircomp) { my $s1; #assigning variable for the 1st seq to compare my $s2; #assigning variable for the 2nd seq to compare my $seq1; my $seq2; my $len1; my $len2; my @l1 = (); my $count=0; @l1 = split('', $line1); do { $s1 = $s1.$l1[$count]; $count++; } until ($l1[$count] =~ /v/); $count = $count + 3; do { $s2 = $s2.$l1[$count]; $count++; } until ($l1[$count] =~ /\n/); for(my $row=0; $row<$seqno; $row++) { if($seqdataref->[$row][0] eq $s1) { for(my $k=1; $k<$seqlength; ++$k) { $seq1 = $seq1.$seqdataref->[$row][$k]; } } if($seqdataref->[$row][0] eq $s2) { for(my $k=1; $k<$seqlength; ++$k) { $seq2 = $seq2.$seqdataref->[$row][$k]; } } } $len1 = length $seq1; $len2 = length $seq2; my $len; if($len1 < $len2) { $len=$len1; } else { $len=$len2; } my $val = $len % 3; $len = $len - $val; #print "\n", $val, "\t", $len, "\n"; my $pos1; my $codon1; my $codon2; my $aa1; my $aa2; my $syn=0; my $nonsyn=0; for($pos1=0; $pos1<($len-2); ) { $codon1 = substr($seq1, $pos1, 3); $codon2 = substr($seq2, $pos1, 3); if($codon1 ne $codon2) { $aa1 = codon2aa($codon1); $aa2 = codon2aa($codon2); if($aa1 ne $aa2) { my $aapos = ($pos1 + 3)/3; print FOUT3 "\nAmino acid change: ", $aa1, $aapos, $aa2, "\n"; } my @codon1 = split('', $codon1); my @codon2 = split('', $codon2); print FOUT3 "codon1: ", @codon1, "\tcodon2: ", @codon2, "\n"; ############################################################### ##comparing 1st base position of the two codons my $n=0; my $s=0; if($codon1[0] ne $codon2[0]) { ##considering it as the 1st mutation my $a1 = $codon1[0].$codon1[1].$codon1[2]; my $a2 = $codon2[0].$codon1[1].$codon1[2]; my $newa1 = codon2aa($a1); my $newa2 = codon2aa($a2); if($newa1 ne '_' && $newa2 ne '_') { if($newa1 ne $newa2) { $n++; } else { $s++; } } ##considering it as the 2nd mutation (case 1) if($codon1[1] ne $codon2[1]){ my $a3 = $codon1[0].$codon2[1].$codon1[2]; my $a4 = $codon2[0].$codon2[1].$codon1[2]; my $newa3 = codon2aa($a3); my $newa4 = codon2aa($a4); if($newa3 ne '_' && $newa4 ne '_') { if($newa3 ne $newa4) { $n++; } else { $s++; } }} ##considering it as the 2nd mutation (case 2) if($codon1[2] ne $codon2[2]){ my $a5 = $codon1[0].$codon1[1].$codon2[2]; my $a6 = $codon2[0].$codon1[1].$codon2[2]; my $newa5 = codon2aa($a5); my $newa6 = codon2aa($a6); if($newa5 ne '_' && $newa6 ne '_') { if($newa5 ne $newa6) { $n++; } else { $s++; } }} ##considering it as the last mutation (after the other bases got the chance to be mutated) if($codon1[1] ne $codon2[1] && $codon1[2] ne $codon2[2]){ my $a7 = $codon1[0].$codon2[1].$codon2[2]; my $a8 = $codon2[0].$codon2[1].$codon2[2]; my $newa7 = codon2aa($a7); my $newa8 = codon2aa($a8); if($newa7 ne '_' && $newa8 ne '_') { if($newa7 ne $newa8) { $n++; } else { $s++; } }} #counting & comparing no. of syn or nonsyn possibilities #to assign the mutation either syn or nonsyn if($n<$s) { $syn++; print FOUT3 $codon1[0], ($pos1+1), $codon2[0], " : S\n"; } else { $nonsyn++; print FOUT3 $codon1[0], ($pos1+1), $codon2[0], " : N\n"; } } ############################################################### ##comparing 2nd base position of the two codons if($codon1[1] ne $codon2[1]) { $nonsyn++; print FOUT3 $codon1[1], ($pos1+2), $codon2[1], " : N\n"; } ############################################################### ##comparing 3rd base position of the two codons $n=0; $s=0; if($codon1[2] ne $codon2[2]) { ##considering it as the 1st mutation my $a1 = $codon1[0].$codon1[1].$codon1[2]; my $a2 = $codon1[0].$codon1[1].$codon2[2]; my $newa1 = codon2aa($a1); my $newa2 = codon2aa($a2); if($newa1 ne '_' && $newa2 ne '_') { if($newa1 ne $newa2) { $n++; } else { $s++; } } ##considering it as the 2nd mutation (case 1) if($codon1[0] ne $codon2[0]){ my $a3 = $codon2[0].$codon1[1].$codon1[2]; my $a4 = $codon2[0].$codon1[1].$codon2[2]; my $newa3 = codon2aa($a3); my $newa4 = codon2aa($a4); if($newa3 ne '_' && $newa4 ne '_') { if($newa3 ne $newa4) { $n++; } else { $s++; } }} ##considering it as the 2nd mutation (case 2) if($codon1[1] ne $codon2[1]){ my $a5 = $codon1[0].$codon2[1].$codon1[2]; my $a6 = $codon1[0].$codon2[1].$codon2[2]; my $newa5 = codon2aa($a5); my $newa6 = codon2aa($a6); if($newa5 ne '_' && $newa6 ne '_') { if($newa5 ne $newa6) { $n++; } else { $s++; } }} ##considering it as the last mutation (after the other bases got the chance to be mutated) if($codon1[0] ne $codon2[0] && $codon1[1] ne $codon2[1]){ my $a7 = $codon2[0].$codon2[1].$codon1[2]; my $a8 = $codon2[0].$codon2[1].$codon2[2]; my $newa7 = codon2aa($a7); my $newa8 = codon2aa($a8); if($newa7 ne '_' && $newa8 ne '_') { if($newa7 ne $newa8) { $n++; } else { $s++; } }} #counting & comparing no. of syn or nonsyn possibilities #to assign the mutation either syn or nonsyn if($n>$s) { $nonsyn++; print FOUT3 $codon1[2], ($pos1+3), $codon2[2], " : N\n"; } else { $syn++; print FOUT3 $codon1[2], ($pos1+3), $codon2[2], " : S\n"; } } ################################################################ } $pos1 = $pos1 + 3; #print FOUT3 "\n\n"; } print FOUT3 $s1, " vs. ", $s2, " : ", $syn, " S, ", $nonsyn, " N\n\n\n"; #print FOUT3 "\n\n"; #$totalsyn = $totalsyn + $syn; #$totalnonsyn = $totalnonsyn + $nonsyn; if($syn > 0 && $nonsyn == 0) { $synnumber++; } if($nonsyn > 0) { $nonsynnumber++; } #### INCORPORATING BRANCH LENGTHS IN THE ARRAY @NEWTOPOLOGY SEQUENTIALLY #### my $totchange = $syn + $nonsyn; #print "\n", $totchange, "\n"; my $colon = ':'; $totchange = $colon.$totchange; my $lastseq; my $eachchar=0; if($s1 =~ /\//) { my @s1 = (); @s1 = split('', $s1); my $s1len = scalar @s1; for(my $char=($s1len-1); $char>0; $char--) { if($s1[$char] eq '/') { $char=0; } else { $lastseq = $s1[$char].$lastseq; } } } #print "\n", $lastseq, "\n"; for(my $j=0; $j<$lennewtopology; ) { if($s1 =~ /\//) { my $m=0; my $n=0; if($newtopology[$j] eq $lastseq) { #print "\n", $newtopology[$j], "\n"; $bltopo[$eachchar] = $newtopology[$j]; $j++; $eachchar++; for($m=$j; $m<$lennewtopology; ) { $n = $m+1; if($newtopology[$m] eq ')' && $newtopology[$n] eq ')') { $bltopo[$eachchar] = $newtopology[$m]; $eachchar++; $bltopo[$eachchar] = $totchange; $eachchar++; $bltopo[$eachchar] = $newtopology[$n]; $m = $lennewtopology; $eachchar++; } elsif($newtopology[$m] eq ')' && $newtopology[$n] eq ',') { $bltopo[$eachchar] = $newtopology[$m]; $eachchar++; $bltopo[$eachchar] = $totchange; $eachchar++; $bltopo[$eachchar] = $newtopology[$n]; $m = $lennewtopology; $eachchar++; } else { $bltopo[$eachchar] = $newtopology[$m]; $m++; $n++; $eachchar++; } } $j=$n+1; } else { $bltopo[$eachchar] = $newtopology[$j]; $j++; $eachchar++; } } else { #print $newtopology[$j], "\n"; if($newtopology[$j] =~ /^$s1$/) { #print $s1, "\t", $newtopology[$j], "\n"; $bltopo[$eachchar] = $newtopology[$j]; $eachchar++; $bltopo[$eachchar] = $totchange; $j++; #print "\n", $bltopo[$eachchar], "\n"; $eachchar++; } else { #print $newtopology[$j], "\t"; $bltopo[$eachchar] = $newtopology[$j]; $j++; $eachchar++; } } } #print FOUT5 @bltopo, "\n"; #print FOUT5 @newtopology, "\n\n\n\n"; @newtopology = @bltopo; $lennewtopology = scalar @newtopology; #### END OF SEQUENTIAL INCORPORATION OF BRANCH LENGTHS IN THE ARRAY @NEWTOPOLOGY } #print @newtopology, "\n"; ########## END OF FINAL PAIRWISE COMPARISON ########### ###### PRINTING @BLTOPO HAVING BRANCH-LENGTH INFO IN A NEW FILE ####### ###### AFTER REPLACING SEQ NO. BY SEQ NAME ################ ###### AND ADDING SYN/NONSYN INFO FOR EACH EXTANT NODE ###### close(FOUT3); my $blenfile = 'pairwise-variation.txt'; my @bleninfo = (); @bleninfo = get_file_data($blenfile); #print $bleninfo[0], "\n"; ### NEW ADDITION ### ### WRITING PRIMARY & EXTERNAL ZONE SEQUENCES IN TWO DIFFERENT ARRAYS ### my $lenbleninfo = scalar @bleninfo; my @bleninfocopy = (); @bleninfocopy = @bleninfo; my $lenbleninfocopy = scalar @bleninfocopy; #print "\nlength: ", $lenbleninfo, "\n"; #print "\n1: ", $bleninfo[343], "\n"; #print "\n2: ", $bleninfo[341], "\n"; #print $lenfastafile, "\n\n"; #print $totalsyn, "\t", $totalnonsyn, "\n"; for(my $i=0; $i<$lenbleninfo; $i++) { my $fs=''; my $ss=''; my $fsbl=''; my $bl=''; if($bleninfo[$i] =~ /\svs.\s/) { my @line1 = (); my $j1=0; @line1 = split('', $bleninfo[$i]); my $lline1 = scalar @line1; for(my $j=0; $j<$lline1; $j++) { if($line1[$j] =~ /\s/) { $j1=$j+5; $j=$lline1; } else { $fs = $fs.$line1[$j]; } } for(my $j=$j1; $j<$lline1; $j++) { if($line1[$j] =~ /\s/) { $j=$lline1; } else { $ss = $ss.$line1[$j]; } } if($fs !~ /\//) { for(my $k=0; $k<$lline1; $k++) { my $next=$k+1; if($line1[$k] eq ':' && $line1[$next] eq ' ') { my $m = $k + 2; for(my $n=$m; $n<$lline1; $n++) { $fsbl = $fsbl.$line1[$n]; } } } $fsbl =~ s/\s//g; if($fsbl =~ /[0-9]*S,[1-9][0-9]*N/) { $extzone[$lenextzone] = $fs; $lenextzone++; #print "\nE: ", $fs, "\n"; } elsif($fsbl =~ /[1-9][0-9]*S,0N/) { $prizone[$lenprizone] = $fs; $lenprizone++; #print "\nP: ", $fs, "\n"; } elsif($fsbl =~ /0S,0N/) { if($i < ($lenbleninfo-3)) { my $oldi = $i-1; my $newi = $i+1; #print "\nNewi: ", $newi, "\tfs: ", $fs, "\tss: ", $ss, "\n"; my $fs2 = $fs; my $ss2 = $ss; $ss2 =~ s/\/$fs2//; $ss2 =~ s/$fs2\///; for(my $p=$oldi; $p>0; $p--) { if($bleninfocopy[$p] =~ /^$ss2\svs.\s$ss\s:\s[1-9][0-9]*\sS,\s0\sN$/) { $prizone[$lenprizone] = $fs; $lenprizone++; } } for(my $p=$newi; $p<$lenbleninfocopy; $p++) { if($bleninfocopy[$p] =~ /^$ss2\svs.\s$ss\s:\s[1-9][0-9]*\sS,\s0\sN$/) { $prizone[$lenprizone] = $fs; $lenprizone++; } } for(my $p=$oldi; $p>0; $p--) { if($bleninfocopy[$p] =~ /\s$ss\s:\s[1-9][0-9]*\sS,\s0\sN/) { $prizone[$lenprizone] = $fs; $lenprizone++; } } for(my $p=$newi; $p<$lenbleninfocopy; $p++) { if($bleninfocopy[$p] =~ /\s$ss\s:\s[1-9][0-9]*\sS,\s0\sN/) { $prizone[$lenprizone] = $fs; $lenprizone++; } } for(my $p=$newi; $p<$lenbleninfocopy; $p++) { my $fsnext=''; if($bleninfocopy[$p] =~ /\svs.\s/) { #print "\nline: ", $bleninfocopy[$p], "\n"; my @line2 = (); @line2 = split('', $bleninfocopy[$p]); my $lline2 = scalar @line2; for(my $q=0; $q<$lline2; $q++) { if($line2[$q] =~ /\s/) { $q=$lline2; } else { $fsnext = $fsnext.$line2[$q]; } } #print "\nfsnext: ", $fsnext, "\n"; if($fsnext eq $ss) { for(my $r=0; $r<$lline2; $r++) { my $nexttor=$r+1; if($line2[$r] eq ':' && $line2[$nexttor] eq ' ') { my $s = $r + 2; for(my $t=$s; $t<$lline2; $t++) { $bl = $bl.$line2[$t]; } $bl =~ s/\s//g; #print "\n", $fsnext, "\t", $bl, "\t", $fs, "\n"; if($bl =~ /[0-9]*S,[1-9][0-9]*N/) { $extzone[$lenextzone] = $fs; #print "\nE2: ", $extzone[$lenextzone], "\n"; $lenextzone++; } elsif($bl =~ /[1-9][0-9]*S,0N/) { $prizone[$lenprizone] = $fs; #print "\nP2: ", $prizone[$lenprizone], "\n"; $lenprizone++; } } } } } } } elsif($i == ($lenbleninfo-3)) { my $synno = 0; my $oldi = $i-1; for(my $p=$oldi; $p>0; $p--) { if($bleninfocopy[$p] =~ /$ss\s:\s[1-9][0-9]*\sS,\s0\sN/) { $synno++; } } if($synno>0) { $prizone[$lenprizone] = $fs; $lenprizone++; } else { $extzone[$lenextzone] = $fs; $lenextzone++; } } } } } } #print @extzone, "\n\n"; ### END OF WRITING PRIMARY & EXTERNAL ZONE SEQUENCES IN TWO DIFFERENT ARRAYS ### ### END OF NEW ADDITION ### my $lenbltopo = scalar @bltopo; foreach my $line1 (@bleninfo) { #print $line1, "\n"; my $firstseq=''; my $firstseqbl=''; if($line1 =~ /\svs.\s/) { my @line1 = (); @line1 = split('', $line1); my $lline1 = scalar @line1; #print $line1, "\n"; for(my $i=0; $i<$lline1; $i++) { if($line1[$i] =~ /[\/\s]/) { $i = $lline1; #$firstseq = ''; } else { $firstseq = $firstseq.$line1[$i]; } } for(my $i=0; $i<$lline1; $i++) { my $next=$i+1; if($line1[$i] eq ':' && $line1[$next] eq ' ') { my $j = $i + 2; for(my $k=$j; $k<$lline1; $k++) { $firstseqbl = $firstseqbl.$line1[$k]; } $firstseqbl =~ s/\s//g; #print $firstseqbl, "\n"; my $dash = '-'; if($firstseqbl ne '0S,0N') { $firstseqbl = $dash.$firstseqbl; } else { $firstseqbl = ''; } $i = $lline1; #print $firstseqbl, "\n"; } } #print $firstseq, "\t", $firstseqbl, "\n"; for(my $i=0; $i<$lenbltopo; $i++) { my $j=$i+1; if($bltopo[$i] =~ /^$firstseq$/ && $bltopo[$j] =~ /:/) { my $s = $bltopo[$i]; my $t = $seqinfo[$s-1]; $t =~ s/\s//; $bltopo[$i] =~ s/$s/$t/; $bltopo[$i] = $bltopo[$i].$firstseqbl; $bltopo[$i] =~ s/,/\//; #print $bltopo[$i], "\n"; } } } } #print FOUT5 @bltopo; #print @bltopo, "\n"; ###### END OF PRINTING @NEWTOPOLOGY HAVING BRANCH-LENGTH INFO IN A NEW FILE ####### ### WRITING PRIMARY & EXTERNAL ZONE SEQUENCES (REPLACING SEQ NO BY NAMES) IN TWO OUTPUT FILES ### ### & ALSO CREATING TWO NEW ARRAYS @PZONE & @EZONE REMOVING THE SPACES AFTER EACH SEQ NAME ### my $newname1=0; my $newname2=0; my @pzone = (); my @ezone = (); my $pcount = 0; my $ecount = 0; print FOUT4 "PRIMARY ZONE REPRESENTATIVES\n"; print FOUT4 "----------------------------\n"; for(my $l1=0; $l1<$lenprizone; $l1++) { for(my $l2=($l1+1); $l2<($lenprizone+1); $l2++) { if($prizone[$l1] eq $prizone[$l2]) { $newname1=0; $l2=$lenprizone; } else { $newname1=$newname1+1; } } if($newname1>0) { if($prizone[$l1] !~ /\//) { my $name1 = $prizone[$l1] - 1; print FOUT4 $seqinfo[$name1]; $pzone[$pcount] = $seqinfo2[$name1]; #print $pzone[$pcount]; $pcount++; } } } if($nonsynnumber == 0) { my $plen=0; for(my $i=0; $i<$lenfastafile; $i++) { $prizone[$plen] = $i + 1; $plen++; } for(my $l1=0; $l1<$plen; $l1++) { my $name1 = $prizone[$l1] - 1; print FOUT4 $seqinfo2[$name1]; $pzone[$pcount] = $seqinfo2[$name1]; $pcount++; } } print FOUT4 "\n\n\n\nEXTERNAL ZONE REPRESENTATIVES\n"; print FOUT4 "-----------------------------\n"; for(my $l1=0; $l1<$lenextzone; $l1++) { #print $extzone[$l1], "\n"; #print $seqinfo[$l1], "\n"; for(my $l2=0; $l2<$lenprizone; $l2++) { if($extzone[$l1] eq $prizone[$l2]) { $newname2=0; $l2=$lenprizone; } else { $newname2=$newname2+1; } } if($newname2>0) { if($extzone[$l1] !~ /\//) { my $name2 = $extzone[$l1] - 1; print FOUT4 $seqinfo[$name2]; $ezone[$ecount] = $seqinfo2[$name2]; #print $ezone[$ecount]; $ecount++; } } } if($synnumber == 0) { my $elen=0; for(my $i=0; $i<$lenfastafile; $i++) { $extzone[$elen] = $i + 1; $elen++; } for(my $l1=0; $l1<$elen; $l1++) { my $name2 = $extzone[$l1] - 1; print FOUT4 $seqinfo2[$name2]; $ezone[$ecount] = $seqinfo2[$name2]; #print $ezone[$ecount],"\n"; $ecount++; } } #print @extzone, "\n"; #print @seqinfo2, "\n"; #print "\n", @pzone, "\n"; #print "\n", @ezone, "\n"; my $lenpzone = scalar @pzone; my $lenezone = scalar @ezone; #print FOUT4 @prizone; #print FOUT5 @extzone; ### END OF WRITING PRIMARY & EXTERNAL ZONE SEQUENCES (REPLACING SEQ NO BY NAMES) IN TWO OUTPUT FILES ### ##### SEPARATING DIFFERENT STRUCTURAL VARIANTS, IF ANY, IN THE PRIMARY ZONE ##### ##### & PLACING THEM IN DIFFERENT ARRAYS, EACH CONTAINING A SINGLE VARIANT ##### my $number=0; my @newprizone=(); for(my $a=0; $a<$lenpzone; $a++) { #if($pzone[$a] =~ /^\s+$/) #{ # last; #} my $newprizonelen = scalar @newprizone; my $cc1=0; my @s1 = (); my @s2 = (); #print "\n\n\nnew: ", @newprizone, "\n"; #print "\na :", $a, "\n\n"; for(my $c=0; $c<$newprizonelen; $c++) { #if($newprizone[$c] =~ /^\s+$/) #{ # last; #} if($newprizone[$c] =~ /PRI.*-$pzone[$a]$/) { $cc1++; $c = $newprizonelen; } } if($cc1 eq 0) { $number++; #print "\n", $number, "\n"; $newprizone[$a] = 'PRI'.$number.'-'.$pzone[$a]; #print "\na: ", $newprizone[$a], "\n"; for(my $s=0; $s<$oriseqno; $s++) { if($protseqdataref->[$s][0] eq $pzone[$a]) { for(my $t=1; $t<$protlen; $t++) { my $u = $t-1; $s1[$u] = $protseqdataref->[$s][$t]; } } } #print "\ns1: ", $pzone[$a], ": ", @s1, "\n"; for(my $b=($a+1); $b<$lenpzone; $b++) { my $diff=0; for(my $s=0; $s<$oriseqno; $s++) { if($protseqdataref->[$s][0] eq $pzone[$b]) { for(my $t=1; $t<$protlen; $t++) { my $u = $t-1; $s2[$u] = $protseqdataref->[$s][$t]; } } } #print "\ns2: ", $pzone[$b], ": ", @s2, "\n"; for(my $v=0; $v<($protlen-1); $v++) { if($s1[$v] ne $s2[$v]) { $diff++; } } if($diff==0) { $newprizone[$b] = 'PRI'.$number.'-'.$pzone[$b]; #print "\n", $newprizone[$b], "\n"; } } } } #print "\npri: ", $newprizone[0], $newprizone[1], "\n"; #print "\npzone: ", $pzone[0], $pzone[1], "\n"; ##### END OF SEPARATING DIFFERENT STRUCTURAL VARIANTS, IF ANY, IN THE PRIMARY ZONE ##### ##### INCORPORATING "PRI" OR "EXT" DESIGNATION IN THE TOPOLOGY ARRAY @BLTOPO & STORING IT IN THE OUTPUT FILE "FOUT5" ##### $lenbltopo = scalar @bltopo; #for(my $i=0; $i<$lenbltopo; $i++) #{ # print $bltopo[$i], "\n"; #} my $lennewprizone = scalar @newprizone; #for(my $i=0; $i<$lennewprizone; $i++) #{ # print $pzone[$i], "\t", $newprizone[$i], "\n"; #} for(my $c1=0; $c1<$lenbltopo; $c1++) { for(my $c2=0; $c2<$lenpzone; $c2++) { if($bltopo[$c1] =~ /^$pzone[$c2].*$/) { if($bltopo[$c1] =~ /-/) { my @c1bltopo = (); @c1bltopo = split('', $bltopo[$c1]); my $count= scalar @c1bltopo; my $newpri; for(my $ccc=$count; $ccc>0; $ccc--) { if($c1bltopo[$ccc] eq '-') { $newpri = $c1bltopo[$ccc].$newpri; $ccc = 0; } else { $newpri = $c1bltopo[$ccc].$newpri; } } $bltopo[$c1] = $newprizone[$c2].$newpri; } else { $bltopo[$c1] = $newprizone[$c2]; } } } } for(my $c1=0; $c1<$lenbltopo; $c1++) { for(my $c2=0; $c2<$lenezone; $c2++) { if($bltopo[$c1] =~ /^$ezone[$c2].*$/) { $bltopo[$c1] = 'EXT-'.$bltopo[$c1]; #print "\n", $bltopo[$c1]; } } } #print FOUT5 @bltopo; ##### END OF INCORPORATING "PRI" OR "EXT" DESIGNATION IN THE TOPOLOGY ARRAY @BLTOPO & STORING IT IN THE OUTPUT FILE "FOUT5" ##### ##### REPLACING SEQUENCE NUMBERS BY NAMES IN THE OUTPUT FILE "PAIRWISE-VARIATION.TXT" ##### my $outfile = 'pairwise-variation.txt'; my @outinfo = (); @outinfo = get_file_data($outfile); #my $lenfile = scalar @seqinfo; #for(my $i=0; $i<$lenfile; $i++) #{ # print $i, "\t", $seqinfo[$i], "\n"; #} foreach my $line (@outinfo) { if($line =~ /\svs.\s/) { my $numtoname = ''; my $newline = ''; my @line1 = (); @line1 = split('', $line); my $lline = scalar @line1; for(my $i=0; $i<$lline; $i++) { if($line1[$i] =~ /:/) { for(my $j=$i; $j<$lline; $j++) { $newline = $newline.$line1[$j]; } $i=$lline; } elsif($line1[$i] =~ /[\/\s]/ && $line1[$i-1] =~ /[0-9]/) { $numtoname = $numtoname - 1; $numtoname = $seqinfo[$numtoname]; $newline = $newline.$numtoname.$line1[$i]; $numtoname = ''; } elsif($line1[$i] =~ /[0-9]/) { $numtoname = $numtoname.$line1[$i]; } else { $newline = $newline.$line1[$i]; } } $newline =~ s/\n//g; $line = $newline; } } #print @outinfo, "\n"; my $lenoutinfo = scalar @outinfo; my $out3 = "pairwise-variation.txt"; open(FOUT3, ">$out3"); print FOUT3 @outinfo; close(FOUT3); ##### END OF REPLACING SEQUENCE NUMBERS BY NAMES IN THE OUTPUT FILE "PAIRWISE-VARIATION.TXT" ##### ##### INCORPORATING THE BRANCH-LENGTH INFO FOR THE INTERNAL EXISTING NODES IN THE TOPOLOGY FILE ##### for(my $i=0; $i<$lenbltopo; $i++) { #print $bltopo[$i], "\n"; if($bltopo[$i] !~ /\//) { if($bltopo[$i] =~ /^PRI/ || $bltopo[$i] =~ /^EXT/) { #print $bltopo[$i], "\n"; my $bl; my $fseqfinal; my $sseq; my $startline; for(my $line=0; $line<$lenoutinfo-3; $line++) { my $k=0; if($outinfo[$line] =~ /\svs.\s/) { my @line1 = (); @line1 = split('', $outinfo[$line]); my $lline1 = scalar @line1; my $fseq; #my $sseq; for(my $j=0; $j<$lline1; $j++) { if($line1[$j] =~ /\s/) { $k=$j; $j=$lline1; } else { $fseq = $fseq.$line1[$j]; } } if($bltopo[$i] =~ /$fseq$/) { $k=$k+5; $fseqfinal = $fseq; for(my $l=$k; $l<$lline1; $l++) { if($line1[$l] =~ /\s/) { $l=$lline1; } else { $sseq = $sseq.$line1[$l]; } } #print $fseqfinal, "\t", $sseq, "\n"; $startline = $line + 1; $line = $lenoutinfo; } } } for(my $line=$startline; $line<$lenoutinfo-3; $line++) { my $k=0; if($outinfo[$line] =~ /\svs.\s/) { my @line2 = (); @line2 = split('', $outinfo[$line]); my $lline2 = scalar @line2; my $f2seq; for(my $j=0; $j<$lline2; $j++) { if($line2[$j] =~ /\s/) { $k=$j+1; $j=$lline2; } else { $f2seq = $f2seq.$line2[$j]; } } if($f2seq eq $sseq) { for(my $l=$lline2; $l>$k; $l--) { if($line2[$l] =~ /:/) { $l=$k; } else { $bl = $line2[$l].$bl; } } $line = $lenoutinfo; } } } #print $bl, "\n"; $bl =~ s/\s//g; $bl =~ tr/,/\//; #print $bltopo[$i], "\n"; if($bl !~ /[0-9]/) { $bltopo[$i] = $bltopo[$i]; } else { $bltopo[$i] = $bltopo[$i].'-'.$bl; } #print $bltopo[$i], "\n\n\n"; } } } #print FOUT5 @bltopo; ##### END OF INCORPORATING THE BRANCH-LENGTH INFO FOR THE INTERNAL EXISTING NODES IN THE TOPOLOGY FILE ##### ##### INCORPORATING THE BRANCH-LENGTH INFO FOR THE INTERNAL HYPOTHETICAL NODES IN THE TOPOLOGY FILE ##### my @exclude = (); my $cell=0; my $hypono = 1; for(my $line=0; $line<$lenoutinfo; $line++) { my $k=0; if($outinfo[$line] =~ /\svs.\s/) { my @line1 = (); @line1 = split('', $outinfo[$line]); my $lline1 = scalar @line1; #my $fseq; my $sseq; for(my $j=0; $j<$lline1; $j++) { if($line1[$j] eq '.') { my $k=$j+1; for(my $l=$k; $l<$lline1; $l++) { if($line1[$l] eq ':') { $l=$lline1; } else { $sseq = $sseq.$line1[$l]; } } $j=$lline1; } } my $lenexclude = scalar @exclude; my $n=0; for(my $i=0; $i<$lenexclude; $i++) { if($exclude[$i] eq $sseq) { $n++; } } if($n==0) { #print $sseq, "\n"; $exclude[$cell] = $sseq; $cell++; if($outinfo[$line] !~ /\s0\sS,\s0\sN/) { #print $sseq, "\n"; my $m = $line+1; my $p = 0; for(my $n=$m; $n<$lenoutinfo; $n++) { if($outinfo[$n] =~ /$sseq/ && $outinfo[$n] =~ /\s0\sS,\s0\sN/) { $p=0; $n = $lenoutinfo; } elsif($outinfo[$n] =~ /$sseq/ && $outinfo[$n] !~ /\s0\sS,\s0\sN/) { $p++; #print $sseq, "\n"; } } if($p>0) { #print $sseq, "\n"; for(my $q=0; $q<$lenoutinfo; $q++) { if($outinfo[$q] =~ /\svs.\s/) { my @line2 = (); @line2 = split('', $outinfo[$q]); my $lline2 = scalar @line2; my $fseq=''; my $s; my $blhypo = ''; for(my $r=0; $r<$lline2; $r++) { if($line2[$r] =~ /\s/) { $s=$r+1; $r=$lline2; } else { $fseq = $fseq.$line2[$r]; } } #print $fseq, "\n"; $sseq =~ s/\s//g; #print $sseq, "\n\n"; if($sseq eq $fseq) { #print $fseq, "\n\n"; for(my $t=$lline2; $t>$s; $t--) { if($line2[$t] eq ':') { $t=$s; } else { $blhypo = $line2[$t].$blhypo; } } $blhypo =~ s/\s//g; $blhypo =~ tr/,/\//; #print $fseq, "\t", $blhypo,"\n"; $q=$lenoutinfo; ### INCORPORATING LENGTH IN TOPOLOGY ### my @fseq = (); @fseq = split('', $fseq); my $lenfseq = scalar @fseq; #my $nofseq = 0; my $fseqfirst = ''; my $fseqlast = ''; my $startpoint; my $endpoint; my $newendpoint = 0; my $bracketno = 0; my $zerono = 0; my $openbno = 0; my $closebno = 0; for(my $v=0; $v<$lenfseq; $v++) { if($fseq[$v] =~ /\//) { $v=$lenfseq; } else { $fseqfirst = $fseqfirst.$fseq[$v]; } } #print $fseqfirst, "\n"; for(my $v=($lenfseq-1); $v>0; $v--) { if($fseq[$v] =~ /\//) { $v=0; } else { $fseqlast = $fseq[$v].$fseqlast; } } #print $fseqfirst, "\t", $fseqlast, "\n"; for(my $a=0; $a<$lenbltopo; $a++) { if($bltopo[$a] =~ /-$fseqfirst-/) { $startpoint = $a; } if($bltopo[$a] =~ /-$fseqlast-/) { $endpoint = $a; } } for(my $b=$startpoint; $b<$endpoint; $b++) { if($bltopo[$b] eq ':0') { $zerono++; } if($bltopo[$b] =~ /\(/) { $openbno++; } if($bltopo[$b] =~ /\)/) { $closebno++; } } #print $zerono, "\t", $openbno, "\t", $closebno, "\n"; if($zerono==0 && $openbno==0 && $closebno==0) #if($zerono==0) { my $spoint = $startpoint; $bltopo[$spoint] = 'H'.$hypono.'-'.$blhypo.':0,'.$bltopo[$spoint]; #print $bltopo[$spoint], "\n"; $hypono++; } else { my $frontb=0; my $backb=0; for(my $d1=($startpoint-1); $d1>0; $d1--) { #print "sp: ", $startpoint, "\n"; if($bltopo[$d1] =~ /\(/) { $frontb++; #print "frontb: ", $frontb, "\n"; } if($bltopo[$d1] !~ /\($/) { $d1=0; } #print "frontb: ", $frontb, "\n"; } for(my $d2=($endpoint+1); $d2<$lenbltopo; $d2++) { if($bltopo[$d2] =~ /[,;]/) { $d2=$lenbltopo; } elsif($bltopo[$d2] =~ /\)/) { $backb++; } } #print $fseq, "\t", $frontb, "\t", $backb, "\n"; my $openb = $openbno + 1; my $closeb = $closebno + $backb; my $gap = $closeb - $openb; if($gap>($frontb-1)) { $gap = $frontb-1; } #print $fseq, "\t", $gap, "\n\n\n"; my $spoint = $startpoint - $gap; $bltopo[$spoint] = 'H'.$hypono.'-'.$blhypo.':0,'.$bltopo[$spoint]; $hypono++; } } } } } } } } } #print FOUT5 @bltopo; #close(FOUT5); ##### END OF INCORPORATING THE BRANCH-LENGTH INFO FOR THE INTERNAL HYPOTHETICAL NODES IN THE TOPOLOGY FILE ##### ##### CREATING ANOTHER TREE FILE REPLACING SEQ NAME BY AMINO-ACID MUTATION INFO ##### #for(my $c=0; $c<$lenbltopo; $c++) #{ # print $bltopo[$c], "\n"; #} ########### my @bltopo2 = (); for(my $c=0; $c<$lenbltopo; $c++) { if($bltopo[$c] =~ /^H[0-9]*-/) { my @arr = (); @arr = split('', $bltopo[$c]); my $lenarr = scalar @arr; my $var1 = ''; my $hy = ''; my $name = ''; my $numopenb = 0; my $appcount = 0; my $appcount2 = 0; my $count = 0; my $hypoinfo = ''; for(my $i=0; $i<$lenarr; $i++) { if($arr[$i] =~ /-/ && $arr[$i-2] =~ /H/ && ($i-2) eq '0') { $var1 = $var1.$arr[$i]; $hy = $var1; $var1 = ''; #print "hy: ", $hy, "\t"; } elsif($arr[$i] =~ /-/ && $arr[$i-3] =~ /H/ && ($i-3) eq '0') { $var1 = $var1.$arr[$i]; $hy = $var1; $var1 = ''; #print "hy: ", $hy, "\t"; } elsif($arr[$i] =~ /:/ && $arr[$i+1] =~ /0/ && $arr[$i+2] =~ /,/) { $var1 = $arr[$i]; } else { $var1 = $var1.$arr[$i]; } } for(my $i=0; $i<$lenarr; $i++) { if($arr[$i] =~ /:/ && $arr[$i+1] =~ /0/ && $arr[$i+2] =~ /,/) { $i = $lenarr; } else { $hypoinfo = $hypoinfo.$arr[$i]; } } #print "\nhypoinfo: ", $hypoinfo, "\tvar1: ", $var1, "\n"; for(my $c2=$c; $c2<$lenbltopo; $c2++) { if($bltopo[$c2] =~ /\(/) { $numopenb++; } if($bltopo[$c2] =~ /PRI[0-9]*-/ || $bltopo[$c2] =~ /EXT-/) { my @arrc2 = (); @arrc2 = split('', $bltopo[$c2]); my $lenarrc2 = scalar @arrc2; #$name = ''; my $varname = ''; for(my $c3=0; $c3<$lenarrc2; $c3++) { if($arrc2[$c3] =~ /-/ && $arrc2[$c3-1] =~ /[\dT]/ && $arrc2[$c3-2] =~ /[\dIX]/ && $arrc2[$c3-3] =~ /[IRE]/) { $varname = ''; } elsif($arrc2[$c3] =~ /-/ && $arrc2[$c3+1] =~ /[\d]/ && $arrc2[$c3+2] =~ /[\dS]/ && $arrc2[$c3+3] =~ /[S\/]/) { $name = $varname; $varname = ''; } else { $varname = $varname.$arrc2[$c3]; #print "varname: ", $varname, "\n"; } } $c2 = $lenbltopo; } } #print "bltopo(c): ", $bltopo[$c], "\n"; #print "hy: ", $hy, "\tvar1: ", $var1, "\tname: ", $name, "\n"; $appcount = $numopenb + 2; #print "appcount: ", $appcount, "\n\n"; my $changeinfo = ''; my $changeinfo2 = ''; for(my $j=0; $j<$lenoutinfo; $j++) { if($outinfo[$j] =~ /$name\/.*\svs.\s/ || $outinfo[$j] =~ /$name\svs.\s/) { $appcount2++; if($appcount2 == $appcount) { my $count=0; $changeinfo2 = ''; for(my $m=($j-1); $m>=0; $m--) { if($outinfo[$m] =~ /\svs.\s/) { $m = -1; } elsif($outinfo[$m] =~ /^Amino\sacid\s/) { $count++; if($count>1) { $changeinfo2 = '/'.$changeinfo2; } my @arrinfo = (); @arrinfo = split('', $outinfo[$m]); my $lenarrinfo = scalar @arrinfo; for(my $n=($lenarrinfo-1); $n>0; $n--) { if($arrinfo[$n] =~ /\s/ && $arrinfo[$n-1] =~ /:/) { my @changedir = (); $changeinfo =~ s/\s//g; @changedir = split('', $changeinfo); my $lenchangedir = ''; my $lastval = ''; my $firstval = ''; $lastval = pop @changedir; $firstval = shift @changedir; my $changedir = join('', @changedir); #print $lastval, "\t", $changedir, "\t", $firstval, "\n"; $changeinfo = $lastval.$changedir.$firstval; $changeinfo2 = $changeinfo.$changeinfo2; $changeinfo = ''; $n=0; } else { $changeinfo = $arrinfo[$n].$changeinfo; } } } } } } } #print "\nchangeinfo2: ", $changeinfo2, "\n"; if($changeinfo2 ne '') { $bltopo2[$c] = $hypoinfo.'-'.$changeinfo2.$var1; #print "\nchangeinfo2 ne '': ", $bltopo2[$c], "\n"; } elsif($changeinfo2 eq '') { $hy =~ s/-//g; $bltopo2[$c] = $hypoinfo.$var1; #print "\nchangeinfo2 eq '': ", $bltopo2[$c], "\n"; } } else { $bltopo2[$c] = $bltopo[$c]; #print "\nbltopo2[c]: ", $bltopo2[$c], "\n"; } #print "\n\n\n"; } my @bltopo3 = (); my $lenbltopo2 = scalar @bltopo2; my $strainnum = 1; for(my $c=0; $c<$lenbltopo2; $c++) { #print $bltopo2[$c], "\n"; if($bltopo2[$c] =~ /PRI[0-9]*-/) { my @arr = (); @arr = split('', $bltopo2[$c]); my $lenarr = scalar @arr; my $var = ''; my $hy = ''; my $pri = ''; my $name = ''; for(my $i=0; $i<$lenarr; $i++) { if($arr[$i] =~ /P/ && $arr[$i-1] =~ /,/) { $hy = $var; $var = $arr[$i]; #print "hy: ", $hy, "\t"; } elsif($arr[$i] =~ /-/ && $arr[$i-1] =~ /[0-9]/ && $arr[$i-2] =~ /[I\d]/ && $arr[$i-3] =~ /[RI]/ && $arr[$i-4] =~ /[PR]/) { $var = $var.$arr[$i]; $pri = $var; $var = ''; #print "pri: ", $pri, "\t"; } elsif($arr[$i] =~ /-/ && $arr[$i+1] =~ /[0-9]/ && $arr[$i+2] =~ /[S\d]/ && $arr[$i+3] =~ /[S\/\d]/ && ($i-2) != 0 && ($i-3) != 0) { $name = $var; $var = $arr[$i]; #print "name: ", $name, "\t"; } else { $var = $var.$arr[$i]; } if($i == ($lenarr-1) && $bltopo2[$c] !~ /\/[0-9]*N$/) { $name = $var; $var = ''; } } #print "\nbltopo[c]: ", $bltopo2[$c], "\n"; #print "hy: ", $hy, "\tpri: ", $pri, "\tname: ", $name, "\tvar: ", $var, "\n"; my @arrname = (); my $num = ''; my $nname = ''; if($name =~ /n[1-9][0-9]*$/) { @arrname = split('', $name); my $lenarrname = scalar @arrname; for(my $k=($lenarrname-1); $k>0; $k--) { if($arrname[$k] eq 'n') { $num = $arrname[$k].$num; $k=0; } else { $num = $arrname[$k].$num; } } for(my $k=0; $k<$lenarrname; $k++) { if($arrname[$k] eq "n" && $arrname[$k+1] =~ /[1-9]/) { $k=$lenarrname; } else { $nname = $nname.$arrname[$k]; } } } else { $num = 'n1'; $nname = $name; } if($var =~ /\/0N/ || $var eq '') { $bltopo3[$c] = $hy.$pri.$nname.'-'.$num.$var; $strainnum++; } elsif($var =~ /\/[1-9][0-9]*N/) { my $appcount = 0; my $changeinfo3 = ''; my $changeinfo4 = ''; for(my $j=0; $j<$lenoutinfo; $j++) { if($outinfo[$j] =~ /$name\/.*\svs.\s/ || $outinfo[$j] =~ /$name\svs.\s/) { $appcount++; if($bltopo2[$c+1] =~ /^:[1-9]/ && $appcount == 1) { my $count=0; $changeinfo4 = ''; for(my $m=($j-1); $m>=0; $m--) { if($outinfo[$m] =~ /\svs.\s/) { $m = -1; } elsif($outinfo[$m] =~ /^Amino\sacid\s/) { $count++; if($count>1) { $changeinfo4 = '/'.$changeinfo4; } my @arrinfo = (); @arrinfo = split('', $outinfo[$m]); my $lenarrinfo = scalar @arrinfo; for(my $n=($lenarrinfo-1); $n>0; $n--) { if($arrinfo[$n] =~ /\s/ && $arrinfo[$n-1] =~ /:/) { my @changedir = (); $changeinfo3 =~ s/\s//g; @changedir = split('', $changeinfo3); my $lenchangedir = ''; my $lastval = ''; my $firstval = ''; $lastval = pop @changedir; $firstval = shift @changedir; my $changedir = join('', @changedir); #print $lastval, "\t", $changedir, "\t", $firstval, "\n"; $changeinfo3 = $lastval.$changedir.$firstval; $changeinfo4 = $changeinfo3.$changeinfo4; $changeinfo3 = ''; $n=0; } else { $changeinfo3 = $arrinfo[$n].$changeinfo3; } } } } } elsif($bltopo2[$c+1] =~ /^:0/ && $appcount == 2) { my $count=0; $changeinfo4 = ''; for(my $m=($j-1); $m>=0; $m--) { if($outinfo[$m] =~ /\svs.\s/) { $m = -1; } elsif($outinfo[$m] =~ /^Amino\sacid\s/) { $count++; if($count>1) { $changeinfo4 = '/'.$changeinfo4; } my @arrinfo = (); @arrinfo = split('', $outinfo[$m]); my $lenarrinfo = scalar @arrinfo; for(my $n=($lenarrinfo-1); $n>0; $n--) { if($arrinfo[$n] =~ /\s/ && $arrinfo[$n-1] =~ /:/) { my @changedir = (); $changeinfo3 =~ s/\s//g; @changedir = split('', $changeinfo3); my $lenchangedir = ''; my $lastval = ''; my $firstval = ''; $lastval = pop @changedir; $firstval = shift @changedir; my $changedir = join('', @changedir); #print $lastval, "\t", $changedir, "\t", $firstval, "\n"; $changeinfo3 = $lastval.$changedir.$firstval; $changeinfo4 = $changeinfo3.$changeinfo4; $changeinfo3 = ''; $n=0; } else { $changeinfo3 = $arrinfo[$n].$changeinfo3; } } } } } } } #print $changeinfo4, "\n"; $bltopo3[$c] = $hy.$pri.$nname.'-'.$num.$var.'-'.$changeinfo4; #print $bltopo3[$c], "\n"; $strainnum++; } } elsif($bltopo2[$c] =~ /EXT-/) { my @arr = (); @arr = split('', $bltopo2[$c]); my $lenarr = scalar @arr; my $var = ''; my $hy = ''; my $ext = ''; my $name = ''; for(my $i=0; $i<$lenarr; $i++) { if($arr[$i] =~ /E/ && $arr[$i-1] =~ /,/) { $hy = $var; $var = $arr[$i]; #print "hy: ", $hy, "\t"; } elsif($arr[$i] =~ /-/ && $arr[$i-1] =~ /T/ && $arr[$i-2] =~ /X/ && $arr[$i-3] =~ /E/) { $var = $var.$arr[$i]; $ext = $var; $var = ''; #print "ext: ", $ext, "\t"; } elsif($arr[$i] =~ /-/ && $arr[$i+1] =~ /[0-9]/ && $arr[$i+2] =~ /[S\d]/ && $arr[$i+3] =~ /[S\/\d]/ && ($i-2) != 0 && ($i-3) != 0) { $name = $var; $var = $arr[$i]; #print "name: ", $name, "\t"; } else { $var = $var.$arr[$i]; } if($i == ($lenarr-1) && $bltopo2[$c] !~ /\/[0-9]*N$/) { $name = $var; $var = ''; } } #print "hy: ", $hy, "\text: ", $ext, "\tname: ", $name, "\tvar: ", $var, "\n"; my @arrname = (); my $num = ''; my $nname = ''; if($name =~ /n[1-9][0-9]*$/) { @arrname = split('', $name); my $lenarrname = scalar @arrname; for(my $k=($lenarrname-1); $k>0; $k--) { if($arrname[$k] eq 'n') { $num = $arrname[$k].$num; $k=0; } else { $num = $arrname[$k].$num; } } for(my $k=0; $k<$lenarrname; $k++) { if($arrname[$k] eq "n" && $arrname[$k+1] =~ /[1-9]/) { $k=$lenarrname; } else { $nname = $nname.$arrname[$k]; } } } else { $num = 'n1'; $nname = $name; } if($var =~ /\/0N/ || $var eq '') { $bltopo3[$c] = $hy.$ext.$nname.'-'.$num.$var; $strainnum++; } elsif($var =~ /\/[1-9][0-9]*N/) { my $appcount = 0; my $changeinfo5 = ''; my $changeinfo6 = ''; for(my $j=0; $j<$lenoutinfo; $j++) { if($outinfo[$j] =~ /$name\/.*\svs.\s/ || $outinfo[$j] =~ /$name\svs.\s/) { $appcount++; if($bltopo2[$c+1] =~ /^:[1-9]/ && $appcount == 1) { my $count=0; $changeinfo6 = ''; for(my $m=($j-1); $m>=0; $m--) { if($outinfo[$m] =~ /\svs.\s/) { $m = -1; } elsif($outinfo[$m] =~ /^Amino\sacid\s/) { $count++; if($count>1) { $changeinfo6 = '/'.$changeinfo6; } my @arrinfo = (); @arrinfo = split('', $outinfo[$m]); my $lenarrinfo = scalar @arrinfo; for(my $n=($lenarrinfo-1); $n>0; $n--) { if($arrinfo[$n] =~ /\s/ && $arrinfo[$n-1] =~ /:/) { my @changedir = (); $changeinfo5 =~ s/\s//g; @changedir = split('', $changeinfo5); my $lenchangedir = ''; my $lastval = ''; my $firstval = ''; $lastval = pop @changedir; $firstval = shift @changedir; my $changedir = join('', @changedir); #print $lastval, "\t", $changedir, "\t", $firstval, "\n"; $changeinfo5 = $lastval.$changedir.$firstval; $changeinfo6 = $changeinfo5.$changeinfo6; $changeinfo5 = ''; $n=0; } else { $changeinfo5 = $arrinfo[$n].$changeinfo5; } } } } } elsif($bltopo2[$c+1] =~ /^:0/ && $appcount == 2) { my $count=0; $changeinfo6 = ''; for(my $m=($j-1); $m>=0; $m--) { if($outinfo[$m] =~ /\svs.\s/) { $m = -1; } elsif($outinfo[$m] =~ /^Amino\sacid\s/) { $count++; if($count>1) { $changeinfo6 = '/'.$changeinfo6; } my @arrinfo = (); @arrinfo = split('', $outinfo[$m]); my $lenarrinfo = scalar @arrinfo; for(my $n=($lenarrinfo-1); $n>0; $n--) { if($arrinfo[$n] =~ /\s/ && $arrinfo[$n-1] =~ /:/) { my @changedir = (); $changeinfo5 =~ s/\s//g; @changedir = split('', $changeinfo5); my $lenchangedir = ''; my $lastval = ''; my $firstval = ''; $lastval = pop @changedir; $firstval = shift @changedir; my $changedir = join('', @changedir); #print $lastval, "\t", $changedir, "\t", $firstval, "\n"; $changeinfo5 = $lastval.$changedir.$firstval; $changeinfo6 = $changeinfo5.$changeinfo6; $changeinfo5 = ''; $n=0; } else { $changeinfo5 = $arrinfo[$n].$changeinfo5; } } } } } } } #print $changeinfo6, "\n"; $bltopo3[$c] = $hy.$ext.$nname.'-'.$num.$var.'-'.$changeinfo6; #print $bltopo3[$c], "\n"; $strainnum++; } } else { $bltopo3[$c] = $bltopo2[$c]; } } #print FOUT8 @bltopo3; #close(FOUT8); ##### END OF CREATING ANOTHER TREE FILE REPLACING SEQ NAME BY AMINO-ACID MUTATION INFO ##### ##### REPLACING "PRI" BY "P", "EXT" BY "E", AND INCORPORATING NUMBERS TO THE EXTERNAL NODES LIKE "E1", "E2" ETC. my $lenbltopo3 = scalar @bltopo3; my $extnumber=1; for(my $c=0; $c<$lenbltopo3; $c++) { if($bltopo3[$c] =~ /PRI[0-9]*-/) { $bltopo3[$c] =~ s/PRI/P/; } if($bltopo3[$c] =~ /EXT-/) { #print "\n", $extnumber, "\n"; $bltopo3[$c] =~ s/EXT/E$extnumber/; $extnumber++; #print "\n", $bltopo3[$c], "\n\n\n"; } } print FOUT8 @bltopo3; close(FOUT8); ##### END OF REPLACING "PRI" BY "P", "EXT" BY "E", AND INCORPORATING NUMBERS TO THE EXTERNAL NODES LIKE "E1", "E2" ETC. ##### CREATING COLOR-CODE FILE TO BE IMPORTED BY HYPERTREE SOFTWARE WHEN VISUALIZING THE TREE ##### for(my $c=0; $c<$lenbltopo3; $c++) { #print "\n", $bltopo3[$c], "\n"; if($bltopo3[$c] =~ /P[0-9]*-/) { if($bltopo3[$c] =~ /H[0-9]*-.+,P[0-9]*-/) { my @array1 = (); my $v1 = ''; my $v2 = ''; @array1 = split('', $bltopo3[$c]); my $lenarray1 = scalar @array1; do { $v1 = shift @array1; $v2 = $v2.$v1; }until($v1 eq ','); $v2 =~ s/:0,//; if($v2 =~ /\/0N/) { print FOUT6 $v2, " blue\n"; } else { print FOUT6 $v2, " blue\n"; } print FOUT6 @array1, " blue\n"; } elsif($bltopo3[$c] =~ /^P[0-9]*-/) { print FOUT6 $bltopo3[$c], " blue\n"; } } if($bltopo3[$c] =~ /E[0-9]*-/) { if($bltopo3[$c] =~ /H[0-9]*-.+,E[0-9]*-/) { my @array2 = (); my $v1 = ''; my $v2 = ''; @array2 = split('', $bltopo3[$c]); my $lenarray2 = scalar @array2; do { $v1 = shift @array2; $v2 = $v2.$v1; }until($v1 eq ','); $v2 =~ s/:0,//; if($v2 =~ /\/0N/) { print FOUT6 $v2, " blue\n"; } else { my $x=0; for(my $next=($c+1); $next<$lenbltopo3; $next++) { if($bltopo3[$next] =~ /\)/) { $next = $lenbltopo3; } if($bltopo3[$next] =~ /P[0-9]*-/) { $x++; } } if($x==0) { print FOUT6 $v2, " red\n"; } else { print FOUT6 $v2, " blue\n"; } } print FOUT6 @array2, " red\n"; } elsif($bltopo3[$c] =~ /^E[0-9]*-/) { print FOUT6 $bltopo3[$c], " red\n"; } } if($bltopo3[$c] =~ /^H[0-9]*-[0-9]*S\/0N-*\w*:0,\($/) { my $v3 = $bltopo3[$c]; $v3 =~ s/:0,\(//; print FOUT6 $v3, " blue\n"; } if($bltopo3[$c] =~ /^H[0-9]*-[0-9]*S\/[1-9][0-9]*N-*\w*:0,\($/) { my $bopen=0; my $bclose=0; for(my $next=($c+1); $next<$lenbltopo3; $next++) { if($bltopo3[$next] =~ /^H[0-9]*-[0-9]*S\/0N-*\w*:0,/) { my $v4 = $bltopo3[$c]; $v4 =~ s/:0,\(//; print FOUT6 $v4, " blue\n"; $next = $lenbltopo3; } if($bltopo3[$next] =~ /P[0-9]*-/) { my $v6 = $bltopo3[$c]; $v6 =~ s/:0,\(//; print FOUT6 $v6, " blue\n"; $next = $lenbltopo3; } if($bltopo3[$next] =~ /\(/) { $bopen++; } if($bltopo3[$next] =~ /\)/) { $bclose++; my $bgap = $bclose-$bopen; if($bgap>1) #if($bclose>$bopen) { my $v5 = $bltopo3[$c]; $v5 =~ s/:0,\(//; print FOUT6 $v5, " red\n"; $next = $lenbltopo3; } } } } } close(FOUT6); ##### END OF CREATING COLOR-CODE FILE TO BE IMPORTED BY HYPERTREE SOFTWARE WHEN VISUALIZING THE TREE ##### ##### EXTRACTING ZONE-WISE STRUCTURAL HOT-SPOT INFO FROM "FOUT6" & STORING THEM IN "FOUT4" ###### my $colorfile = 'color-zp_tree.txt'; my @colorfile = (); @colorfile = get_file_data($colorfile); #print $colorfile[0], "\n"; #print $colorfile[3], "\n"; my $lencolorfile = scalar @colorfile; my $pcounter = 0; my $ecounter = 0; my @pnewcolorarray = (); my $plennewcolorarray = 0; my @enewcolorarray = (); my $elennewcolorarray = 0; for(my $i=0; $i<$lencolorfile; $i++) { my $positioninfo = ''; my $colorinfo = ''; chomp $colorfile[$i]; if($colorfile[$i] =~ /N-\w{3,}/) { my @colorarray = (); @colorarray = split('', $colorfile[$i]); my $lencolorarray = scalar @colorarray; $positioninfo = ''; $colorinfo = ''; for(my $j=0; $j<$lencolorarray; $j++) { if($colorarray[$j] =~ /\d/ && $colorarray[$j-1] =~ /\w/ && $colorarray[$j-2] =~ /-/ && $colorarray[$j-3] =~ /N/) { for(my $k=$j; $k<$lencolorarray; $k++) { if($colorarray[$k+1] =~/\s/) { $k=$lencolorarray; } else { $positioninfo = $positioninfo.$colorarray[$k]; } } } } #print "\npositioninfo: ", $positioninfo, "\n"; for(my $j=($lencolorarray-1); $j>0; $j--) { if($colorarray[$j] =~/\s/) { $j=0; } else { $colorinfo = $colorarray[$j].$colorinfo; } } if($colorinfo eq 'blue') { my $newpositioninfo = ''; if($positioninfo =~ /\//) { my @positionarray = (); @positionarray = split('', $positioninfo); my $lenpositionarray = scalar @positionarray; for(my $a=0; $a<$lenpositionarray; $a++) { if($positionarray[$a] =~ /[0-9]/ && $positionarray[$a+1] =~ /[A-Z]/ || $a == ($lenpositionarray-1)) { $newpositioninfo = $newpositioninfo.$positionarray[$a]; #print "\nnext non-digit: ", $newpositioninfo, "\n"; $pnewcolorarray[$pcounter] = $newpositioninfo.' '.$colorinfo; #print "\nmore: ", $pnewcolorarray[$pcounter], "\n"; $pcounter++; $newpositioninfo = ''; } elsif($positionarray[$a] =~ /[0-9]/ && $positionarray[$a+1] =~ /[0-9]/) { $newpositioninfo = $newpositioninfo.$positionarray[$a]; #print "\nnext again digit: ", $newpositioninfo, "\n"; } } } elsif($positioninfo !~ /\//) { $pnewcolorarray[$pcounter] = $positioninfo.' '.$colorinfo; #print "\nsingle: ", $pnewcolorarray[$pcounter], "\n"; $pcounter++; } } if($colorinfo eq 'red') { my $newpositioninfo = ''; if($positioninfo =~ /\//) { my @positionarray = (); @positionarray = split('', $positioninfo); my $lenpositionarray = scalar @positionarray; for(my $a=0; $a<$lenpositionarray; $a++) { if($positionarray[$a] =~ /[0-9]/ && $positionarray[$a+1] =~ /[A-Z]/ || $a == ($lenpositionarray-1)) { $newpositioninfo = $newpositioninfo.$positionarray[$a]; #print "\nnext non-digit: ", $newpositioninfo, "\n"; $enewcolorarray[$ecounter] = $newpositioninfo.' '.$colorinfo; #print "\nmore: ", $enewcolorarray[$ecounter], "\n"; $ecounter++; $newpositioninfo = ''; } elsif($positionarray[$a] =~ /[0-9]/ && $positionarray[$a+1] =~ /[0-9]/) { $newpositioninfo = $newpositioninfo.$positionarray[$a]; #print "\nnext again digit: ", $newpositioninfo, "\n"; } } } elsif($positioninfo !~ /\//) { $enewcolorarray[$ecounter] = $positioninfo.' '.$colorinfo; #print "\nsingle: ", $enewcolorarray[$ecounter], "\n"; $ecounter++; } } } } $plennewcolorarray = scalar @pnewcolorarray; for(my $b=0; $b<$plennewcolorarray; $b++) { #print "\npri: ", $pnewcolorarray[$b], "\n"; } $elennewcolorarray = scalar @enewcolorarray; for(my $b=0; $b<$elennewcolorarray; $b++) { #print "\next: ", $enewcolorarray[$b], "\n"; } my $pmutationcounter = $plennewcolorarray; my $emutationcounter = $elennewcolorarray; my $photspotcounter = 0; my $ehotspotcounter = 0; my @pcheckarray = (); my @echeckarray = (); my $lenpcheckarray = 0; my $lenecheckarray = 0; my $pcheckcount = 0; my $echeckcount = 0; my $phscount = 0; my $ehscount = 0; my $photspotpos = 0; my $ehotspotpos = 0; for(my $c=0; $c<($plennewcolorarray-1); $c++) { my $ppos1 = $pnewcolorarray[$c]; my $positive = 0; for(my $k=0; $k<$lenpcheckarray; $k++) { if($pcheckarray[$k] eq $ppos1) { $positive++; } } if($positive == 0) { for(my $d=($c+1); $d<$plennewcolorarray; $d++) { my $ppos2 = $pnewcolorarray[$d]; if($ppos2 eq $ppos1) { $phscount++; $pcheckarray[$pcheckcount] = $ppos2; $pcheckcount++; } } if($photspotcounter<$phscount) { $phscount = $phscount + 1; $photspotcounter = $phscount; $photspotpos++; } else { $photspotcounter = $phscount; } $lenpcheckarray = scalar @pcheckarray; } } for(my $c=0; $c<($elennewcolorarray-1); $c++) { my $epos1 = $enewcolorarray[$c]; my $positive = 0; for(my $k=0; $k<$lenecheckarray; $k++) { if($echeckarray[$k] eq $epos1) { $positive++; } } if($positive == 0) { for(my $d=($c+1); $d<$elennewcolorarray; $d++) { my $epos2 = $enewcolorarray[$d]; if($epos2 eq $epos1) { #print "\npos: ", $epos1, "\t", $epos2, "\n"; $ehscount++; $echeckarray[$echeckcount] = $epos2; $echeckcount++; } } if($ehotspotcounter<$ehscount) { $ehscount = $ehscount + 1; $ehotspotcounter = $ehscount; $ehotspotpos++; } else { $ehotspotcounter = $ehscount; } $lenecheckarray = scalar @echeckarray; } } my $phsfreq = $photspotcounter/$pmutationcounter; my $ehsfreq = $ehotspotcounter/$emutationcounter; ### IMPORTANT: STORING THE EXTRACTED INFO IS DONE BELOW AFTER THE OUTPUT OF OVERALL HOT-SPOT FREQS ### ##### END OF EXTRACTING ZONE-WISE STRUCTURAL HOT-SPOT INFO FROM "FOUT6" & STORING THEM IN "FOUT4" ###### ##### CALCULATING S & N FOR SIMPSON & ALPHA STATISTICS ##### my @priinfo = @pzone; my @extinfo = @ezone; my $pris=0; my $exts=0; my $prin=0; my $extn=0; my @prihaplo=(); my @exthaplo=(); foreach my $line1 (@priinfo) { my $freq1; if($line1 =~ /n[0-9]*$/) { chomp $line1; my @line1 = (); @line1 = split('', $line1); my $lenline1 = scalar @line1; for(my $l=($lenline1-1); $l>0; $l--) { if($l==($lenline1-1)) { $freq1 = $line1[$l]; } else { if($line1[$l] == 'n') { $l = 0; } else { $freq1 = $line1[$l].$freq1; #print "\n", $freq1, "\n"; } } } } else { $freq1 = 1; } # print "\n", $freq1, "\n"; $prihaplo[$pris] = $freq1; $pris++; $prin = $prin + $freq1; } foreach my $line2 (@extinfo) { my $freq2=0; if($line2 =~ /n[0-9]*$/) { chomp $line2; my @line2 = (); @line2 = split('', $line2); my $lenline2 = scalar @line2; for(my $l=($lenline2-1); $l>0; $l--) { if($l==($lenline2-1)) { $freq2 = $line2[$l]; } else { if($line2[$l] == 'n') { $l = 0; } else { $freq2 = $line2[$l].$freq2; } } } } else { $freq2 = 1; } # print "\n", $freq2, "\n"; $exthaplo[$exts] = $freq2; $exts++; $extn = $extn + $freq2; } #print "\n", $pris, "\t", $prin, "\n"; #print "\n", $exts, "\t", $extn, "\n"; ###### PRINTING S, N, AND HAPLO-RATIO IN A SEPARATE FILE ####### print FOUT4 "\n\n\n\nZONAL FREQUENCIES OF NUMBERS OF HAPLOTYPES & STRAINS\n"; print FOUT4 "----------------------------------------------------\n"; print FOUT4 "\t\t\tPrimary\t\tExternal\n"; print FOUT4 "\t\t\t-------\t\t--------\n\n"; print FOUT4 "No. of haplotypes\t ", $pris, "\t\t ", $exts, "\n\n"; print FOUT4 "No. of strains\t\t ", $prin, "\t\t ", $extn, "\n\n\n"; my $haploratio = $exts/($pris+$exts); print FOUT4 "Haplotype Ratio : ", $haploratio, "\n"; ###### END OF PRINTING S, N, AND HAPLO-RATIO IN A SEPARATE FILE ####### ###### CALCULATING HOT-SPOT FREQUENCY ####### print FOUT4 "\n\n\n\nSTRUCTURAL HOT-SPOT DETAILS\n"; print FOUT4 "---------------------------\n\n"; my $aminochange=0; my @exclpos = (); my $posnum=0; my $tothotspot=0; for(my $i=0; $i<$lenoutinfo; $i++) { my $freq; if($outinfo[$i] =~ /Amino\sacid\schange/) { #$aminochange++; my @aminoinfo = (); @aminoinfo = split('', $outinfo[$i]); my $lenaminoinfo = scalar @aminoinfo; my $position=''; my $oriamino=''; my $changeamino=''; for(my $j=0; $j<$lenaminoinfo; $j++) { if($aminoinfo[$j] =~ /[0-9]/) { $position = $position.$aminoinfo[$j]; if($aminoinfo[$j-1] =~ /\D/) { $changeamino = $aminoinfo[$j-1]; } if($aminoinfo[$j+1] =~ /\D/) { $oriamino = $aminoinfo[$j+1]; } } } #print FOUT4 $position, ": ", $oriamino, " -> ", $changeamino, "; "; my $lenexclpos = scalar @exclpos; my $num=0; for(my $k=0; $k<$lenexclpos; $k++) { if($exclpos[$k] eq $position) { $num++; } } #print $lenexclpos, "\t", $position, "\t", $num, "\n\n\n"; if($num==0) { print FOUT4 $position, ": ", $oriamino, " -> ", $changeamino, "; "; #print $position, "\n"; $exclpos[$posnum] = $position; $posnum++; $freq=0; for(my $l=($i+1); $l<$lenoutinfo; $l++) { if($outinfo[$l] =~ /Amino\sacid\schange/) { my @aminoinfo2 = (); @aminoinfo2 = split('', $outinfo[$l]); my $lenaminoinfo2 = scalar @aminoinfo2; my $position2=''; my $oriamino2=''; my $changeamino2=''; for(my $m=0; $m<$lenaminoinfo2; $m++) { if($aminoinfo2[$m] =~ /[0-9]/) { $position2 = $position2.$aminoinfo2[$m]; if($aminoinfo2[$m-1] =~ /\D/) { $changeamino2 = $aminoinfo2[$m-1]; } if($aminoinfo2[$m+1] =~ /\D/) { $oriamino2 = $aminoinfo2[$m+1]; } } } if($position2 eq $position) { print FOUT4 $oriamino2, " -> ", $changeamino2, "; "; $freq++; } } } if($freq==0) { $aminochange++; } if($freq>0) { $freq = $freq + 1; $aminochange = $aminochange + $freq; } print FOUT4 "No. of hot-spot mutations: ", $freq, "\n"; } } $tothotspot = $tothotspot + $freq; } my $hotspotfreq = $tothotspot/$aminochange; print FOUT4 "\n\nFrequency of hot-spots : ", $hotspotfreq, " (", $tothotspot, "/", $aminochange, ")\n"; ###### END OF CALCULATING HOT-SPOT FREQUENCY ####### ###### STORING THE ZONAL FREQS OF HOT-SPOTS IN "FOUT4" ############ print FOUT4 "\n\n\n\nZONAL FREQUENCIES OF STRUCTURAL HOT-SPOTS\n"; print FOUT4 "-----------------------------------------\n"; print FOUT4 "\n\t\t\t\t\t\t\tPRIMARY\t\tEXTERNAL\n"; print FOUT4 "\n# of hot-spot positions: \t\t\t\t ", $photspotpos, "\t\t ", $ehotspotpos, "\n"; print FOUT4 "\nhot-spot frequency (# of hot-spots/total # of changes): ", $phsfreq, " (", $photspotcounter, "/", $pmutationcounter, ")\t ", $ehsfreq, " (", $ehotspotcounter, "/", $emutationcounter, ")\n"; ###### END OF STORING THE ZONAL FREQS OF HOT-SPOTS IN "FOUT4" ############ ################################################################### ###### CALCULATING SIMPSON INDEX FOR PRIMARY ZONE ######### print FOUT4 "\n\n\n\nSIMPSON-INDEX RESULTS\n"; print FOUT4 "---------------------\n"; print FOUT4 "\nPrimary zone\n"; print FOUT4 "------------\n"; my @haplo = (); @haplo = @prihaplo; my $num = scalar @haplo; #my $tot=0; my $sum=0; for (my $i=0; $i<$num; $i++) { # print $haplo[$i], "\n"; # $tot = $tot + ($haplo[$i] * ($haplo[$i]-1)); $sum = $sum + $haplo[$i]; } #my $l = $tot / ($sum * ($sum-1)); ################## # UNBIASED Ds (SAMPLING WITHOUT REPLACEMENT) #my $Ds_unbiased = 1 / $l; ################## my $pisq=0; for (my $j=0; $j<$num; $j++) { $pisq = $pisq + (($haplo[$j]/$sum) * ($haplo[$j]/$sum)); } ################# # BIASED Ds FOR VERY LARGE N (WHERE SAMPLING WITHOUT REPLACEMENT EFFECTIVELY EQUIVALENT TO SAMPLING WITH REPLACEMENT) my $Ds_biased = 1 / $pisq; ################# my $picube=0; for (my $k=0; $k<$num; $k++) { $picube = $picube + (($haplo[$k]/$sum) * ($haplo[$k]/$sum) * ($haplo[$k]/$sum)); } ################ #UNBIASED ESTIMATE OF STANDARD ERROR #my $var1 = ((4*$sum*($sum-1)*($sum-2)*$picube) + (2*$sum*($sum-1)*$pisq) - 2*$sum*($sum-1)*((2*$sum)-3)*($pisq*$pisq)) / ($sum*($sum-1)*$sum*($sum-1)); #my $se1 = sqrt $var1; ################ ################ #BIASED ESTIMATE OF STANDARD ERROR (FOR VERY LARGE N) my $var2 = 4 * ($picube - ($pisq*$pisq)) / $sum; my $se2; if($var2>0) { $se2 = sqrt $var2; } else { $se2 = 0; } print FOUT4 "\nORIGINAL SIMPSON'S INDEX (+/- VARIANCE): $pisq (+/-) $var2 \n"; print FOUT4 "\nORIGINAL SIMPSON'S INDEX (+/- SE): $pisq (+/-) $se2 \n"; ################ ################ #UNBIASED CALCULATION OF Ds ###95% CONFIDENCE LIMIT### #$min_l = $l - (1.96 * $se1); #$max_l = $l + (1.96 * $se1); ###90% CONFIDENCE LIMIT### #$min_l = $l - (1.645 * $se1); #$max_l = $l + (1.645 * $se1); ###ONLY STANDARD ERROR, NO CONFIDENCE LIMIT### #my $min_l = $l - $se1; #my $max_l = $l + $se1; #my $lb_unbiased = 1 / $max_l; #my $ub_unbiased = 1 / $min_l; #my $se1_down = $Ds_unbiased - $lb_unbiased; #my $se1_up = $ub_unbiased - $Ds_unbiased; #print FOUT4 "\nUnbiased value for Ds with standard error with 90% CL: \n$Ds_unbiased (-$se1_down) (+$se1_up) \n"; ################ ################ #BIASED CALCULATION OF Ds ###95% CONFIDENCE LIMIT### #$min_pisq = $pisq - (1.96 * $se2); #$max_pisq = $pisq + (1.96 * $se2); ###90% CONFIDENCE LIMIT### #$min_pisq = $pisq - (1.645 * $se2); #$max_pisq = $pisq + (1.645 * $se2); ###ONLY STANDARD ERROR, NO CONFIDENCE LIMIT### my $min_pisq = $pisq - $se2; my $max_pisq = $pisq + $se2; my $lb_biased = 1 / $max_pisq; my $ub_biased = 1 / $min_pisq; my $se2_down = $Ds_biased - $lb_biased; my $se2_up = $ub_biased - $Ds_biased; #print FOUT4 "\nBiased value for Ds with standard error: $Ds_biased (-$se2_down) (+$se2_up) \n"; print FOUT4 "\nDs with standard error: $Ds_biased (-$se2_down) (+$se2_up) \n"; ###### END OF CALCULATION OF SIMPSON INDEX FOR PRIMARY ZONE ###### ###### CALCULATING SIMPSON INDEX FOR EXTERNAL ZONE ######### print FOUT4 "\n\nExternal zone\n"; print FOUT4 "-------------\n"; my @haplo = (); @haplo = @exthaplo; my $num = scalar @haplo; #my $tot=0; my $sum=0; for (my $i=0; $i<$num; $i++) { # print $haplo[$i], "\n"; # $tot = $tot + ($haplo[$i] * ($haplo[$i]-1)); $sum = $sum + $haplo[$i]; } #my $l = $tot / ($sum * ($sum-1)); ################## # UNBIASED Ds (SAMPLING WITHOUT REPLACEMENT) #my $Ds_unbiased = 1 / $l; ################## my $pisq=0; for (my $j=0; $j<$num; $j++) { $pisq = $pisq + (($haplo[$j]/$sum) * ($haplo[$j]/$sum)); } ################# # BIASED Ds FOR VERY LARGE N (WHERE SAMPLING WITHOUT REPLACEMENT EFFECTIVELY EQUIVALENT TO SAMPLING WITH REPLACEMENT) my $Ds_biased = 1 / $pisq; ################# my $picube=0; for (my $k=0; $k<$num; $k++) { $picube = $picube + (($haplo[$k]/$sum) * ($haplo[$k]/$sum) * ($haplo[$k]/$sum)); } ################ #UNBIASED ESTIMATE OF STANDARD ERROR #my $var1 = ((4*$sum*($sum-1)*($sum-2)*$picube) + (2*$sum*($sum-1)*$pisq) - 2*$sum*($sum-1)*((2*$sum)-3)*($pisq*$pisq)) / ($sum*($sum-1)*$sum*($sum-1)); #my $se1 = sqrt $var1; ################ ################ #BIASED ESTIMATE OF STANDARD ERROR (FOR VERY LARGE N) my $var2 = 4 * ($picube - ($pisq*$pisq)) / $sum; my $se2; if($var2>0) { $se2 = sqrt $var2; } else { $se2 = 0; } print FOUT4 "\nORIGINAL SIMPSON'S INDEX (+/- VARIANCE): $pisq (+/-) $var2 \n"; print FOUT4 "\nORIGINAL SIMPSON'S INDEX (+/- SE): $pisq (+/-) $se2 \n"; ################ ################ #UNBIASED CALCULATION OF Ds ###95% CONFIDENCE LIMIT### #$min_l = $l - (1.96 * $se1); #$max_l = $l + (1.96 * $se1); ###90% CONFIDENCE LIMIT### #$min_l = $l - (1.645 * $se1); #$max_l = $l + (1.645 * $se1); ###ONLY STANDARD ERROR, NO CONFIDENCE LIMIT### #my $min_l = $l - $se1; #my $max_l = $l + $se1; #my $lb_unbiased = 1 / $max_l; #my $ub_unbiased = 1 / $min_l; #my $se1_down = $Ds_unbiased - $lb_unbiased; #my $se1_up = $ub_unbiased - $Ds_unbiased; #print FOUT4 "\nUnbiased value for Ds with standard error with 90% CL: \n$Ds_unbiased (-$se1_down) (+$se1_up) \n"; ################ ################ #BIASED CALCULATION OF Ds ###95% CONFIDENCE LIMIT### #$min_pisq = $pisq - (1.96 * $se2); #$max_pisq = $pisq + (1.96 * $se2); ###90% CONFIDENCE LIMIT### #$min_pisq = $pisq - (1.645 * $se2); #$max_pisq = $pisq + (1.645 * $se2); ###ONLY STANDARD ERROR, NO CONFIDENCE LIMIT### my $min_pisq = $pisq - $se2; my $max_pisq = $pisq + $se2; my $lb_biased = 1 / $max_pisq; my $ub_biased = 1 / $min_pisq; my $se2_down = $Ds_biased - $lb_biased; my $se2_up = $ub_biased - $Ds_biased; #print FOUT4 "\nBiased value for Ds with standard error: $Ds_biased (-$se2_down) (+$se2_up) \n"; print FOUT4 "\nDs with standard error: $Ds_biased (-$se2_down) (+$se2_up) \n"; ###### END OF CALCULATION OF SIMPSON INDEX FOR EXTERNAL ZONE ###### ################################################################### ###### CALCULATING ALPHA STATISTIC FOR PRIMARY ZONE ####### print FOUT4 "\n\n\n\nALPHA-INDEX RESULTS\n"; print FOUT4 "-------------------\n"; print FOUT4 "\nPrimary zone\n"; print FOUT4 "------------\n"; my $s = $pris; my $n = $prin; my $diff1 = 0; my $diff2 = 1; my $X2 = 0; my $val; my $val1; my $min; for(my $X=0.0001; $X<1; ) { $val = $s/$n + ((1-$X)*log(1-$X)/$X); $val1 = abs $val; $diff1 = $val1 - 0; $min = $diff1 - $diff2; if($diff1<$diff2) {$diff2 = $diff1; $X2 = $X;} $diff1 = 0; $X = $X + 0.0001; } #print FOUT4 "X = ", $X2, "\n"; my $alpha = (1-$X2)*$n/$X2; my $var_alpha = $alpha*$alpha*$alpha * (((($n+$alpha)*($n+$alpha))*log((2*$n+$alpha)/($n+$alpha)))-($alpha*$n))/((($s*$n)+($s*$alpha)-($n*$alpha))*(($s*$n)+($s*$alpha)-($n*$alpha))); my $se_alpha = sqrt $var_alpha; print FOUT4 "\nalpha (+/- SE) = ", $alpha, " (+/- ", $se_alpha, ")\n"; print FOUT4 "\nalpha (+/- VARIANCE) = ", $alpha, " (+/- ", $var_alpha, ")\n"; ################################################################ #THE EVENNESS MEASURE BY CALCULATING GAMMA, SINCE S*GAMMA=ALPHA ################################################################ #$gamma=0; #$gamma = -1 / (log(1-$X2)); #print "\nThe evenness value, gamma = ", $gamma, "\n"; ###### END OF CALCULATION OF ALPHA STATISTIC FOR PRIMARY ZONE ###### ###### CALCULATING ALPHA STATISTIC FOR EXTERNAL ZONE ####### print FOUT4 "\n\nExternal zone\n"; print FOUT4 "-------------\n"; my $s = $exts; my $n = $extn; my $diff1 = 0; my $diff2 = 1; my $X2 = 0; my $val; my $val1; my $min; for(my $X=0.0001; $X<1; ) { $val = $s/$n + ((1-$X)*log(1-$X)/$X); $val1 = abs $val; $diff1 = $val1 - 0; $min = $diff1 - $diff2; if($diff1<$diff2) {$diff2 = $diff1; $X2 = $X;} $diff1 = 0; $X = $X + 0.0001; } #print FOUT4 "X = ", $X2, "\n"; my $alpha = (1-$X2)*$n/$X2; my $var_alpha = $alpha*$alpha*$alpha * (((($n+$alpha)*($n+$alpha))*log((2*$n+$alpha)/($n+$alpha)))-($alpha*$n))/((($s*$n)+($s*$alpha)-($n*$alpha))*(($s*$n)+($s*$alpha)-($n*$alpha))); my $se_alpha = sqrt $var_alpha; print FOUT4 "\nalpha (+/- SE) = ", $alpha, " (+/- ", $se_alpha, ")\n"; print FOUT4 "\nalpha (+/- VARIANCE) = ", $alpha, " (+/- ", $var_alpha, ")\n"; ################################################################ #THE EVENNESS MEASURE BY CALCULATING GAMMA, SINCE S*GAMMA=ALPHA ################################################################ #$gamma=0; #$gamma = -1 / (log(1-$X2)); #print "\nThe evenness value, gamma = ", $gamma, "\n"; ###### END OF CALCULATION OF ALPHA STATISTIC FOR EXTERNAL ZONE ###### ############################### close(FOUT4); exit; ################ # A Subroutine to Read the files # get_file_data # # A subroutine to get data from a file given its filename sub get_file_data { my($filename) = @_; use strict; use warnings; # Initialize variables my @filedata = ( ); unless( open(GET_FILE_DATA, $filename) ) { print STDERR "Cannot open file \"$filename\"\n\n"; exit; } @filedata = ; close GET_FILE_DATA; return @filedata; } ############################################### # A Subroutine to convert codons to amino acids # codon2aa # # A subroutine to translate a DNA 3-character codon to an amino acid sub codon2aa { my($codon) = @_; if ( $codon =~ /GC./i) { return 'A' } # Alanine elsif ( $codon =~ /TG[TC]/i) { return 'C' } # Cysteine elsif ( $codon =~ /GA[TC]/i) { return 'D' } # Aspartic Acid elsif ( $codon =~ /GA[AG]/i) { return 'E' } # Glutamic Acid elsif ( $codon =~ /TT[TC]/i) { return 'F' } # Phenylalanine elsif ( $codon =~ /GG./i) { return 'G' } # Glycine elsif ( $codon =~ /CA[TC]/i) { return 'H' } # Histidine elsif ( $codon =~ /AT[TCA]/i) { return 'I' } # Isoleucine elsif ( $codon =~ /AA[AG]/i) { return 'K' } # Lysine elsif ( $codon =~ /TT[AG]|CT./i) { return 'L' } # Leucine elsif ( $codon =~ /ATG/i) { return 'M' } # Methionine elsif ( $codon =~ /AA[TC]/i) { return 'N' } # Asparagine elsif ( $codon =~ /CC./i) { return 'P' } # Proline elsif ( $codon =~ /CA[AG]/i) { return 'Q' } # Glutamine elsif ( $codon =~ /CG.|AG[AG]/i) { return 'R' } # Arginine elsif ( $codon =~ /TC.|AG[TC]/i) { return 'S' } # Serine elsif ( $codon =~ /AC./i) { return 'T' } # Threonine elsif ( $codon =~ /GT./i) { return 'V' } # Valine elsif ( $codon =~ /TGG/i) { return 'W' } # Tryptophan elsif ( $codon =~ /TA[TC]/i) { return 'Y' } # Tyrosine elsif ( $codon =~ /TA[AG]|TGA/i) { return '_' } # Stop else { print STDERR "Bad codon \"$codon\"!!\n"; exit; } }