Please cite: Passalacqua, K. D., A. Varadarajan, B. D. Ondov, D. T. Okou, M. E. Zwick, and N. H. Bergman. 2009. Structure and complexity of a bacterial transcriptome. Journal of Bacteriology 191:3203-3211 and Kristoffersen et al. (current paper) if using the script below.
In order to find TSS, TES, and operons structures, run the shell script: 'find_TSS_TES_operon.tcsh'
It will produce two output files: BcXXX_YY_TU_list.txt and BcXXX_YY_GeneList_with_TU.txt, for each strain and chromosome/plasmid. The BcXXX_YY_TU_list.txt contains a list of all TUs and gene-info. The BcXXX_YY_GeneList_with_TU.txt contains basically the same but is sorted on the locus tags.
To run the script several input files and other perl-scripts are needed, however, everything runs automatically by typing the command 'find_TSS_TES_operon.tcsh'
The 'find_TSS_TES_operon.tcsh' - script needs the following files to work:
scriptfiles:
makeScore_10987.pl
makeScore_14579.pl
combine_52mers.perl
computerange_for_52mer.perl
combine_gene_coords_with_TU.perl
computerangesum.pm
Input files: $f = 10987 or 14579, $h = chr or pl
Bc${f}_reads.txt; This file should contain all the mapped reads, in the following tab separated format; ID Strand (FF or RF) Chromosome (Genome or plasmid name) Start_coord Stop_coord Length
Bc${f}_${h}_sorted.coords; gene coords file containing locus tag, start and stop. Genes on the reverse strand are given by decreasing coords. List should be sorted by coords, f=10987*/14579, h=chr/pl
52mer_Bc${f}_${h}.list; list of all non-unique 52mers in the genome, start and stop coords, tab seperated (may be skipped)
NOTE; these scripts are developed for B. cereus ATCC 10987 and ATCC 14579. Should one run the scripts for other organisms, the names and and the first letters of the locus tags must be changed in the find_TSS_TES_operon.tcsh script, and in the perl-scripts; makescore.pl and combine_gene_coords_with_TU.perl
************************
find_TSS_TES_operon.tcsh
************************
#!/bin/tcsh -x
##The script needs the following files to work (in the working directory):
##scriptfiles:
# makeScore_10987.pl
# makeScore_14579.pl
# combine_52mers.perl
# computerange_for_52mer.perl
# combine_gene_coords_with_TU.perl
# computerangesum.pm
##Inputfiles: $f = 10987 or 14579, $h = chr or pl
# Bc${f}_reads.txt, This file should contain all the mapped reads, in the following tab-seperated format; ID Strand (FF or RF) Chromosome (Genome or plasmid name) Start_coord Stop_coord Length
# Bc${f}_${h}_sorted.coords , gene coords file containing locus tag, start and stop. Genes on the reverse strand is given by decreasing coords. List should be sorted on coords, f=10987*/14579, h=chr/pl
# 52mer_Bc${f}_${h}.list list of all non-unique 52mers in the genome, start and stop coords, tab.seperated (Can be skipped)
#NOTE these scripts are developed for B. cereus ATCC 10987 and ATCC 14579. Should one run on another organism, the Names and and the first letters of the locus tags must be changed in this script, and in the perl-scripts; makescore.pl and combine_gene_coords_with_TU.perl
set dir = ~/cereus/Simen/RNA_seq/REDO2/TSS_and_OPERON/TEST
cd ${dir}
##Make input files with makeScore.pl:
#foreach f (10987 14579)
foreach f (10987 )
set p = ` echo $f | sed -e 's/10987/10/' | sed -e 's/14579/14/' `
./makeScore_${f}.pl Bc${f}_reads.txt temp >&! OUT.makescore_${f}
##Make Transcribed units with awk command ( units closer than or eq. 5 nt are fused, units lower than average 5 reads/bp coverage are discarded):
foreach h (chr pl)
mv -f temp_${h}.txt Bc${f}_${h}.OUT_makescore
awk -F '\t| +' 'BEGIN {c0=b=s=e=0} { if($2<1) {c0++;if(c0>=5) {b=0;average=score/score_count;if(average>5) {print s "\t" e "\t" average} score=score_count=0}} else {c0=0;if(b==0) {s=$1;b=1} else {e=$1}} score+=$2;score_count++}' Bc${f}_${h}.OUT_makescore >! Bc${f}_${h}_TranUnits.list
end
end
##Combine TU that ends in the same region of repeated 52mers
#This step can be skipped, just blank out and copy to new filename: \cp Bc${f}_${h}_TranUnits.list Bc${f}_${h}_TranUnits_52mer_merged.list
#foreach f (10987 14579)
foreach f (10987 )
foreach h (chr pl)
combine_52mers.perl Bc${f}_${h}_TranUnits.list 52mer_Bc${f}_${h}.list ${f}_${h}_merged_52mer.list ${f}_${h}.temp
sort -k 1,1n -k 2,2n ${f}_${h}.temp | uniq >! temp1
mv -f temp1 ${f}_${h}.temp
computerange_for_52mer.perl ${f}_${h}.temp ${f}_${h}.out.temp int
awk -F "\t" '{if(NF==4) {print $1 "\t" $2 "\t" $4 } else if(NF==6) {print $1 "\t" $2 "\t" $6} else {printf $1 "\t" $2 "\tNA";for(i=4;i<=NF;i++) {printf "\t"$i} printf "\n"}}' ${f}_${h}.out.temp >! Bc${f}_${h}_TranUnits_52mer_merged.list
end
end
#Combine gene coords with TUs, make genelist and TU-list
#foreach f (10987 14579)
foreach f (10987 )
foreach h (chr pl)
combine_gene_coords_with_TU.perl Bc${f}_${h}_TranUnits_52mer_merged.list Bc${f}_${h}_sorted.coords temp1 temp2
egrep 'FROM_1_TU' temp2 | sed -e 's/\(BC\?.\+[0-9][0-9][0-9][0-9]\)XXX,TU_\([0-9]\+\)\-\([0-9]\+\)YYY,MADE_FROM_1_TUs,Gene_new BC\?.\+[0-9][0-9][0-9][0-9]: \([0-9]\+\),\([0-9]\+\),\([-+]\),/\1\t\2\t\3\t\4\t\5\t\6\tQQQ\t/' | sed -e 's/\tQQQ\t\(BC\?.\+,TU_[0-9]\+-[0-9]\+\),/\tQQQ\t\1;/' -e 's/\],/\];/' -e 's/QQQ\t\(BC\?.\+[0-9][0-9][0-9].\+\),\(TU_[0-9]\+-[0-9]\+;\)/QQQ\tZZZ\t\1\t\2/' | awk -F "\t" '{print $1 "\t" $2 "\t" $3 "\t" $6 "\t" $10 "\t" $9 "\t" $4 "\t" $5 "\t" $6}' > ! genelist_1TU.temp
egrep 'NOT transcribed above cut-off' temp2 | cut -d ',' -f 4,5,6,7,8 | sed -e 's/^.\+: //' | awk -F "," '{print $4 "\t" $5 "\t" "\t" $3 "\t" "\t" $4 "\t" $1 "\t" $2 "\t" $3}' >! genelist_NO_transcribed.temp
egrep -v 'NOT transcribed above cut-off|FROM_1_TU' temp2 | sed -e 's/\(BC\?.\+[0-9][0-9][0-9][0-9]\)XXX.\+,MADE_FROM_\([0-9]\+\)_TUs/\1\t\2_TUs\t\2TUs/' -e 's/,Gene_new .\+: \([0-9]\+\),\([0-9]\+\),\([-+]\),/\t\1\t\2\t\3\t/' -e 's/,[+-],BC/,\tQQQ\tBC/g' -e 's/,[0-9]\+,[0-9]\+,/\tWWW\t/g' -e 's/,\(TU_[0-9]\+-[0-9]\+,\)\[/\tXXX\t\1\tEEE\t\[/g' -e 's/\t\t/\t/g' | awk -F "\t" '{print $7";;"$14";;"$21";;"$28";;"$35";;"$42";;"$49"\tRRR" $0}' | sed -e 's/,\tEEE\t\[/;\[/g' -e 's/\tBC/@BC/g' -e 's/\tXXX\t/@/g' | awk -F "@" '{print $1 "DDDD" $3 "DDDD"$5 "DDDD" $7 "DDDD" $9 "DDDD" $11 "DDDD" $13"DDDD"}' | sed -e 's/\tWWW\tQQQDDDD/;;/g' -e 's/DDDDTU/\tTU/' -e 's/DDD\+//' | awk -F "\t" '{print $2 "\t" $3 "\t" $4 "\t" $7 "\t" $8 "\t" $1 "\t" $5 "\t" $6 "\t" $7}' | sed -e 's/^RRR//' -e 's/;\+\t/\t/' >! genelist_2+TU.temp
cat genelist_1TU.temp genelist_NO_transcribed.temp genelist_2+TU.temp | sort -k 1,1 | awk -F "\t" 'BEGIN {print "Gene\tTU_start\tTU_end\tStrand\tTU_name_annot_comment\tGene_comment\tGene_start\tGene_end\tStrand"};{print $0}' >! Bc${f}_${h}_GeneList_with_TU.txt
cat temp1 | sed -e 's/TU subunits TU_//' | awk -F ":" '{print $1 "\t@@@" $1 "@@@\t:" $2}' | sed -e 's/\-/\t/' | awk -F "\t" '{print "TU_"$3 "\t" $1 "\t" $2 "\t" $4 }' | sed -e 's/: _R/\-\t_R/' | sed -e 's/: _F/+\t_F/' | sed -e 's/@@@//g' | sed -e 's/: /\t/' >! Bc${f}_${h}_TU_list.txt
end
end
#clean up:
mkdir SCRIPT_OUTFILES
rm -f *temp*
#foreach f (10987 14579)
foreach f (10987 )
foreach h (chr pl)
\mv ${f}_${h}_merged_52mer.list Bc${f}_${h}_TranUnits_52mer_merged.list Bc${f}_${h}_TranUnits.list Bc${f}_${h}.OUT_makescore OUT.makescore_${f} SCRIPT_OUTFILES
end
end
exit 0
******************
makeScore_10987.pl
******************
#! /usr/bin/perl
use strict;
##This program takes the output from "SOAP" and creates a file with scores..Outputs 2 files, one for scores in + strand and the other for scores in - strand..
my ($inputFile, $outputFile) = @ARGV;
open (INPUT, "$inputFile");
my @scores;
my @px;
my @scores_plus;
my @scores_minus;
my @px_plus; #score for plasmid px01 strand +
my @px_minus; #score for plasmid px01 strand -
my $i;
my $genLength = 5224283;
my $pxlen = 208369;
#@score is always started with 1 since it gets easier that way.
for ($i = 1; $i < $genLength; $i++)
{
$scores_plus[$i] = 0;
$scores_minus[$i] = 0;
}
for ($i = 1; $i < $pxlen; $i++)
{
$px_plus[$i] = 0;
$px_minus[$i] = 0;
my $line;
my @line;
while (defined ($line = ))
{
#print $line, "\n";
@line = split (/\t/, $line);
print $line[2], "\t";
print $line[1], "\t";
print $line[3], "\t";
print $line[5], "\n";
#$line[5] contains length and $line[3] contains location of first bp on the reference, counted from 1
##SIMEN note, for some reason, teh first column ($line) is skipped, so all columns should be -1 (i.e. Genome is in column 3, but in this script i say it is in 2)
if ($line[2] eq 'Genome')
{
if ($line[1] eq 'FF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{#print $i, "\t";
$scores_plus[$i]++;
}
#print "\n";
}
elsif ($line[1] eq 'RF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{
$scores_minus[$i]++;
}
}
}
elsif ($line[2] eq 'pBc10987')
{
if ($line[1] eq 'FF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{#print $i, "\t";
$px_plus[$i]++;
}
# print "\n";
}
elsif ($line[1] eq 'RF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{
$px_minus[$i]++;
}
}
}
}}
close INPUT;
#Made all the score+ and - into comments
#open (OUT, ">>Score+_genome.txt");
#for ($i = 1; $i < $genLength; $i++)
# {
# print OUT $scores_plus[$i], "\n";
#}
#close OUT;
#open (OUT, ">>Score-.txt");
#for ($i = 1; $i < $genLength; $i++)
# {
# print OUT $scores_minus[$i], "\n";
#}
#close OUT;
combineScores();
print "Complete!! :-) \n";
sub combineScores
{
my $i;
my $out1 = $outputFile."_chr.txt";
my $out2 = $outputFile."_pl.txt";
open (OUT1, ">$out1");
open (OUT2, ">$out2");
for ($i = 1; $i <= $genLength; $i++)
{
$scores[$i] = $scores_plus[$i] + $scores_minus[$i];
print OUT1 $i, "\t", $scores[$i], "\n";
}
for ($i = 1; $i <= $pxlen; $i++)
{
$px[$i] = $px_plus[$i] + $px_minus[$i];
print OUT2 $i, "\t", $px[$i], "\n";
}
close OUT1;
close OUT2;
}
******************
makeScore_14579.pl
******************
#! /usr/bin/perl
use strict;
##This program takes the output from "SOAP" and creates a file with scores..Outputs 2 files, one for scores in + strand and the other for scores in - strand..
my ($inputFile, $outputFile) = @ARGV;
open (INPUT, "$inputFile");
my @scores;
my @px;
my @scores_plus;
my @scores_minus;
my @px_plus; #score for plasmid px01 strand +
my @px_minus; #score for plasmid px01 strand -
my $i;
my $genLength = 5411809;
my $pxlen = 15274;
#@score is always started with 1 since it gets easier that way.
for ($i = 1; $i < $genLength; $i++)
{
$scores_plus[$i] = 0;
$scores_minus[$i] = 0;
}
for ($i = 1; $i < $pxlen; $i++)
{
$px_plus[$i] = 0;
$px_minus[$i] = 0;
my $line;
my @line;
while (defined ($line = ))
{
#print $line, "\n";
@line = split (/\t/, $line);
print $line[2], "\t";
print $line[1], "\t";
print $line[3], "\t";
print $line[5], "\n";
#$line[5] contains length and $line[3] contains location of first bp on the reference, counted from 1
##SIMEN note, for some reason, teh first column ($line) is skipped, so all columns should be -1 (i.e. Genome is in column 3, but in this script i say it is in 2)
if ($line[2] eq 'Genome')
{
if ($line[1] eq 'FF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{#print $i, "\t";
$scores_plus[$i]++;
}
#print "\n";
}
elsif ($line[1] eq 'RF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{
$scores_minus[$i]++;
}
}
}
elsif ($line[2] eq 'pBClin15')
{
if ($line[1] eq 'FF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{#print $i, "\t";
$px_plus[$i]++;
}
# print "\n";
}
elsif ($line[1] eq 'RF')
{
for ($i = $line[3]; $i <= $line[3]+$line[5]; $i++)
{
$px_minus[$i]++;
}
}
}
}}
close INPUT;
#Made all the score+ and - into comments
#open (OUT, ">>Score+_genome.txt");
#for ($i = 1; $i < $genLength; $i++)
# {
# print OUT $scores_plus[$i], "\n";
#}
#close OUT;
#open (OUT, ">>Score-.txt");
#for ($i = 1; $i < $genLength; $i++)
# {
# print OUT $scores_minus[$i], "\n";
#}
#close OUT;
combineScores();
print "Complete!! :-) \n";
sub combineScores
{
my $i;
my $out1 = $outputFile."_chr.txt";
my $out2 = $outputFile."_pl.txt";
open (OUT1, ">$out1");
open (OUT2, ">$out2");
for ($i = 1; $i <= $genLength; $i++)
{
$scores[$i] = $scores_plus[$i] + $scores_minus[$i];
print OUT1 $i, "\t", $scores[$i], "\n";
}
for ($i = 1; $i <= $pxlen; $i++)
{
$px[$i] = $px_plus[$i] + $px_minus[$i];
print OUT2 $i, "\t", $px[$i], "\n";
}
close OUT1;
close OUT2;
}
*******************
combine_52mers.perl
*******************
#!/usr/bin/perl -w
print "<>\n";
if( @ARGV!=4 ) {
print "usage: $0 infile1 infile2 outfile52mer outfile \n";
print "infile1: Transcribed_units; coordinate file (one coordinate pair per line)\n";
print "infile2: 52_mer file; coordinate file (one coordinate pair per line)\n";
print "outfile52mer: output file with merged 52mers\n";
print "outfile: output file with grouped TU coordinates\n";
exit 1;
}
($INFILE1, $INFILE2, $OUTFILE52MER, $OUTFILE) = @ARGV;
print "infile1=$INFILE1, infile2=$INFILE2, outfile52mer=$OUTFILE52MER, outfile=$OUTFILE\n";
open(INFILE1, "<$INFILE1") || die("cannot open infile: $!");
@TU_lines = ;
close(INFILE1) || die("cannot close infile: $!");
open(INFILE2, "<$INFILE2") || die("cannot open infile: $!");
@mer52_lines = ;
close(INFILE2) || die("cannot close infile: $!");
#print "tu=@TU_lines\n";
#print "52=@mer52_lines\n";
for($i=0;$i<@TU_lines;$i+=1) {
chomp($TU_lines[$i]);
($TU_start[$i],$TU_end[$i],$TU_coverage[$i])=split(/\t/,$TU_lines[$i]);
}
for($i=0;$i<@mer52_lines;$i+=1) {
chomp($mer52_lines[$i]);
($mer52_start[$i],$mer52_end[$i])=split(/\t/,$mer52_lines[$i]);
}
#merge consecutive 52mers
@merged_52mer_start = @merged_52mer_end = ();
$m = $found = 0;
for($i=1;$i<@mer52_lines;$i+=1) {
if($mer52_start[$i]==($mer52_start[$i-1]+1)) { $found++; $merged_52mer_start[$m]=$mer52_start[$i-$found]; $merged_52mer_end[$m]=$mer52_end[$i]; }
else { $m++; $merged_52mer_start[$m]=$mer52_start[$i]; $merged_52mer_end[$m]=$mer52_end[$i]; $found = 0; }
}
open(OUTFILE52MER, ">$OUTFILE52MER") || die("cannot open outfile52mer: $!");
#for($m=0;$m<@merged_52mer_start;$m+=1) { splice(@merged_52mer_start,$m,1),splice(@merged_52mer_end,$m,1),$m-- unless( defined($merged_52mer_start[$m]) ); }
for($m=0;$m<@merged_52mer_start;$m+=1) { print OUTFILE52MER "$merged_52mer_start[$m]\t$merged_52mer_end[$m]\n"; }
print "$m merged 52mers\n";
close(OUTFILE52MER) || die("cannot close outfile52mer: $!");
$"="\t";
$k=0;
for($i=0;$i<@merged_52mer_start;$i+=1) {
for($j=0;$j<@TU_lines-1;$j+=1) {
if( ($merged_52mer_start[$i]<=$TU_end[$j]) && ($merged_52mer_end[$i]>=$TU_start[$j+1]) ) {
$new_TU_start[$k]=$TU_start[$j];
$new_TU_end[$k]=$TU_end[$j+1];
push(@{$data[$k]}, $TU_lines[$j], $TU_lines[$j+1]);
print "j=$j, f1=$merged_52mer_start[$i],$merged_52mer_end[$i], $TU_start[$j], $TU_end[$j+1], nS=$new_TU_start[$k], nE=$new_TU_end[$k]\n";
for($c=$new_TU_start[$k];$c<=$new_TU_end[$k];$c++) { $used_coords{$c} = 1; }
$k=$k+1;
}
}
}
open(OUTFILE, ">$OUTFILE") || die("cannot open outfile: $!");
for($k=0;$k<@new_TU_start;$k++) {
print OUTFILE "$new_TU_start[$k]\t$new_TU_end[$k]\t@{$data[$k]}\n";
}
for($k=0;$k<@TU_lines;$k++) { print OUTFILE "$TU_start[$k]\t$TU_end[$k]\t$TU_start[$k]\t$TU_end[$k]\t$TU_coverage[$k]\n" unless( exists($used_coords{$TU_start[$k]}) ); }
close(OUTFILE) || die("cannot close outfile: $!");
exit 0;
***************************
computerange_for_52mer.perl
***************************
#!/usr/bin/perl -w
use computerangesum;
print "<>\n";
if( @ARGV!=3 ) {
print "usage: $0 infile outfile integer\n";
print "infile: coordinate file (one coordinate pair per line)\n";
print "outfile: output file with grouped coordinates\n";
print "integer: integer or floating-point numbers (enter int or flo)\n";
exit 1;
}
($INFILE, $OUTFILE, $INTEGER) = @ARGV;
print "infile=$INFILE, outfile=$OUTFILE, integer=$INTEGER\n";
open(INFILE, "<$INFILE") || die("cannot open infile: $!");
@inlines = ;
@range = %strand = ();
$j = 0;
foreach $inline ( @inlines ) {
@fields = split(/\s|-/, $inline, 3);
$start = $fields[0];
$end = $fields[1];
$data[$j] = $fields[2];
chomp($data[$j]);
$start_coords{$start} = $j;
$end_coords{$end} = $j;
#for($i=2;$i<@fields;$i++) { $data[$j] .= "\t".$fields[$i]; }
print "ln=$j, s=$start, e=$end, d=$data[$j]\n";
$strand{$start} = $strand{$end} = "+";
if( $start>$end ) {
$start = $fields[1];
$end = $fields[0];
$strand{$start} = $strand{$end} = "-";
}
push(@range, $start, $end);
$j++;
}
close(INFILE) || die("cannot close infile: $!");
#foreach ( keys(%start_coords) ) { print "$_, $start_coords{$_}\n";}
$numranges = @range;
print "numranges=$numranges\n";
@rangesum = &PrepareRangeSum(\@range, $INTEGER);
print "rangesum=@rangesum\n";
open(OUTFILE, ">$OUTFILE") || die("cannot open outfile: $!");
for($i=0;$i<@rangesum;$i+=2) {
print "$rangesum[$i], $rangesum[$i+1]\n";
if( $strand{$rangesum[$i]} eq "+" ) {
$newdata = "";
for($j=$start_coords{$rangesum[$i]};$j<=$end_coords{$rangesum[$i+1]};$j++) {
print "j1=$rangesum[$i], $start_coords{$rangesum[$i]}, j2=$rangesum[$i+1], $end_coords{$rangesum[$i+1]}, d($j)=$data[$j]\n";
$newdata .= "\t".$data[$j];
}
printf OUTFILE ("%d\t%d\t%s\n", $rangesum[$i], $rangesum[$i+1], $newdata);
}
else { printf OUTFILE ("%d\t%d\n", $rangesum[$i+1], $rangesum[$i]); }
}
$numranges = @rangesum;
print "numranges=$numranges\n";
close(OUTFILE) || die("cannot close outfile: $!");
exit 0;
********************************
combine_gene_coords_with_TU.perl
********************************
#!/usr/bin/perl -w
print "<>\n";
if( @ARGV!=4 ) {
print "usage: $0 infile1 infile2 outfile1 outfile2\n";
print "infile1: Transcribed_units; coordinate file (one coordinate pair per line)\n";
print "infile2: Gene coordinate file (one coordinate pair per line)\n";
print "outfile1: output file with operon coordinates with gene names (sortet on the operon)\n";
print "outfile2: output file genelist with name of which operon they belong (sortet on the genes)\n";
exit 1;
}
($INFILE1, $INFILE2, $OUTFILE1, $OUTFILE2) = @ARGV;
print "infile1=$INFILE1, infile2=$INFILE2, outfile1=$OUTFILE1, outfile2=$OUTFILE2\n";
open(INFILE1, "<$INFILE1") || die("cannot open infile: $!");
@TU_lines = ;
close(INFILE1) || die("cannot close infile: $!");
open(INFILE2, "<$INFILE2") || die("cannot open infile: $!");
@genes_lines = ;
close(INFILE2) || die("cannot close infile: $!");
#print "tu=@TU_lines\n";
#print "52=@genes_lines\n";
for($i=0;$i<@TU_lines;$i+=1) {
chomp($TU_lines[$i]);
($TU_start[$i],$TU_end[$i],$TU_coverage[$i])=split(/\t/,$TU_lines[$i]);
}
for($i=0;$i<@genes_lines;$i+=1) {
chomp($genes_lines[$i]);
($genes_name[$i],$start,$end)=split(/\t/,$genes_lines[$i]);
if($start>$end) { $genes_start[$i]=$end; $genes_end[$i]=$start; $genes_strand[$i]="-"; }
else { $genes_start[$i]=$start; $genes_end[$i]=$end; $genes_strand[$i]="+"; }
}
$"=",";
%TU = %genes_data = ();
for($i=0;$i<@genes_lines;$i+=1) { @{$genes_data{$genes_name[$i]."__".$genes_start[$i]."__".$genes_end[$i]."__".$genes_strand[$i]}}=(); }
for($j=0;$j<@TU_lines;$j+=1) { @{$TU{"TU_".$TU_start[$j]."-".$TU_end[$j]}}=(); }
for($i=0;$i<@genes_lines;$i+=1) {
for($j=0;$j<@TU_lines;$j+=1) {
if( ($TU_start[$j]<=$genes_start[$i]) && ($TU_end[$j]>=$genes_end[$i]) ) {
print "1: $TU_start[$j], $TU_end[$j], $genes_name[$i], $genes_start[$i], $genes_end[$i]\n";
push(@{$TU{"TU_".$TU_start[$j]."-".$TU_end[$j]}}, $genes_name[$i], $genes_start[$i], $genes_end[$i], $genes_strand[$i]);
push(@{$genes_data{$genes_name[$i]."__".$genes_start[$i]."__".$genes_end[$i]."__".$genes_strand[$i]}}, "TU_".$TU_start[$j]."-".$TU_end[$j]."_gene inside TU (1)");
#last;
}
if( ($TU_start[$j]>$genes_start[$i]) && ($TU_end[$j]<$genes_end[$i]) ) {
print "2: $TU_start[$j], $TU_end[$j], $genes_name[$i], $genes_start[$i], $genes_end[$i]\n";
push(@{$TU{"TU_".$TU_start[$j]."-".$TU_end[$j]}}, $genes_name[$i]."_TU_inside_gene", $genes_start[$i], $genes_end[$i], $genes_strand[$i]);
push(@{$genes_data{$genes_name[$i]."__".$genes_start[$i]."__".$genes_end[$i]."__".$genes_strand[$i]}}, "TU_".$TU_start[$j]."-".$TU_end[$j]."_TU inside gene (2)");
}
if( ($TU_start[$j]>$genes_start[$i]) && ($TU_start[$j]<$genes_end[$i]) && ($TU_end[$j]>=$genes_end[$i]) ) {
print "3: $TU_start[$j], $TU_end[$j], $genes_name[$i], $genes_start[$i], $genes_end[$i]\n";
push(@{$TU{"TU_".$TU_start[$j]."-".$TU_end[$j]}}, $genes_name[$i]."_TSS_within", $genes_start[$i], $genes_end[$i], $genes_strand[$i]);
push(@{$genes_data{$genes_name[$i]."__".$genes_start[$i]."__".$genes_end[$i]."__".$genes_strand[$i]}}, "TU_".$TU_start[$j]."-".$TU_end[$j]."_TSS inside gene (3)");
#last;
}
if( ($TU_start[$j]<=$genes_start[$i]) && ($TU_end[$j]>$genes_start[$i]) && ($TU_end[$j]<$genes_end[$i]) ) {
print "4: $TU_start[$j], $TU_end[$j], $genes_name[$i], $genes_start[$i], $genes_end[$i]\n";
push(@{$TU{"TU_".$TU_start[$j]."-".$TU_end[$j]}}, $genes_name[$i]."_TU_end_inside", $genes_start[$i], $genes_end[$i], $genes_strand[$i]);
push(@{$genes_data{$genes_name[$i]."__".$genes_start[$i]."__".$genes_end[$i]."__".$genes_strand[$i]}}, "TU_".$TU_start[$j]."-".$TU_end[$j]."_TU_end inside gene (4)");
#last;
}
}
#if(@{$TU{$j}}==0){ print "Nothing for $gene $i and TU $j\n"; exit 1 ; }
}
foreach $j ( keys(%TU) ) {
@{$TU_new_boundaries{$j}} = ();
$begin = 0;
$ii = 1;
for($i=3;$i<@{$TU{$j}};$i+=4) {
# if (($i+4)<=@{$TU{$j}}) {
if( @{$TU{$j}}>4 && $TU{$j}[$i] ne $TU{$j}[$i+4] ) {
if( ($i+4)<=@{$TU{$j}} && $TU{$j}[$i] eq "+" ) { push(@{$TU_new_boundaries{$j}}, $TU{$j}[$i-1]); }
if( ($i+4)<=@{$TU{$j}} && $TU{$j}[$i] eq "-" ) { push(@{$TU_new_boundaries{$j}}, $TU{$j}[$i-1]); }
if( ($i+4)<=@{$TU{$j}} && $TU{$j}[$i+4] eq "+" ) { push(@{$TU_new_boundaries{$j}}, $TU{$j}[$i+4-2]); }
if( ($i+4)<=@{$TU{$j}} && $TU{$j}[$i+4] eq "-" ) { push(@{$TU_new_boundaries{$j}}, $TU{$j}[$i+4-2]); }
if($begin>0 && ($TU{$j}[$begin+3] eq "+")) { push(@{$TU_annotations{$j."-".$ii}}, "_F_new_start");}
if($begin>0 && ($TU{$j}[$begin+3] eq "-")) { push(@{$TU_annotations{$j."-".$ii}}, "_R_new_end");}
if(($i+4)<@{$TU{$j}} && $TU{$j}[$i] eq "+") { push(@{$TU_annotations{$j."-".$ii}}, "_F_new_end");}
if(($i+4)<@{$TU{$j}} && $TU{$j}[$i] eq "-") { push(@{$TU_annotations{$j."-".$ii}}, "_R_new_start");}
print "beg=$begin, i=$i\n";
#if (($i+4)<=@{$TU{$j}}) {
for($k=$begin;$k<=$i;$k++) {
push(@{$TU_annotations{$j."-".$ii}}, $TU{$j}[$k]);
print "b=$k,$TU{$j}[$k]\n";
}
#}
$begin=($i+1);
$ii++;
print "boundaries($j)=@{$TU_new_boundaries{$j}}, k=$k, b=$begin\n";
}
#}
print "beg=$begin, i=$i\n";
}
$num_subTUs = $ii-1;
print "nsu=$num_subTUs\n";
# split TUs with genes in opposite strands into subTUs
for($k=1;$k<=$num_subTUs;$k++) {
$TU_index = $j."-".$k;
$numdata = @{$TU_annotations{$TU_index}};
#($name, $start, $end) = split(/_|-/,$j);
($name, $coords) = split(/TU_/,$j);
($start, $end) = split(/\-/,$coords);
print "tub($j)=@{$TU_new_boundaries{$j}}, tus=$start, tue=$end\n";
print "tua1=@{$TU_annotations{$TU_index}}\n";
$numannot = 0;
foreach $a (@{$TU_annotations{$TU_index}}) { $numannot++ if($a=~/new/); }
if($numannot==0) { last; }
elsif($numannot==1) {
if($TU_annotations{$TU_index}[2] <= $start || $k==1 ) { $new_start = $start; }
else { $new_start = $TU_annotations{$TU_index}[2]; }
}
elsif($numannot==2) {
if($TU_annotations{$TU_index}[3] <= $start || $k==1 ) { $new_start = $start; }
else { $new_start = $TU_annotations{$TU_index}[3]; }
}
else { print "Error: more than 2 annotattion fields in $TU_index"; exit 1;}
if($TU_annotations{$TU_index}[$numdata-2] >= $end || $k==$num_subTUs ) { $new_end = $end; }
else { $new_end = $TU_annotations{$TU_index}[$numdata-2]; }
@new_boundaries = ($new_start, @{$TU_new_boundaries{$j}}, $new_end);
print "tub2($j)=@new_boundaries, tus2=$new_start, tue2=$new_end\n";
$TU_new_name = "TU_".$new_boundaries[($k*2)-2]."-".$new_boundaries[($k*2)-1];
push(@{$TU_subunits{$TU_new_name}}, @{$TU_annotations{$TU_index}}, "DERIVED FROM ".$j);
print "tusu($TU_new_name)=@{$TU_subunits{$TU_new_name}}\n";
# Extract gene info
$numgenes = 0;
@genelist = ();
foreach $g (@{$TU_subunits{$TU_new_name}}) {
$numgenes++, push(@genelist, $g) if($g=~/B/);
}
print "ng=$numgenes\n";
$gene_index = 1;
for($g=0;$g<@{$TU_subunits{$TU_new_name}};$g++) {
print "g=$g, gi=$gene_index, tug=$TU_subunits{$TU_new_name}[$g], ";
$genelist = join(",", @genelist);
if( $TU_subunits{$TU_new_name}[$g]=~/B/ && $gene_index==1 && (($TU_subunits{$TU_new_name}[0] eq "_F_new_start") || ($TU_subunits{$TU_new_name}[1] eq "_F_new_start")) ) { print "here1, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}}, $TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g]."_F_new_start", $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
elsif( $TU_subunits{$TU_new_name}[$g]=~/B/ && $gene_index==1 && (($TU_subunits{$TU_new_name}[0] eq "_R_new_end") || ($TU_subunits{$TU_new_name}[1] eq "_R_new_end")) ) { print "here2, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}}, $TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g]."_R_new_end", $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
elsif( $TU_subunits{$TU_new_name}[$g]=~/B/ && $gene_index==($numgenes) && (($TU_subunits{$TU_new_name}[0] eq "_R_new_start") || ($TU_subunits{$TU_new_name}[1] eq "_R_new_start")) ) { print "here3, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}}, $TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g]."_R_new_start", $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
elsif( $TU_subunits{$TU_new_name}[$g]=~/B/ && $gene_index==($numgenes) && (($TU_subunits{$TU_new_name}[0] eq "_F_new_end") || ($TU_subunits{$TU_new_name}[1] eq "_F_new_end")) ) { print "here4, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}},$TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g]."_F_new_end", $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
elsif( $TU_subunits{$TU_new_name}[$g]=~/B/ && ($gene_index==1 || $gene_index==($numgenes)) && (($TU_subunits{$TU_new_name}[0] !~ /new/) || ($TU_subunits{$TU_new_name}[1] !~ /new/)) ) { print "here5, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}}, $TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g], $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
elsif( $TU_subunits{$TU_new_name}[$g]=~/B/ && ($gene_index>0 && $gene_index<($numgenes)) ) { print "here6, "; push(@{$new_genes_data{$TU_subunits{$TU_new_name}[$g]}},$TU_subunits{$TU_new_name}[$g+1], $TU_subunits{$TU_new_name}[$g+2], $TU_subunits{$TU_new_name}[$g+3], $TU_subunits{$TU_new_name}[$g], $TU_new_name, "[".$genelist."]", $TU_subunits{$TU_new_name}[-1]); $gene_index++; }
else { print "NEW CASE\n" if($TU_subunits{$TU_new_name}[$g]=~/B/); }
}
}
$Z=0;
# count if all genes are on + strand
for($i=3;$i<@{$TU{$j}};$i+=4) {
if( $TU{$j}[$i] eq "+") {$Z=$Z+1;}
}
if($Z==@{$TU{$j}}/4) {
@{$TU_subunits{$j}} = ("_F", @{$TU{$j}});
@{$TU_annotations{$j}} = ("_F", @{$TU{$j}});
@genelist = ();
foreach $g (@{$TU_subunits{$j}}) {
push(@genelist, $g) if($g=~/B/);
}
for($g=0;$g<@{$TU_subunits{$j}};$g++) {
print "g=$g, tug=$TU_subunits{$j}[$g], ";
$genelist = join(",", @genelist);
push(@{$new_genes_data{$TU_subunits{$j}[$g]}}, $TU_subunits{$j}[$g+1], $TU_subunits{$j}[$g+2], $TU_subunits{$j}[$g+3], $TU_subunits{$j}[$g], $j, "[".$genelist."]") if( $TU_subunits{$j}[$g]=~/B/ );
}
}
$Z=0;
# count if all genes are on - strand
for($i=3;$i<@{$TU{$j}};$i+=4) {
if( $TU{$j}[$i] eq "-") {$Z=$Z+1;}
}
if($Z==@{$TU{$j}}/4) {
@{$TU_subunits{$j}} = ("_R", @{$TU{$j}});
@{$TU_annotations{$j}} = ("_R", @{$TU{$j}});
@genelist = ();
foreach $g (@{$TU_subunits{$j}}) {
push(@genelist, $g) if($g=~/B/);
}
for($g=0;$g<@{$TU_subunits{$j}};$g++) {
print "g=$g, tug=$TU_subunits{$j}[$g], ";
$genelist = join(",", @genelist);
push(@{$new_genes_data{$TU_subunits{$j}[$g]}}, $TU_subunits{$j}[$g+1], $TU_subunits{$j}[$g+2], $TU_subunits{$j}[$g+3], $TU_subunits{$j}[$g], $j, "[".$genelist."]") if( $TU_subunits{$j}[$g]=~/B/ );
}
}
# mark TU not containing genes
if(@{$TU{$j}}==0){
@{$TU_subunits{$j}} = ("_No gene within TU");
@{$TU_annotations{$j}} = ("_No gene within TU");
}
}
$totgenes=keys(%genes_data);
print "totgenes=$totgenes\n";
foreach $gene ( keys(%genes_data) ) {
@fields = split(/__/, $gene);
$name = $fields[0];
print "n=$name\n";
if( @{$genes_data{$gene}}==0 ) {
print "empty=$name, $gene\n";
@{$new_genes_data{$name}} = ($fields[1],$fields[2],$fields[3],$fields[0], "NOT transcribed above cut-off");
}
}
$newtotgenes=keys(%new_genes_data);
print "newtotgenes=$newtotgenes\n";
open(OUTFILE1, ">$OUTFILE1") || die("cannot open outfile1: $!");
foreach $k ( sort {$TU{$a} cmp $TU{$b}} keys(%TU) ) {
# print OUTFILE1 "TU $k: @{$TU{$k}}\n";
}
foreach $k ( sort {$TU_annotations{$a} cmp $TU_annotations{$b}} keys(%TU_annotations) ) {
# print OUTFILE1 "TU_annot $k: @{$TU_annotations{$k}}\n";
}
foreach $k ( sort {$TU_new_boundaries{$a} cmp $TU_new_boundaries{$b}} keys(%TU_new_boundaries) ) {
# print OUTFILE1 "TU boundaries $k: @{$TU_new_boundaries{$k}}\n";
}
foreach $k ( sort {$TU_subunits{$a} cmp $TU_subunits{$b}} keys(%TU_subunits) ) {
print OUTFILE1 "TU subunits $k: @{$TU_subunits{$k}}\n";
}
close(OUTFILE1) || die("cannot close outfile1: $!");
open(OUTFILE2, ">$OUTFILE2") || die("cannot open outfile2: $!");
foreach $k ( sort {$genes_data{$a} cmp $genes_data{$b}} keys(%genes_data) ) {
# print OUTFILE2 "Gene $k: @{$genes_data{$k}}\n";
}
foreach $k ( sort {$new_genes_data{$a} cmp $new_genes_data{$b}} keys(%new_genes_data) ) {
#print OUTFILE2 "Gene_new $k: @{$new_genes_data{$k}}\n";
#for($i=0;$i<@{$new_genes_data{$k}};$i++) { print OUTFILE2 "i($i)==$new_genes_data{$k}[$i]\n"; }
$k=~m/(BCE_A?[0-9]{4}a?|BC(_p)?[0-9]{4})/;
$rootname=$1;
push(@{$new2_genes_data{$rootname}},@{$new_genes_data{$k}});
}
foreach $k ( sort {$new2_genes_data{$a} cmp $new2_genes_data{$b}} keys(%new2_genes_data) ) {
@tucoords = ();
$numtus = 0;
for($i=0;$i<@{$new2_genes_data{$k}};$i++) { if($new2_genes_data{$k}[$i]=~/^TU_\d+-\d+$/) { $new2_genes_data{$k}[$i]=~m/(\d+)-(\d+)/; push(@tucoords,$1,$2); $numtus++; } }
@tucoords=(0,0) if(@tucoords==0);
print "tuc=@tucoords\n";
@tucoords_sorted=sort {$a<=>$b} @tucoords;
$k=~m/(BCE_A?[0-9]{4}a?|BC(_p)?[0-9]{4})/;
$rootname=$1;
print OUTFILE2 "${rootname}XXX,TU_$tucoords_sorted[0]-$tucoords_sorted[-1]YYY,MADE_FROM_${numtus}_TUs,Gene_new $k: @{$new2_genes_data{$k}}\n";
}
close(OUTFILE2) || die("cannot close outfile2: $!");
exit 0;
******************
computerangesum.pm
******************
package computerangesum;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(PrepareRangeSum);
# input the range boundaries into the function which computes the range sum
sub PrepareRangeSum
{
my $range = $_[0]; my $integer = $_[1];
my ($i, $j, %Start, %End, @range);
%Start = %End = (); @range = @$range;
$j = 0;
for($i=0;$i<@range;$i+=2) {
$Start{$j} = $range[$i];
$End{$j} = $range[$i+1];
$j++;
}
#print "fS="; foreach ( keys(%Start) ) { print "$_:$Start{$_}-$End{$_} "; } print "\n";
# compute the sum of all the previously computed ranges
@RangeSum = &ComputeRangeSum(\%Start, \%End, $integer);
#print "rangesum=@RangeSum\n";
return @RangeSum;
}
# compute and return the overall range that is the sum of several ranges
sub ComputeRangeSum
{
# the computation of the range sum is done as follows: rank the ranges in increasing order according to their start positions. Take the first range and, if there are ranges whose start position is within the first range, then select among them the range which has the highest end position. Combine and remove those 2 ranges, i.e., create a new range whose boundaries are the start position of the first range and the end position of the selected range. Then repeat the whole procedure starting with that combined range. If no range can be selected at a given step, then leave the range as it is and restart the procedure with the next range in the order (i.e., the range sum may be made up of several parts).
my ($Start, $End, $integer) = @_;
my ($i, $j, @start, @end, $startsum, $endsum);
my @RangeSum = ();
# rank the ranges in increasing order according to their start positions
START: @start = sort { $$Start{$a}<=>$$Start{$b} } ( keys(%$Start) );
if( @start!=0 ) {
$startsum = $$Start{$start[0]}; $endsum = $$End{$start[0]};
#print "startsum=$startsum, endsum=$endsum\n";
# among the ranges whose start position is within the starting range, select the range which has the highest end position
@end = sort { $$End{$b}<=>$$End{$a} } ( keys(%$End) );
ENDLOOP: for($i=0;$i<@end;$i++) {
#print "$$End{$end[$i]}, $$Start{$end[$i]}\n";
# if no range can be selected, then restart from the next range in the order
if( $$End{$end[$i]}<=$endsum ) {
for($j=$i;$j<@end;$j++) { delete($$End{$end[$j]}); delete($$Start{$end[$j]}); }
#print "new startsum=$startsum, new endsum=$endsum\n";
push(@RangeSum, $startsum, $endsum);
goto START;
}
# if a range can be selected, then compute the combined range
#if( $$Start{$end[$i]}<=$endsum ) { $endsum = $$End{$end[$i]}; print "extended endsum=$endsum\n"; goto ENDLOOP; }
if( $integer=~/flo/i && $$Start{$end[$i]}<=$endsum ) { $endsum = $$End{$end[$i]}; goto ENDLOOP; }
if( $integer=~/int/i && $$Start{$end[$i]}<=$endsum+1 ) { $endsum = $$End{$end[$i]}; goto ENDLOOP; }
}
}
return @RangeSum;
}
1;