#!/usr/bin/perl -w


# gack3628 removes deprecated functions - only pcl and cdt files are
# supported, and binomial smoothing is removed as it was never developed

# gack3629 removed unnecessary commented code to reduce size

# gack3630 removed unnecessary code (file input types, EPP difference
# outputs), implements GD histogram output

use Tk;
use Tk::FileSelect;
use POSIX;

use Tk::ErrorDialog;

# Un-comment these for compilation with Perlapp
use Tk::Button;
use Tk::Checkbutton;
use Tk::Frame;
use Tk::Entry;
use Tk::Radiobutton;
use Tk::Scale;
use Tk::Scrollbar;
use Tk::Text;
use Tk::Label;
use Tk::LabEntry;

use GD;
use GD::Graph;
use GD::Graph::lines;
use Math::Round;

my $mw = MainWindow->new;
$mw->title("GACK");
$mw->Label(-text=>"Genomotyping Analysis by Charlie Kim")->pack;

# MAIN FRAMES #

$logfr = $mw->Frame(-relief=>'groove',
		       -borderwidth=>2,
		       )->pack(-side=>'right',
			       -anchor=>'ne',
			       -fill=>'y',
			       );

$inputfr = $mw->Frame(-relief=>'groove',
		      -borderwidth=>2,
		      )->pack(-side=>'top',
			      -anchor=>'nw',
			      -fill=>'both',
			      );

$secondrowfr = $mw->Frame(-borderwidth=>0,
			  )->pack(-side=>'top',
				  -anchor=>'nw',
				  -fill=>'both',
				  );

$rawdatafr = $secondrowfr->Frame(-relief=>'groove',
			-borderwidth=>2,
			)->pack(-side=>'left',
				-anchor=>'nw',
				-fill=>'both',
				-expand=> 1,
				);

$histofr = $secondrowfr->Frame(-relief=>'groove',
			       -label=>'Data Histogram',
			       -borderwidth=>2,
			       )->pack(-side=>'right',
				       -anchor=>'nw',
				       -fill=>'both',
				       );

$smoothfr = $mw->Frame(-relief=>'groove',
		       -label=>'Data Smoothing',
		       -borderwidth=>2,
		       )->pack(-side=>'top',
			       -anchor=>'nw',
			       -fill=>'both',
			       );

$modelfr = $mw->Frame(-relief=>'groove',
		      -label=>'Peak Modeling',
		      -borderwidth=>2,
		      )->pack(-side=>'top',
			      -anchor=>'nw',
			      -fill=>'both',
			      );

$outfilefr = $mw->Frame(-relief=>'groove',
		    -borderwidth=>2,
		    )->pack(-side=>'top',
			    -anchor=>'nw',
			    -fill=>'both',
			    );

# LOG FRAME #
$logopt = $logfr->Frame(-relief=>'groove',
			-borderwidth=>2,
			)->pack(-fill=>'both',
				);
$logging = 1;
$logopt->Checkbutton(-text=>"Create logfile",
		     -variable=> \$logging,
		     -command=> \&logstatus,
		     )->pack(-side=>'left',
			     -anchor=>'nw',
			     );

$logfile = 'gacklog.txt';
$logfile_e = $logopt->Entry(-textvariable=> \$logfile,
			    )->pack(-side=>'left',
				    -anchor=>'nw',
				    -pady=> 2,
				    );

$logfile_b = $logopt->Button(-text=>'Browse',
			     -command=> \&selectlogfile,
			     -borderwidth=> 1,
			     -padx=> 0,
			     -pady=> 0,
			     )->pack(-side=>'left',
				     -anchor=>'nw',
				     );

$log_f = $logfr->Frame(-relief=>'groove',
			-borderwidth=>2,
			)->pack(-fill=>'both');
$logwin = $log_f->Scrolled("Text")->pack(-fill=>'both');
$logwin->configure(-height=>20,
		   -width=>40,
		   -wrap=>'none',
		   -state=>'disabled',
		   );


$logfr->Button(-text=>"Exit",
		   -command=> sub { exit },
		   )->pack(-side=>'right',
			   );

$rungack_b = $logfr->Button(-text=>"Run GACK",
			    -command=> \&gackmain,
			    -state=> 'normal',
			    )->pack(-side=>'left',
#					-padx=>20,
				    );


# OPTION FRAMES #

# input data #

$incol2fr = $inputfr->Frame(#-relief=>'groove',
			    -borderwidth=>2,
			    )->pack(-side=>'left',
				    -fill=>'both',
				    );
$incol3fr = $inputfr->Frame(#-relief=>'groove',
			    -borderwidth=>2,
			    )->pack(-side=>'left',
				    -fill=>'both',
				    );
$indata = 'pclfile';

$pclfile_e = $incol2fr->LabEntry(-label=>'Input pcl or cdt File',
				 -labelPack=> [qw/-side left -anchor w/],
				 -textvariable=> \$pcl_file,
				 -width=>30,
				 -background=>'white',
				 )->pack(
					 -pady=>3,
					 );

$pclfile_b = $incol3fr->Button(-text=>'Browse',
			       -command=> \&selectpclfile,
			       -state=>'normal',
			       -borderwidth=>1,
			       -padx=> 0,
			       -pady=> 0,
			       )->pack(
				       );

# file extensions

#$inext = 'txt';
$outext = 'gck';

# binsize
$bincol1fr = $histofr->Frame(-borderwidth=>2,
			     )->pack(-side=>'left',
				     -fill=>'both',
				     );
$bincol2fr = $histofr->Frame(-borderwidth=>2,
			     )->pack(-side=>'left',
				     -fill=>'both',
				     );
$binsize = 0.1;
$bincol1fr->Label(-text=>'Bin Size',
		  )->pack(-side=>'left',
			  );
$bincol2fr->Scale(-from=>0.01,
		  -to=>0.5,
		  -resolution=>0.01,
		  -variable=> \$binsize,
		  -width=>10,
		  -borderwidth=>1,
		  -orient=>'horizontal',
		  )->pack(-anchor=>'nw');

$rawdataout = 0;
$rawdatafr->Checkbutton(-text=>'Generate raw histogram data',
			-variable=> \$rawdataout,
			)->pack(-side=>'top',
				-anchor=>'nw',
				);

$graphout = 0;
$rawdatafr->Checkbutton(-text=>'Generate graphs',
			-variable=> \$graphout,
			)->pack(-side=>'top',
				-anchor=>'nw',
				);

# smoothing
$smoothcol1fr = $smoothfr->Frame(-borderwidth=>2,
				 )->pack(-side=>'left',
					 -fill=>'both',
					 );
$smoothcol2fr = $smoothfr->Frame(-borderwidth=>2,
				 )->pack(-side=>'left',
					 -fill=>'both',
					 );

$smoothtype = 0;
$smoothwin = 1;
$smoothcol1fr->Radiobutton(-text=>"No Smoothing",
			   -value=>0,
			   -variable=>\$smoothtype,
			   -command=> \&smoothopts,
			   -borderwidth=>0,
			   -pady=> 0,
			   )->pack(-anchor=>'nw',
				   );
$smoothcol1fr->Radiobutton(-text=>"Moving Window Average",
			   -value=>1,
			   -variable=>\$smoothtype,
			   -command=> \&smoothopts,
			   -anchor=>'nw',
			   -borderwidth=>0,
			   -pady=> 0,
			   )->pack(-anchor=>'nw');

$smoothcol2fr->Label(-text=>'',
		     -borderwidth=> 0,
		     )->pack(-anchor=>'nw',
			     -pady=> 3,
			     );

@windowsizes = (3,5,7,9);

# moving average window
$mawframe = $smoothcol2fr->Frame()->pack(-side=>'top',
					 -anchor=>'nw',
					 );
%maw = ();
foreach (@windowsizes) {
    $maw{$_} = $mawframe->Radiobutton(-text=>$_,
				      -value=>$_,
				      -variable=> \$mawsmoothwin,
				      -state=>'disabled',
				      -borderwidth=>0,
				      -pady=> 0,
				      -anchor=>'nw',
				      )->pack(-side=>'left',
					      -anchor=>'nw',
					      );
}

# fake data type
$faketype = 1;
$modelfr->Radiobutton(-text=>"Positive-Side Mirroring",
		      -value=>0,
		      -variable=>\$faketype,
		      -anchor=>'nw',
		      -borderwidth=>0,
		      -pady=> 0,
		      )->pack(-anchor=>'nw');
$modelfr->Radiobutton(-text=>"Normal Curve",
		      -value=>1,
		      -variable=>\$faketype,
		      -anchor=>'nw',
		      -borderwidth=>0,
		      -pady=> 0,
		      )->pack(-anchor=>'nw');

$EPPout = 1;
$EPP_ch = $outfilefr->Checkbutton(-text=> "Graded output",
				-variable=> \$EPPout,
				)->grid(-row=>0,-column=>0,-sticky=>'w');

#$outfilefr->Label(-text=> "Sensitivity",
#	       )->grid(-row=>0,-column=>2);
$EPPsensi = 50;
#$EPP_sc = $outfilefr->Scale(-from=>0,
#			  -to=>100,
#			  -resolution=>1,
#			  -orient=>'horizontal',
#			  -state=>'disabled',
#			  -width=>10,
#			  -length=>200,
#			  -variable=> \$EPPsensi,
#			  )->grid(-row=>0,-column=>1);

$binout = 0;
$outfilefr->Checkbutton(-text=> "Binary output",
			-variable=> \$binout,
			)->grid(-row=>1,-col=>0,-sticky=>'w');

$outfilefr->Label(-text=> "Binary %EPP Cutoff",
		  )->grid(-row=>1,-col=>2);

$binoutcut = 50;
$outfilefr->Scale(-from=>0,
		  -to=>100,
		  -resolution=>5,
		  -orient=>'horizontal',
		  -width=>10,
		  -variable=> \$binoutcut,
		  )->grid(-row=>1,-column=>1);

$triout = 0;
$outfilefr->Checkbutton(-text=> "Trinary output",
			-variable=> \$triout,
			)->grid(-row=>2,-col=>0,-sticky=>'w');
$outfilefr->Label(-text=> "Trinary %EPP Cutoff 1",
		  )->grid(-row=>2,-col=>2);
$outfilefr->Label(-text=> "Trinary %EPP Cutoff 2",
		  )->grid(-row=>3,-col=>2);

$trioutcut1 = 0;
$trioutcut2 = 100;
$outfilefr->Scale(-from=>0,
		  -to=>100,
		  -resolution=>5,
		  -orient=>'horizontal',
		  -width=>10,
		  -variable=> \$trioutcut1,
		  )->grid(-row=>2,-column=>1);
$outfilefr->Scale(-from=>0,
		  -to=>100,
		  -resolution=>5,
		  -orient=>'horizontal',
		  -width=>10,
		  -variable=> \$trioutcut2,
		  )->grid(-row=>3,-column=>1);

MainLoop;


###############
# SUBROUTINES #
###############

sub errormsg {
    my $msg = shift;
    my $errorwin = $mw->Toplevel();
#    print "\a";
    $errorwin->bell;
    $errorwin->Label(-text=> $msg,
		     )->pack();
    $errorwin->Button(-text=>'Close',
		      -command=> sub { $errorwin->destroy },
		      )->pack();
    $errorwin->waitWindow();
}

sub printlog {
    my $line = shift;
    $logwin->configure(-state=>'normal');
    $logwin->insert('end',$line);
    $logwin->yview(moveto=>1);
    $logwin->configure(-state=>'disabled');
    $logwin->update();
    if ($logging == 1) {
	open(LOGFILE,">>$logfile") or die "Logfile problem\n";
	print LOGFILE $line;
    }
    close LOGFILE;
}

sub logparam {
    &printlog("Bin size = $binsize\n");

    if ($smoothtype == 0) {
	&printlog("No data smoothing selected\n");
    }
    elsif ($smoothtype == 1) {
	&printlog("Moving window average smoothing selected\n");
	&printlog("Smoothing window = $mawsmoothwin");
    }
    elsif ($smoothtype == 2) {
	&printlog("Binomial smoothing selected\n");
	&printlog("Smoothing window = $binsmoothwin");
    }
    
    if ($faketype == 0) {
	&printlog("Peak model = Right-side mirroring\n");
    }
    elsif ($faketype == 1) {
	&printlog("Peak model = Normal curve\n");
    }
}

sub selectlogfile {
    my $file = $mw->getSaveFile();
    if (defined $file and $file ne '') {
	$logfile_e->delete(0, 'end');
	$logfile_e->insert(0, $file);
	$logfile_e->xview('end');
    }
}

sub logstatus {
    if ($logging == 0) {
	$logfile_b->configure(-state=>'disabled');
	$logfile_e->configure(-state=>'disabled',
			      -background=>'grey',
			     );
	
    }
    else {
	$logfile_b->configure(-state=>'normal');
	$logfile_e->configure(-state=>'normal',
			      -background=>'white',
			      );
    }
}

sub selectsinglefile {
    my $file = $mw->getOpenFile();
    if (defined $file and $file ne '') {
	$single_e->delete(0, 'end');
	$single_e->insert(0, $file);
	$single_e->xview('end');
    }
}

sub selectpclfile {
    my $file = $mw->getOpenFile();
    if (defined $file and $file ne '') {
	$pclfile_e->delete(0, 'end');
	$pclfile_e->insert(0, $file);
	$pclfile_e->xview('end');
    }
}

sub selectdir {
    use Tk::FileSelect;
    use Cwd;
    my $dirref = $mw->FileSelect(-verify=> [qw/-d/] );
    my $dir = $dirref->Show;
    if (defined $dir and $dir ne '') {
	$batch_e->delete(0, 'end');
	$batch_e->insert(0, $dir);
	$batch_e->xview('end');
    }
}

sub fileopts {
    if ($indata eq 'single') {
	$single_e->configure(-state=>'normal',
			     -background=>'white',
			     );
	$single_b->configure(-state=>'normal');
	$batch_e->configure(-state=>'disabled',
			    -background=>'grey',
			    );
	$batch_b->configure(-state=>'disabled');
	$pclfile_e->configure(-state=>'disabled',
			    -background=>'grey',
			    );
	$pclfile_b->configure(-state=>'disabled');
	$inext_e->configure(-state=>'disabled',
			    -background=>'grey',
			    );
	$EPP_ch->configure(-state=>'disabled');
	$EPP_sc->configure(-state=>'disabled');
    }
    elsif ($indata eq 'batch') {
	$single_e->configure(-state=>'disabled',
			     -background=>'grey',
			     );
	$single_b->configure(-state=>'disabled');
	$batch_e->configure(-state=>'normal',
			    -background=>'white',
			    );
	$batch_b->configure(-state=>'normal');
	$pclfile_e->configure(-state=>'disabled',
			     -background=>'grey',
			     );
	$pclfile_b->configure(-state=>'disabled');
	$inext_e->configure(-state=>'normal',
			    -background=>'white',
			    );
	$EPP_ch->configure(-state=>'disabled');
	$EPP_sc->configure(-state=>'disabled');
    }
    elsif ($indata eq 'pclfile') {
	$single_e->configure(-state=>'disabled',
			     -background=>'grey',
			     );
	$single_b->configure(-state=>'disabled');
	$batch_e->configure(-state=>'disabled',
			    -background=>'grey',
			    );
	$batch_b->configure(-state=>'disabled');
	$pclfile_e->configure(-state=>'normal',
			     -background=>'white',
			     );
	$pclfile_b->configure(-state=>'normal');
	$inext_e->configure(-state=>'disabled',
			    -background=>'grey',
			    );
	$EPP_ch->configure(-state=>'normal');
	$EPP_sc->configure(-state=>'normal');
    }
}

sub smoothopts {
    if ($smoothtype == 0) {
	$smoothwin = 1;
	$mawsmoothwin = 0;
	$binsmoothwin = 0;
	foreach (keys %maw) {
	    $maw{$_}->configure(-state=>'disabled');
	}
	foreach (keys %bin) {
	    $bin{$_}->configure(-state=>'disabled');
	}
    }
    elsif ($smoothtype == 1) {
	$binsmoothwin = 0;
	$mawsmoothwin = 3;
	foreach (keys %maw) {
	    $maw{$_}->configure(-state=>'normal');
	}
	foreach (keys %bin) {
	    $bin{$_}->configure(-state=>'disabled');
	}
    }
    elsif ($smoothtype == 2) {
	$mawsmoothwin = 0;
	$binsmoothwin = 3;
	foreach (keys %maw) {
	    $maw{$_}->configure(-state=>'disabled');
	}
	foreach (keys %bin) {
	    $bin{$_}->configure(-state=>'normal');
	}
    }
}

sub gackmain {
    $rungack_b->configure(-state=> 'disabled');

    if (!$single_file && !$batch_dir && !$pcl_file) {
	&errormsg('No input file/directory specified');
	$rungack_b->configure(-state=> 'normal');
	return;
    }

    if ( ($indata eq 'single' && !$single_file) || ($indata eq 'batch' && !$batch_dir) || ($indata eq 'pclfile' && !$pcl_file) ) {
	&errormsg('Improper input file/directory selection');
	$rungack_b->configure(-state=> 'normal');
	return;
    }


    if ($logging == 1 && !$logfile) {
	&errormsg('Logging enabled, but no logfile specified');
	$rungack_b->configure(-state=> 'normal');
	return;
    }

    if ($smoothtype == 0) {
	$smoothwin = 1;
    }
    elsif ($smoothtype == 1) {
	$smoothwin = $mawsmoothwin;
    }
    elsif ($smoothtype == 2) {
	$smoothwin = $binsmoothwin;
    }
    else {
	&errormsg('Smoothing type not specified');
	$rungack_b->configure(-state=> 'normal');
	return;
    }

    unless ($outext) {
	&errormsg('Output file extension not specified');
	$rungack_b->configure(-state=> 'normal');
	return;
    }

# SINGLE FILE #
    if ($indata eq 'single') {
	; # removed 8/2/02
    }

# BATCH PROCESS DIRECTORY #
    elsif ($indata eq 'batch') {
	; # removed 8/2/02
    }

# PCL FILE #
    elsif ($indata eq 'pclfile') {
	my $infile = $pcl_file;
	use File::Basename;
	my $outdir = dirname($infile);
	chdir $outdir;

	my $time = localtime();

	&printlog ("$time\n");
	&printlog ("Opening $infile\n\n");
	unless (-e $infile) {
	    &printlog("Could not find input file $infile\n");
	    die;
	}
	
	open(INFILE,$infile);
	%datarefs = &read_multi_data(*INFILE);
	close INFILE;

#	%optcut = ();
#	%checkcut = ();

	my @EPP_testvals = (0.5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100);
	my %EPP_cutoffs = ();

	my $EPPoutfile = 'EPP.val';
	if (-e $EPPoutfile) {
	    $EPPoutfile = $mw->getSaveFile(-initialfile=>'EPP.val',
					    -title=>'EPP output file',
					    );
	}

	open(EPPOUT,">$EPPoutfile") or die "EPP out problem\n";
	print EPPOUT "Dataset";
	foreach (@EPP_testvals) {
	    print EPPOUT "\t$_";
	}
	print EPPOUT "\n";
	close EPPOUT;

	foreach $key (sort keys %datarefs) {
	    next if $key eq 'NAME_COLUMN_VALUE';
	    next if $key eq 'ID_COLUMN_VALUE';
	    next if $key eq $datarefs{'NAME_COLUMN_VALUE'};
	    next if $key eq $datarefs{'ID_COLUMN_VALUE'};
	    next if $key =~ /GWEIGHT$/i;
	    next if $key =~ /GID$/i;
	    next if $key eq 'name';

	    &printlog("Processing $key\n");
	    my @logdata = @{$datarefs{$key}};

	    ($min,$max) = &getminmax(@logdata);
	    ($min,$max) = &defineborders($min,$max,$binsize);
	    my %datahist = &makehist($min,$max,$binsize,@logdata);
	    my @data = %datahist;
	    my @smoothdata = &get_smoothed_max($smoothtype,$smoothwin,$binsize,@data); # bin of the peak
	    my $mu = shift @smoothdata;
	    my %smoothdata = @smoothdata;
	    my %fakedata = ();
	    if ($smoothtype == 0) {
		%fakedata = &makefake($faketype,$mu,%datahist);
	    }
	    elsif ($smoothtype > 0) {
		%fakedata = &makefake($faketype,$mu,%smoothdata);
	    }
#	    my %diff = &calcdiff(\%datahist,\%fakedata);
#	    my @diff = %diff;
#	    my @smoothdiff = &get_smoothed_max($smoothtype,$smoothwin,$binsize,@diff);
#	    my $optcut = shift @smoothdiff;
#	    my %smoothdiff = @smoothdiff;

	    # EPP: Estimated Probabiliy of Presence
	    %EPP = &calcEPP(\%datahist,\%fakedata);


# Graphs of fitted distributions
#	    my $graphout = 1;
	    if ($graphout) {
		my $graph = GD::Graph::lines->new(600,300);
		my @xval;
		my @rawval;
		my @fakeval;
		my @eppval;
		my $ymax = 1;
		my $bin = 0;
		foreach (sort {$a<=>$b} keys %EPP) {
		    push @xval, $_;
		    push @rawval, $datahist{$_};
		    push @fakeval, $fakedata{$_};
		    push @eppval, $EPP{$_};
		    $ymax = $datahist{$_} if $datahist{$_} > $ymax;
		    $ymax = $fakedata{$_} if $fakedata{$_} > $ymax;
		    $ymax = $EPP{$_} if $EPP{$_} > $ymax;
		    $bin++;
		}
		my @legend = ('Data', 'Estimated Present Genes', 'EPP');
		$graph->set_legend(@legend);
		$ymax += 100 if $ymax % 50 < 50;
		$ymax = nearest(100,$ymax);
		my $xskip = int($bin/10);
		$graph->set(
			    y_max_value => $ymax,
			    x_label_skip => $xskip,
			    legend_placement => 'RC',
			    );
		my @datagraph = ([@xval], [@rawval], [@fakeval], [@eppval]);
		my $gd = $graph->plot(\@datagraph);

		my $graphfile = $key . '.jpg';
		open(GRAPH,">$graphfile") or die $!;
		binmode GRAPH;
		print GRAPH $gd->jpeg(100);
		close GRAPH;
	    }

# cutoff calculations
	    foreach (@EPP_testvals) {
		$EPP_cutoffs{$_} = &calc_EPPbound($_,%EPP);
	    }

	    open(EPPOUT,">>$EPPoutfile") or die "EPP out problem\n";
	    print EPPOUT "$key";
	    foreach (sort {$a<=>$b} keys %EPP_cutoffs) {
		print EPPOUT "\t$EPP_cutoffs{$_}";
	    }
	    print EPPOUT "\n";
	    close EPPOUT;
	    
	    my $EPPcutval = 0.5;
	    my $EPP_bound = &calc_EPPbound($EPPcutval,%EPP);
	    my $diff_bound = ($mu + $EPP_bound) / 2;
	    my %boundsmoothdiff = ();
	    foreach (sort {$a<=>$b} keys %smoothdiff) {
		$boundsmoothdiff{$_} = $smoothdiff{$_} if $_ < $diff_bound;
	    }
	    my @boundcutdata = &get_smoothed_max($smoothtype,$smoothwin,$binsize,%boundsmoothdiff);
	    my $boundcut = shift @boundcutdata;
# end of cutoff calculations
	    
	    if ($rawdataout) {
		my $inprefix = $key;
		my $histoutfile = $inprefix . '.out';
		
		$overwritecheck = 1;
		&tk_writefile(*OUTFILE,'output',$histoutfile);
		if ($overwritecheck == 1) {
		    open(OUTFILE,">$histoutfile") or &errormsg("Couldn't open $histoutfile");
#		    print OUTFILE "Overall difference peak = $optcut\n";
#		    print OUTFILE "EPP boundary = $EPP_bound\n";
#		    print OUTFILE "Bounded Difference Peak = $boundcut\n";
#		    print OUTFILE "EPP \= Estimated Probability of Presence\n";
		    if ($smoothtype > 0) {
			print OUTFILE "Bin\tFrequency\tSmoothedFreq\tFakePeak\tDiff\tSmoothedDiff\t\%EPP\n";
		    }
#		    else { print OUTFILE "Bin\tFrequency\tFakePeak\tDifference\t\%EPP\n"; }
		    else { print OUTFILE "Bin\tFrequency\tFakePeak\t\%EPP\n"; }
		    foreach $val ( sort {$a<=>$b} keys %datahist ) {
			if ($smoothtype > 0) {
			    print OUTFILE "$val\t$datahist{$val}\t";
			    if (exists $smoothdata{$val}) {
				print OUTFILE "$smoothdata{$val}\t";
			    }
			    else { print OUTFILE "\t"; }
			    if (exists $fakedata{$val}) {
				print OUTFILE "$fakedata{$val}\t";
			    }
			    else { print OUTFILE "\t"; }
#			    if (exists $diff{$val}) {
#				print OUTFILE "$diff{$val}\t";
#			    }
#			    else { print OUTFILE "\t"; }
			    if (exists $smoothdiff{$val}) {
				print OUTFILE "$smoothdiff{$val}\t";
			    }
			    else { print OUTFILE "\t"; }
			    if (exists $EPP{$val}) {
				print OUTFILE "$EPP{$val}\n";
			    }
			    else { print OUTFILE "\n"; }
			}
			else {
			    print OUTFILE "$val\t$datahist{$val}\t$fakedata{$val}\t";
			    print OUTFILE "$EPP{$val}\n"; 
			}
		    }
		    close OUTFILE;
		}
		else {
		    &printlog("File overwrite canceled: $histoutfile\n\n");
		    &errormsg("$histoutfile not overwritten");
		    $rungack_b->configure(-state=> 'normal');
		    next;
		}
		&printlog("Data written to $histoutfile\n\n");
		$rungack_b->configure(-state=> 'normal');
	    }
	}

	if ($EPPout) {
	    # create EPP-based bin output
	    &printlog("Calculating and processing EPP scaled output\n");
	    &makeEPPout($EPPoutfile,$infile);
	}
	if ($binout) {
	    &printlog("Generating binary output\n");
	    &makebinout($EPPoutfile,$infile);
	}
	if ($triout) {
	    &printlog("Generating trinary output\n");
	    &maketriout($EPPoutfile,$infile);
	}

	unlink($EPPoutfile) unless ($EPPout);
	&printlog("Processing completed for all datasets in $infile\n");
	$mw->bell;
	$rungack_b->configure(-state=> 'normal');
    }
}

sub maketriout {
    my $EPPoutfile = shift;
    my $infile = shift;
    open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n";
    chomp($EPPbins=<EPPVALS>);
    my @EPPbin = split(/\t/,$EPPbins);
    my %EPPvals = ();
    
# assign EPP bins to a range from lowbound to highbound - linear scale
    
    my $cutoffcol;
    for (my $c = 0; $c <= $#EPPbin; $c++) {
	next if $EPPbin[$c] =~ /[a-zA-Z]/;
	$EPPbin[$c] = 0 if $EPPbin[$c] == 0.5;
	if ($EPPbin[$c] == $trioutcut1) {
	    $cutoffcol1 = $c;
	}
	if ($EPPbin[$c] == $trioutcut2) {
	    $cutoffcol2 = $c;
	}
    }
    
    if ($cutoffcol1 > $cutoffcol1) { # switch so that col1 < col2
	my $temp = $cutoffcol1;
	$cutoffcol1 = $cutoffcol2;
	$cutoffcol2 = $temp;
    }

    while(<EPPVALS>) {
	chomp;
	my @line = split(/\t/);
	$_ = shift @line;
	$_ =~ /(c\d+)_/;
	my $key = $1;
	$EPPvals{$key}{'low'} = $line[$cutoffcol1-1];
	$EPPvals{$key}{'high'} = $line[$cutoffcol2-1];
    }
    close EPPVALS;
 
    open(INFILE,$infile) or die "Infile problem with $infile\n";
    my $gckoutfile = basename($infile);
    $gckoutfile =~ s/\.\w{3}$//;
    $gckoutfile .= '.tgk';
    
    if (-e $gckoutfile) {
	$gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile,
				       -title=>'New PCL File',
				       );
    }
    open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n";
    chomp($header = <INFILE>);
    print NEWGCKFILE "$header\n";

    @headers = split(/\t/,$header);
    
    my $gweight = 0;
    $gweight++ if $header =~ /GWEIGHT/i;
    my $gid = 0;
    $gid++ if $header =~ /^GID/i;
    
    my %headercol = ();
    for (my $d = 0; $d <= $#headers; $d++) {
	my $colnum = $d + 1;
	my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum;
	$headercol{$key} = $headers[$d];
    }
        
    while(<INFILE>) {
	if (/^EWEIGHT/) {
	    print NEWGCKFILE $_;
	    next;
	}
	if (/^AID/) {
	    print NEWGCKFILE $_;
	    next;
	}
	chomp;
	my @line = split(/\t/);
	print NEWGCKFILE "$line[0]";
	for (my $f = 1; $f <= 1+$gweight+$gid; $f++) {
	    print NEWGCKFILE "\t$line[$f]";
	}
	
	for (my $e = $gweight+$gid+2; $e <= $#line; $e++) {
	    if ($line[$e] eq '') {
		print NEWGCKFILE "\t";
		next;
	    }
	    my $colnum = $e + 1;
	    my $colref = 'c' . 0 x (3-length($colnum)) . $colnum;
	    my $binval;

	    if ($line[$e] >= $EPPvals{$colref}{'high'} ) {
		$binval = 1;
	    }
	    elsif ($line[$e] < $EPPvals{$colref}{'low'} ) {
		$binval = -1;
	    }
	    elsif ($line[$e] >= $EPPvals{$colref}{'low'} &&
		   $line[$e] < $EPPvals{$colref}{'high'}) {
		$binval = 0;
	    }
	    else { $binval = 'problem'; }
	    print NEWGCKFILE "\t$binval";
	}
	print NEWGCKFILE "\n";
    }
    close NEWGCKFILE;
    close INFILE;

}


sub makebinout {
    my $EPPoutfile = shift;
    my $infile = shift;
    open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n";
    chomp($EPPbins=<EPPVALS>);
    my @EPPbin = split(/\t/,$EPPbins);
    my %EPPvals = ();
    
# assign EPP bins to a range from lowbound to highbound - linear scale
    
    my $cutoffcol;
    for (my $c = 0; $c <= $#EPPbin; $c++) {
	next if $EPPbin[$c] =~ /[a-zA-Z]/;
	$EPPbin[$c] = 0 if $EPPbin[$c] == 0.5;
	if ($EPPbin[$c] == $binoutcut) {
	    $cutoffcol = $c;
	    last;
	}
    }

    while(<EPPVALS>) {
	chomp;
	my @line = split(/\t/);
	$_ = shift @line;
	$_ =~ /(c\d+)_/;
	my $key = $1;
	$EPPvals{$key} = $line[$cutoffcol-1];
    }
    close EPPVALS;

    open(INFILE,$infile) or die "Infile problem with $infile\n";
    my $gckoutfile = basename($infile);
    $gckoutfile =~ s/\.\w{3}$//;
    $gckoutfile .= '.bgk';
    
    if (-e $gckoutfile) {
	$gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile,
				       -title=>'New PCL File',
				       );
    }
    open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n";
    chomp($header = <INFILE>);
    print NEWGCKFILE "$header\n";

    @headers = split(/\t/,$header);
    
    my $gweight = 0;
    $gweight++ if $header =~ /GWEIGHT/i;
    my $gid = 0;
    $gid++ if $header =~ /^GID/i;
    
    my %headercol = ();
    for (my $d = 0; $d <= $#headers; $d++) {
	my $colnum = $d + 1;
	my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum;
	$headercol{$key} = $headers[$d];
    }
        
    while(<INFILE>) {
	if (/^EWEIGHT/) {
	    print NEWGCKFILE $_;
	    next;
	}
	if (/^AID/) {
	    print NEWGCKFILE $_;
	    next;
	}
	chomp;
	my @line = split(/\t/);
	print NEWGCKFILE "$line[0]";
	for (my $f = 1; $f <= 1+$gweight+$gid; $f++) {
	    print NEWGCKFILE "\t$line[$f]";
	}
	
	for (my $e = $gweight+$gid+2; $e <= $#line; $e++) {
	    if ($line[$e] eq '') {
		print NEWGCKFILE "\t";
		next;
	    }
	    my $colnum = $e + 1;
	    my $colref = 'c' . 0 x (3-length($colnum)) . $colnum;
	    my $binval;
	    if ($line[$e] >= $EPPvals{$colref}) {
		$binval = 1;
	    }
	    elsif ($line[$e] < $EPPvals{$colref}) {
		$binval = 0;
	    }
	    else { $binval = 'problem'; }
	    print NEWGCKFILE "\t$binval";
	}
	print NEWGCKFILE "\n";
    }
    close NEWGCKFILE;
    close INFILE;

}

sub makeEPPout {
    my $EPPoutfile = shift;
    my $infile = shift;

    open(EPPVALS,$EPPoutfile) or die "Couldn't open EPPout\n";
    chomp($EPPbins=<EPPVALS>);
    my @EPPbin = split(/\t/,$EPPbins);
    my %EPPvals = ();
    shift @EPPbin;
    
# assign EPP bins to a range from lowbound to highbound - linear scale
    my $sensi = $EPPsensi / 100;
    my $lowbound = 0 - $sensi;
    my $highbound = $lowbound + 1;
    
    for (my $c = 0; $c <= $#EPPbin; $c++) {
	$EPPbin[$c] = 0 if $EPPbin[$c] == 0.5;
	$EPPbin[$c] = $lowbound + ( ($EPPbin[$c] / 100) * ( $highbound - $lowbound ) );
    }
    
    while(<EPPVALS>) {
	chomp;
	my @line = split(/\t/);
	$_ = shift @line;
	$_ =~ /(c\d+)_/;
	my $key = $1;
	for (my $c = 0; $c <= $#line; $c++) {
	    $EPPvals{$key}{$EPPbin[$c]} = $line[$c];
	}
    }
    close EPPVALS;
   
    open(INFILE,$infile) or die "Infile problem with $infile\n";
    my $gckoutfile = basename($infile);
    $gckoutfile =~ s/\.\w{3}$//;
    $gckoutfile .= '.gck';
    
    if (-e $gckoutfile) {
	$gckoutfile = $mw->getSaveFile(-initialfile=> $gckoutfile,
				       -title=>'New PCL File',
				       );
    }
    open(NEWGCKFILE,">$gckoutfile") or die "Can't open output gck file\n";
    chomp($header = <INFILE>);
    print NEWGCKFILE "$header\n";

    @headers = split(/\t/,$header);
    
    my $gweight = 0;
    $gweight++ if $header =~ /GWEIGHT/i;
    my $gid = 0;
    $gid++ if $header =~ /^GID/i;
    
    my %headercol = ();
    for (my $d = 0; $d <= $#headers; $d++) {
	my $colnum = $d + 1;
	my $key = 'c' . 0 x (3 - length($colnum) ) . $colnum;
	$headercol{$key} = $headers[$d];
    }
    
    while(<INFILE>) {
	if (/^EWEIGHT/) {
	    print NEWGCKFILE $_;
	    next;
	}
	if (/^AID/) {
	    print NEWGCKFILE $_;
	    next;
	}
	chomp;
	my @line = split(/\t/);
	print NEWGCKFILE "$line[0]";
	for (my $f = 1; $f <= 1+$gweight+$gid; $f++) {
	    print NEWGCKFILE "\t$line[$f]";
	}
	
	for (my $e = $gweight+$gid+2; $e <= $#line; $e++) {
	    my $binval = $highbound;
	    if (!$line[$e]) {
		print NEWGCKFILE "\t";
		next;
	    }
	    my $colnum = $e + 1;
	    my $colref = 'c' . 0 x (3-length($colnum)) . $colnum;
	    foreach $val (sort {$b<=>$a} keys %{$EPPvals{$colref}} ) {
		if ($line[$e] > $EPPvals{$colref}{$val}) {
		    last;
		}
		else {
		    $binval = $val;
		    next;
		}
	    }
	    print NEWGCKFILE "\t$binval";
	}
	print NEWGCKFILE "\n";
    }
    close NEWGCKFILE;
    close INFILE;
}


sub check_for_falsecut {
    my $cut = shift;
    my $diffref = shift;
    my $check = 0;
    foreach $key (sort {$a<=>$b} keys %{$diffref}) {
	if ($key < $cut && $$diffref{$key} < 0) {
	    $check++;
	}
    }
    return $check;
}

sub calc_EPPbound {
    my $bound = shift;
    my %EPPdata = @_;
    my $EPPcut = '';
    my $interpolated_EPP = '';

    foreach $key (sort {$a<=>$b} keys %EPPdata) {
	if ($EPPdata{$key} < $bound) {
	    $EPPcut = $key;
	}
	else {
# v3.627 fix
	    if ($EPPcut eq '') {
		foreach $key (sort {$a<=>$b} keys %EPPdata) {
		    $interpolated_EPP = $key;
		    last;
		};
		last;
	    }
#

	    my $x1 = $EPPcut;
	    my $y1 = $EPPdata{$EPPcut};
	    my $x2 = $key;
	    my $y2 = $EPPdata{$key};

	    $interpolated_EPP = &calc_interpolate($bound,$x1,$y1,$x2,$y2);

	    last;
	}

    }
    return $interpolated_EPP;
#    return $EPPcut;
}

sub calcEPP {
    my $dataref = shift;
    my $fakeref = shift;
    my %EPP = ();
    foreach $key (sort {$a<=>$b} keys %{$dataref}) {
	if ( !(exists $$dataref{$key}) || !(exists $$fakeref{$key}) ) {
	    next;
	}
	if ($$dataref{$key} == 0) {
	    $EPP{$key} = 0;
	}
	else { 
	    my $EPP = $$fakeref{$key} / $$dataref{$key};
	    $EPP{$key} = 100 * $EPP;
	}
    }
    return %EPP;
}

sub calcdiff {
    my $dataref = shift;
    my $fakeref = shift;
    my %diff = ();
    foreach $key (sort {$a<=>$b} keys %{$dataref}) {
	if ( !(exists $$dataref{$key}) || !(exists $$fakeref{$key}) ) {
	    next;
	}
	my $difference = $$dataref{$key} - $$fakeref{$key};
	$diff{$key} = $difference;
    }
    return %diff;
}

sub makefake {
    my $type = shift;
    my $mid = shift;
    my %data = @_;
    my %fakedata = ();

    if ($type == 0) {  # right-side mirroring
	foreach $key (sort {$a<=>$b} keys %data) {
	    if ($key > $mid) {
		$fakedata{$key} = $data{$key};
		$negkey = $mid - ($key-$mid);
		$fakedata{$negkey} = $data{$key};
	    }
	    elsif ($key == $mid) {
		$fakedata{$key} = $data{$key};
	    }
	}
	foreach $key (keys %data) {
	    $fakedata{$key} = 0 unless $fakedata{$key};
	}
	return %fakedata;
    }
    elsif ($type == 1) {
	# normal distribution mapping
	my $halfmax = $data{$mid} / 2;
	my $mu = '';
	my $stdev = '';
	($mu,$stdev) = &get_interpolated($halfmax,%data);

	# this max factor normalizes the normal curve's height
        # to match the height of the real dataset
	my $largestval = 0;
	foreach $key (sort {$a<=>$b} keys %data) {
	    $fakedata{$key} = &calc_norm_dens_fx($key,$mu,$stdev,1);
	    if ($fakedata{$key} > $largestval) {
		$largestval = $fakedata{$key};
	    }
	}
#	print "mid: $mid\n";
	my $maxfactor = 0;
	$maxfactor = $data{$mid} / $largestval;
#	print "datamid: $data{$mid} largest: $largestval maxfact: $maxfactor\n";
	foreach $key (keys %data) {
	    $fakedata{$key} *= $maxfactor;
	}

	return %fakedata;
    }
}

sub calc_norm_dens_fx {
    my $x = shift;
    my $mu = shift;
    my $sigma = shift;
    my $max_factor = shift;

    my $term1 = 0 - ((($x - $mu)**2 )/ ( 2 * ($sigma)**2));
    my $term2 = $sigma * (sqrt(2 * 3.141592654) );
    my $calc_val = $max_factor * (exp($term1)) / $term2;
    return $calc_val;
}

sub get_interpolated { # linear interpolation of the 50% max values
    my $halfval = shift;
    my %data = @_;
    my @binvals = ();
    my $lastval = -1;
    my $lastbin = -1;

    foreach $bin (sort {$a<=>$b} keys %data) {
	if ($lastval == -1) {
	    $lastval = $data{$bin};
	    $lastbin = $bin;
	    next;
	}
	if ($halfval <= $data{$bin} && $halfval >= $lastval) {
	    my $intbin = &calc_interpolate($halfval,$lastbin,$lastval,$bin,$data{$bin});
	    push @binvals, $intbin;
	    $lastval = $data{$bin};
	    $lastbin = $bin;
	    next;
	}
	if ($halfval >= $data{$bin} && $halfval <= $lastval) {
	    my $intbin = &calc_interpolate($halfval,$lastbin,$lastval,$bin,$data{$bin});
	    push @binvals, $intbin;
	    $lastval = $data{$bin};
	    $lastbin = $bin;
	    next;
	}
	$lastval = $data{$bin};
	$lastbin = $bin;
    }

    my $min = 0;
    my $max = 0;
    ($min,$max) = &getminmax(@binvals);
    my $mid = ($min+$max)/2;
    my $sd = &calc_stdev($min,$max,$mid,$halfval);
    return ($mid,$sd);
}

sub calc_stdev {
    my $min = shift;
    my $max = shift;
    my $mid = shift;
    my $y_50 = shift;

    my $norm_x_at_50pct = sqrt( -2 * log(0.5) );
    my $sd = ($mid-$min) / $norm_x_at_50pct;
    return $sd;
}

sub calc_interpolate {
    my $y_int = shift;
    my $x1 = shift;
    my $y1 = shift;
    my $x2 = shift;
    my $y2 = shift;

    if ($y2 == $y1) { # average x values if the y is equal
	return ($x1 + $x2)/2;
    }
    my $x_int = ( (($y_int-$y1)/($y2-$y1))*($x2-$x1) ) + $x1;
    return $x_int;
}

sub get_smoothed_max {
    my $smoothtype = shift;
    my $smoothwin = shift;
    my $binsize = shift;
    my %data = @_;
    my @bin = ();
    my @y_val = ();
    my $largest_avg = 0;
    my $largest_bin = 0;
    my @smootheddata = ();

    foreach $entry (sort {$a<=>$b} keys %data) {
	push @bin, $entry;
	push @y_val, $data{$entry};
    }
    if ($smoothtype == 0) {
	my @multiple_peaks = (); # stores bin numbers
	for (my $d = 0; $d <= $#y_val; $d++) {
	    push @smootheddata, $bin[$d];
	    push @smootheddata, $y_val[$d];
	    if ($y_val[$d] > $largest_avg) {
		$largest_avg = $y_val[$d];
		$largest_bin = $bin[$d];
		@multiple_peaks = ();
	    }
	    elsif ($y_val[$d] == $largest_avg) {
		push @multiple_peaks,$bin[$d];
	    }
	}
	if (@multiple_peaks) {
	    my $sum = 0;
	    &printlog("Multiple peaks detected\n");
	    foreach $val (@multiple_peaks) {
		$sum += $val;
	    }
	    $largest_bin = $sum / ($#multiple_peaks+1); #finds avg bin of mult peak
	}
	$largest_bin = &round_to_lowerbin($largest_bin, $binsize);
	return ($largest_bin, @smootheddata);
    }

    elsif ($smoothtype == 1) {
	my $window = ($smoothwin - 1) / 2; # number of values on each side of main value to eval
	my $maxbin = 0;
	for (my $c = $window; $c <= $#y_val - $window; $c++) {
	    my $avg = 0;
	    for (my $d = $c-$window; $d <= $c+$window; $d++) {
		$avg += $y_val[$d];
	    }
	    $avg /= $smoothwin;
	    push @smootheddata, $bin[$c];
	    push @smootheddata, $avg;
	    if ($avg > $largest_avg) {
		$largest_bin = $c;
		$largest_avg = $avg;
	    }
	}
	return ($bin[$largest_bin], @smootheddata);
    }

    elsif ($smoothtype == 2) {
	die "Binomial smoothing not yet implemented\n";
    }

    else { die "No smoothing type specified\n"; }
}
    
sub makehist {
    my $min = shift;
    my $max = shift;
    my $binsize = shift;
    my @data = @_;
    my %datahash = ();
    $min /= $binsize; # these bizarre calculations are used
    $max /= $binsize; # to account for rounding errors
    for (my $c = int($min); $c < int($max); $c++) {
	my $bin = $c * $binsize;
	$datahash{$bin} = 0;
    }
    foreach $val (@data) {
	$newval = &round_to_lowerbin($val,$binsize);
#	print "$val\t$newval\n";
	$datahash{$newval}++;
    }
    return %datahash;
}

sub round_to_lowerbin {
    my $val = shift;
    my $bin = shift;
    my $newval = $val/$bin;

    $newval += 0.0001; # this factor accounts for rounding errors in
                       # val/bin division

    $newval = floor($newval);
    $newval *= $bin;
#    print "val: $val\tnewval: $newval\n";

    return $newval;
}

sub defineborders {
    my $min = shift;
    my $max = shift;
    my $bin = shift;
    my $minval = int($min/$bin);
    $minval--;
    my $newmin = $minval * $bin;
    my $maxval = int($max/$bin);
    $maxval++;
    my $newmax = $maxval * $bin;
    return ($newmin,$newmax);
}

sub getminmax {
    my @data = @_;
    my $min = $data[0];
    my $max = $data[0];
    foreach $val (@data) {
	$min = $val if $val < $min;
	$max = $val if $val > $max;
    }
    return ($min,$max);
}

sub read_data {
    my $file = shift;
    my @data = ();
    while(<$file>) {
	next if /SCUTID/;
	next if /NAME/;
	next if /LOG_RAT2N_MEDIAN/;
	next if /LOG_RAT2N_MEAN/;
	chomp;
	$_ =~ s/\r//; # deal with DOS text
	my @line = split(/\t/);
	next if !$line[1];
	push @data, $line[1];
    }
    return @data;
}

sub read_multi_data {
    my $file = shift;
    my %data = ();
    chomp($_=<$file>);
    my @header = split(/\t/);

    my $gweight = 0;
    $gweight++ if $_ =~ /GWEIGHT/i;

    my $gid = 0;
    $gid++ if $header[0] =~ /^GID$/i;

    $data{'ID_COLUMN_VALUE'} = $header[0+$gid];
    $data{'NAME_COLUMN_VALUE'} = $header[1+$gid];

    # check headers for uniqueness and non-alphanumeric values

    for (my $d = 2+$gweight ; $d <= $#header; $d++) {
	$header[$d] =~ s/ /_/g;
	$header[$d] =~ s/[^0-9a-zA-Z_-]//g;
	my $column = $d+1;  # - 1 - $gweight;
	my $coldigits = '0' x (3 - length($column) ) . $column;
	my $headername = 'c' . $coldigits . '_' . $header[$d];
	$headername = lc($headername);
	$header[$d] = $headername;
    }

    while(<$file>) {
	chomp;
	$_ =~ s/\r//; # deal with DOS text
	my @line = split(/\t/);
	next if $line[0] =~ /^EWEIGHT/i;
	next if $line[0] =~ /^AID/i;
	for (my $c = 0; $c <= $#line; $c++) {
	    next if !$line[$c];
	    push @{$data{$header[$c]}}, $line[$c];
	}
    }

    foreach $val (@header) { # check for empty columns
	unless ($data{$val}) {
	    &printlog("No values detected in column \"$val\"\n");
	    undef($data{$val});
	}
    }
    return %data;
}


sub tk_writefile {
    my $filehandle = $_[0];
    my $filetype = $_[1] if $_[1];
    my $filename = $_[2] if $_[2];

    if (-e $filename) {
	my $overwrite = $mw->Toplevel();
	$overwrite->Label(-text=>"$filename already exists!  Proceed with overwrite\?",
			  )->pack();
	$overwrite->Button(-text=>"OK",
			   -command=> sub { $overwritecheck = 1; $overwrite->destroy() },
			   )->pack(-side=>'left');
	$overwrite->Button(-text=>"Cancel",
			   -command=> sub { $overwritecheck = 0; $overwrite->destroy() },
			   )->pack(-side=>'right');
	$overwrite->waitWindow();
    }
    else { open($filehandle, ">$filename") or die "Couldn't open $filename\n"; }
    $filehandle = '';
    $filetype = '';
    $filename = '';
}
