#!/usr/bin/perl
##################################################
##################################################
##						##	
##		keyword_search.pl		##
##		Kellen Olszewski		##
##		August 2005			##
##						##
##################################################
##################################################
#
#	Scans a spreadsheet (tab-delimited text)
#	for occurrences from a list of search terms.
#
#	FORMAT
#	#perl keyword_search.pl in=INPUT_FILE key=KEY1,KEY2,... col=COL1,COL2,... -w -c > OUTPUT_FILE
#
#	ARGUMENTS
#
#	in:	The input file to search.
#
#	key:	The search keys to scan with.  This should
#		be a comma-separated list of words or parts
#		of words, e.g., "neuron,nerv,gangli".
#
#		NOTE:  Do not use spaces in your search
#		terms!  Everything after the space will
#		be ignored by the program.  In place of
#		a space, use a period.  For example,
#		instead of "all cells" search for "all.cells".
#
#		NOTE:  Unless -w is given, each key is used as a
#		Perl regular expression, so some punctuation
#		characters may give unexpected results.
#		Characters with issues include '[', ']', '-',
#		'+', and '.'.
#
#		WILDCARD MODE:  If the -w flag is set, the 
#		script searches in wildcard mode.  The only
#		accepted wildcard characters are: 
#		'C' (uppercase letters) 
#		'c' (lowercase letters)
#		'n' (digits)
#		'a' (all characters).  
#		No other characters are permitted.
#		For example, the key "cccan" would find
#		"unc-4" and "CCCan" would find "UNC-4".
#		
#	col:	The columns to search in.  Input as a comma-separated list
#		of column numbers (the first column is 1), e.g., "1,3,4".
#
#	w:	Search for WILDCARD keys.  By default this is off.  See
#		WILDCARD MODE above for details on accepted wildcards.
#
#	c:	Combine redundant lines in the input file.  If this flag is set,
#		any lines in the input file that have the same first entry
#		will be combined.  For example, these two lines:
#
#		gen-1	neurons	Expressed in neurons
#		gen-1	muscle	Expressed in muscle
#
#		would be combined into the line:
#
#		gen-1	neurons|muscle	Expressed in neurons|Expressed in muscle
#
#		in the output.  I recommend using this option.
#		
#
#	OUTPUT
#	Outputs to the screen by default.  Redirect to a file
#	using the ">" operator (see above).
#
#	Example output (shown after the echoed command line and the
#	total entry count, which are printed first):
#
#	There are 3 entries that contain the search term
#	171720_x_at	WBGene00013011	active
#	171721_x_at	WBGene00011344	active
#	171722_x_at	WBGene00018934	active 
#
#	There are 2 entries that do not contain the search term
#	171726_x_at	-	not a gene
#	172102_x_at	-	not a gene	
#

# Every variable in this script is declared with 'my', so strict mode is
# safe to enable and guards against typos in later edits.
use strict;

my $input;          # path of the tab-delimited file to search
my @keys;           # raw search keys from key=...
my @columns;        # column specifiers from col=...
my $wildcard = 0;   # 1 when -w was given
my $combine = 0;    # 1 when -c was given
my @entries;        # lines that will actually be scanned

# Parse the arguments
#
# parse_arguments(@args)
#   Turns the raw command-line words into a hash with these keys:
#     input    - value of in=...   (undef when never given)
#     keys     - array ref, value of key=... split on commas
#     columns  - array ref, value of col=... split on commas
#     wildcard - 1 if "-w" was present, otherwise 0
#     combine  - 1 if "-c" was present, otherwise 0
#   When an option is repeated, the last occurrence wins.  Words that are
#   neither a known flag nor a known NAME=VALUE pair are reported on STDERR
#   and otherwise ignored.
sub parse_arguments {
	my @args = @_;
	my %parsed = (
		input    => undef,
		keys     => [],
		columns  => [],
		wildcard => 0,
		combine  => 0,
	);

	foreach my $arg (@args) {
		if ($arg eq "-w") {
			$parsed{wildcard} = 1;
			next;
		}
		if ($arg eq "-c") {
			$parsed{combine} = 1;
			next;
		}

		# Split on the FIRST '=' only, so that values which themselves
		# contain '=' (file names, search keys) are kept intact.
		my ($name, $value) = split(/=/, $arg, 2);
		$name  = "" unless defined $name;
		$value = "" unless defined $value;

		if ($name eq "in") {
			$parsed{input} = $value;
		}
		elsif ($name eq "key") {
			$parsed{keys} = [ split(/,/, $value) ];
		}
		elsif ($name eq "col") {
			$parsed{columns} = [ split(/,/, $value) ];
		}
		else {
			print STDERR "Ignoring unrecognized argument: $arg\n";
		}
	}

	return %parsed;
}

{
	my %parsed = parse_arguments(@ARGV);
	$input    = $parsed{input};
	@keys     = @{ $parsed{keys} };
	@columns  = @{ $parsed{columns} };
	$wildcard = $parsed{wildcard};
	$combine  = $parsed{combine};
}

# Validate the arguments.  Each failure prints a one-line reason on STDERR
# and terminates the script with a non-zero exit status.
if (!defined $input || $input eq "") {
	die "No input file specified.\n";
}
if (scalar(@keys) == 0) {
	die "No search keys entered.\n";
}
if (scalar(@columns) == 0) {
	die "No columns specified.\n";
}

# Columns are 1-based and are later turned into array indices by
# subtracting 1.  A value of 0, a negative number or a non-numeric word
# would become a negative index and silently search from the END of the
# row, so reject anything that is not a positive integer.
foreach my $column (@columns) {
	unless ($column =~ /\A[0-9]+\z/ && $column >= 1) {
		die "Invalid column '$column': columns must be positive integers (1 is the first column).\n";
	}
}

# Get the input
#
# The whole file is read into memory, one array element per line, with the
# line terminators still attached.  The three-argument form of open keeps
# characters in the file name from being interpreted as an open mode, and
# a failure to open is reported instead of silently yielding zero lines.
open(my $in_fh, '<', $input)
	or die "Cannot open input file '$input': $!\n";
my @lines = <$in_fh>;
close($in_fh);

# Combine the list entries
#
# combine_lines(@raw_lines)
#   Merges all lines that share the same first tab-separated field.
#   For every other column the distinct non-empty values are joined with
#   "|" in order of first appearance.  Lines whose first field is empty are
#   dropped.  A key that occurs only once keeps its line unchanged.
#   Returns the merged lines, each terminated by "\n", sorted by key.
#   The input list is not modified.
sub combine_lines {
	my @raw_lines = @_;
	my %merged_line_for;   # key => merged, tab-joined line (no newline)
	my %seen_in;           # key => [ per column: { value => count } ]

	foreach my $raw (@raw_lines) {
		my $line = $raw;
		chomp($line);
		my @fields = split(/\t/, $line);
		my $id = $fields[0];
		next if !defined $id || $id eq "";

		# First occurrence: remember the line verbatim and record its values.
		unless (exists $merged_line_for{$id}) {
			$merged_line_for{$id} = $line;
			foreach my $col (0 .. $#fields) {
				$seen_in{$id}[$col]{ $fields[$col] } = 1;
			}
			next;
		}

		my @merged = split(/\t/, $merged_line_for{$id});
		foreach my $col (0 .. $#fields) {
			my $field = $fields[$col];
			next if $field eq "";
			next if defined $merged[$col] && $field eq $merged[$col];
			# Comparing against the merged cell alone misses a value that
			# reappears after the cell already holds several values
			# ("a|b" followed by "a"), so track every value seen per column.
			next if $seen_in{$id}[$col]{$field}++;

			if (!defined $merged[$col] || $merged[$col] eq "") {
				$merged[$col] = $field;
			}
			else {
				$merged[$col] = $merged[$col] . "|" . $field;
			}
		}
		# Columns that were skipped over stay as empty cells.
		$merged_line_for{$id} = join "\t", map { defined $_ ? $_ : "" } @merged;
	}

	return map { $merged_line_for{$_} . "\n" } sort keys %merged_line_for;
}

if ($combine == 1) {
	@entries = combine_lines(@lines);
}
else {
	@entries = @lines;
}

# Parse the search key
#
# wildcard_to_regex($key)
#   Translates one wildcard key into a regular-expression string:
#     C -> [A-Z]   c -> [a-z]   n -> \d   a -> .
#   Dies with a descriptive message when the key is empty or contains any
#   other character.  Dropping such characters silently would search for a
#   different (possibly empty) pattern than the one the user typed.
sub wildcard_to_regex {
	my ($key) = @_;
	my %class_for = (
		"C" => "[A-Z]",
		"c" => "[a-z]",
		"n" => "\\d",
		"a" => ".",
	);

	if (!defined $key || $key eq "") {
		die "Empty wildcard key.\n";
	}

	my $regex = "";
	foreach my $letter (split(//, $key)) {
		unless (exists $class_for{$letter}) {
			die "Invalid character '$letter' in wildcard key '$key' (allowed: C, c, n, a).\n";
		}
		$regex = $regex . $class_for{$letter};
	}
	return $regex;
}

# In wildcard mode every key is translated; otherwise the keys are used
# as they were typed.
my @search;
if ($wildcard == 1) {
	@search = map { wildcard_to_regex($_) } @keys;
}
else {
	@search = @keys;
}

# Scan the input
#
# entry_matches(\@fields, \@columns, \@patterns)
#   Returns 1 when any of the compiled patterns matches any of the listed
#   columns of one row, otherwise 0.  Columns are 1-based.  A column
#   number below 1 is skipped rather than being allowed to wrap around to
#   the end of the row, and a column beyond the end of the row is treated
#   as an empty cell.
sub entry_matches {
	my ($fields_ref, $columns_ref, $patterns_ref) = @_;

	foreach my $column (@$columns_ref) {
		my $index = $column - 1;
		next if $index < 0;

		my $field = $fields_ref->[$index];
		$field = "" unless defined $field;

		foreach my $pattern (@$patterns_ref) {
			return 1 if $field =~ $pattern;
		}
	}
	return 0;
}

my @match;
my @not_match;
{
	# Compile every key once instead of once per field.  Wrapping the key
	# in (?:...) leaves its meaning unchanged but keeps the pattern text
	# non-empty, so an empty key is never handed to the match operator as
	# an empty pattern.
	my @patterns = map { qr/(?:$_)/ } @search;

	foreach my $entry (@entries) {
		chomp($entry);
		my @fields = split(/\t/, $entry);

		if (entry_matches(\@fields, \@columns, \@patterns)) {
			push @match, "$entry";
		}
		else {
			push @not_match, "$entry";
		}
	}
}

# Print results
#
# build_report(\@args, \@match, \@not_match)
#   Returns the complete report text: the echoed command line, the total
#   number of entries, then the matching entries followed by the
#   non-matching entries, one per line.
sub build_report {
	my ($args_ref, $match_ref, $not_match_ref) = @_;
	my $match_count     = scalar(@$match_ref);
	my $not_match_count = scalar(@$not_match_ref);

	# join() is parenthesised so the newline is not swallowed into the list
	# being joined, which would leave a stray space before it.
	my $report = "Command line:  keyword_search.pl " . join(" ", @$args_ref) . "\n";
	$report .= ($match_count + $not_match_count) . " entries total" . "\n\n";

	$report .= "There are " . $match_count . " entries that contain the search term\n";
	foreach my $out (@$match_ref) {
		$report .= $out . "\n";
	}

	$report .= "\nThere are " . $not_match_count . " entries that do not contain the search term\n";
	foreach my $out (@$not_match_ref) {
		$report .= $out . "\n";
	}

	return $report;
}

print build_report(\@ARGV, \@match, \@not_match);