# find_agps.pl, a program designed to search for proteins # with biased amino acid comp. # # (c) December 2001, Michael Rumsewicz # Authors: Michael Rumsewicz (coding) # Carolyn Schultz (requirements and algorithms) # The programs takes a database of protein sequences downloaded from # the TIGR website, # ftp://ftp.tigr.org/pub/data/a_thaliana/ath1 # The current version of the database is called ATH1.pep # Note that this database is regularly being updated # The program computes the percentage of P, A, S and T in the sequence. # If the percentage is above a pre-defined threshold level, or the sequence # length is between two bounds, the program outputs the sequence information. # Four output files are generated: # ???.long - contains all proteins meeting the threshold criteria # ???.long.50 - abbreviated form of ???.long suitable for sending to the # signalp server # ???.short - contains all proteins meeting the length criteria # ???.long.50 - abbreviated form of ???.short suitable for sending to the # signalp server # The prefix for the output files is a command line argument. No checking # is done to see whether or not the files already exist, SO BE CAREFUL. # Algorithms: # The database has the following form: # >string#string#string\tstring\tstring # PASTGC..... # # >string#string#string\tstring\tstring # MPSTGCP..... # ">" Indicates the name and further information about a protein sequence # The following line/lines are the sequence itself. # Having read the protein information line, the program concatenates the # following lines, removing white space, to get the sequence itself. # The number of P, S, A and T is counted and the precentage of the entire # sequence computed. Appropriate information then goes to the output files. # Programming notes: # Protein name: All characters from > to the first comma # Protein sequence: Can be spread over multiple lines so needs to be # found by concatentating all the lines and removing white # space. # Program structure: # - count_amino : a subroutine to count number of occurrences of # a given letter in a string. # - print_data : a subroutine to output desired information to files. # created to avoid repetitious print related code. # - Main program : Takes two command line arguments, # - protein database # - a prefix for the output file names # ================================================================== # Subroutine: Get the number of occurrences of a particular amino # acid within a sequence sub count_amino { my ($x) = $_[0]; my ($seq) = $_[1]; $tmp = $seq; $num_x = 0; while (index($tmp, $x) >= 0 ) { $num_x++; $in = index($tmp, $x); $tmp=substr($tmp,$in+1); } return $num_x; } # ================================================================== # Subroutine: print the sequence and summary information to two output files # - full information # - suitable for signalp server sub print_data { my ($file1) = $_[0]; my ($file2) = $_[1]; print $file1 ">$name\n$sequence\n"; print $file1 "Length : $len\n"; foreach $key (sort @a) { print $file1 sprintf("Number of %s: %3d, %6.2f\%\n", $key, $num{$key},$per{$key}); } print $file1 sprintf("Total: %6.2f\%\n", $count); print $file1 "\n\n"; # only print out the first 75 chars so as not to hassle PSORT $temp = substr($name,0, 75); print $file2 ">$temp\n"; print $file2 substr($sequence,0,50) . "\n\n"; } # ========================================================================= # Main Program # Go and get the input filename from the command line if ($#ARGV < 1) { print "Incorrect number of command line arguments specified\n"; print "Usage - \n"; print " perl $0 [amino_acid_1 amino_acid_2 .... ] [threshold_value] protein_sequence_file output_file_prefix\n"; print "where\n"; print " amino_acid_1, 2, .... are optional amino acid arguments (one upper case character each)\n"; print " threshold is an optional threshold value (sum of amino_acid percentages)\n"; print " protein_sequence_file is the file of protein sequence data\n"; print " output_file_prefix is a prefix to the two output files generated by $0\n"; print "Exiting\n"; exit; } # Read in command line arguments # Assume that first N-3 arguments are amino acids, given as one letter # (upper case), separated by spaces. # If there are less than 3 command line arguments, use defaul values specified later # Assume that argument N-2 is the threshold percentage # Assume that last two arguments are # - protein sequence file # - prefix for files generated # Get the amino acids requred from the command line if ($#ARGV >= 2) { for ($i = 0; $i <= $#ARGV - 3; $i++) { $amino_acid = $ARGV[$i]; $a[$i] = $amino_acid; print STDERR "Amino acid $i: $a[$i]\n"; } $threshold = $ARGV[$#ARGV - 2]; $num_amino_acids = $#ARGV - 3; } print "Threshold: $threshold\n"; $file = $ARGV[$#ARGV - 1]; print STDERR "Base: $file\n"; open(INPUT, $file) or die "File $file not found - Exiting.\n"; $base = $ARGV[$#ARGV]; print STDERR "Prefix: $base\n"; #exit(); # OUTPUT1 and OUTPUT3 are the full output files # OUTPUT2 and OUTPUT4 are the output files in signalp server format open(OUTPUT1, "> $base\_long.txt"); open(OUTPUT2, "> $base\_long\_50.txt"); open(OUTPUT3, "> $base\_short.txt"); open(OUTPUT4, "> $base\_short\_50.txt"); # Singalp server requires "EUK" on first line and "." on last line # of data files. print OUTPUT2 "EUK\n"; print OUTPUT4 "EUK\n"; # Part 1: Get each of the protein sequences # Variables: # $num_proteins - figures out the total number of proteins in the input stream # @protein_name - array of the names of each protein, 1 .. $num_proteins # @protein_sequence - array of sequences for each protein, 1 .. $num_proteins # # Other information: # - ">" is a marker in first position of a line to indicate that # the protein name follows # - protein sequence could be spread over multiple lines # - white space needs to be removed # Global variables (default values) $short_sequence_length_low = 55; $short_sequence_length_high = 75; # Default values for amino acids and threshold if not specified on command line if ($#ARGV < 2) { $threshold = 50.0; } if ($#ARGV < 3) { @a[0]="T"; @a[1]="P"; @a[2]="A"; @a[3]="S"; $num_amino_acids = 4; } $name = ""; # Go through each line of the database while ($line = ) { chomp($line); # Is the line the name of a new sequence? $isname = index($line,">"); # If it is, then finish the analysis of the previous sequence if ($isname eq 0) { # Save the previous sequence with white space omitted $sequence =~ s/\*//g; $len = length($sequence); # Now see if the seqence matches the criteria # $count is the total percentage of specified amino acids in the sequence # $num{$key} is the number of $key specified amino acids in the sequence # $per{$key} is the percentage of $key specified amino acids in the sequence $count = 0; if (($len > 0) && (substr ($sequence, 0, 1) eq "M")) { foreach $key (@a) { $num{$key} = count_amino($key, $sequence); $per{$key} = $num{$key}*100.0/$len; $count += $per{$key}; } # Two alternative criteria for saving a sequence # 1. Total percentage greater than or equal to threshold # 2. The sequence length is between the two bounds # Note that a sequence may meet both criteria # and hence be placed in both sets of output files if ( ($count >= $threshold) || ( (length($sequence) <= $short_sequence_length_high) && (length($sequence) >= $short_sequence_length_low) ) ) { # If criteria 1 met, save to OUTPUT1 and OUTPUT2 if ($count >= $threshold) { print_data (OUTPUT1, OUTPUT2); } # If criteria 2 met, save to OUTPUT1 and OUTPUT2 if ( (length($sequence) <= $short_sequence_length_high) && (length($sequence) >= $short_sequence_length_low)) { print_data (OUTPUT3, OUTPUT4); } } } # The name of the sequence is everything after the ">" # Clear the sequence in preparation for reading the next one $name = substr($line, 1); $sequence = ""; } else { # Add the next fragment of the sequence to that read in so far $subsequence = $line; $sequence = $sequence . $subsequence; } } # and finish off the last sequence when we reach the end of the file # Basically, repeat the above. $sequence =~ s/\*//g; $len = length($sequence); $count = 0; if (($len > 0) && (substr ($sequence, 0, 1) eq "M")) { foreach $key (@a) { $num{$key} = count_amino($key, $sequence); $per{$key} = $num{$key}*100.0/$len; $count += $per{$key}; } if ( ($count >= $threshold) || ((length($sequence) < $short_sequence_length_high) && (length($sequence) > $short_sequence_length_low))) { if ($count >= $threshold) { print_data (OUTPUT1, OUTPUT2); } if ((length($sequence) <= $short_sequence_length_high) && (length($sequence) >= $short_sequence_length_low)) { print_data (OUTPUT3, OUTPUT4); } } } print OUTPUT2 ".\n"; print OUTPUT4 ".\n"; close (INPUT);