# find_agps.pl, a program designed to search for proteins 
# with biased amino acid comp.
#
# (c) December 2001, Michael Rumsewicz
# Authors: Michael Rumsewicz (coding)
#          Carolyn Schultz (requirements and algorithms)

# The programs takes a database of protein sequences downloaded from 
# the TIGR website,
#   ftp://ftp.tigr.org/pub/data/a_thaliana/ath1  
# The current version of the database is called ATH1.pep
# Note that this database is regularly being updated

# The program computes the percentage of P, A, S and T in the sequence.
# If the percentage is above a pre-defined threshold level, or the sequence
# length is between two bounds, the program outputs the sequence information.

# Four output files are generated:
#    ???.long     - contains all proteins meeting the threshold criteria
#    ???.long.50  - abbreviated form of ???.long suitable for sending to the
#                   signalp server
#    ???.short    - contains all proteins meeting the length criteria
#    ???.long.50  - abbreviated form of ???.short suitable for sending to the
#                   signalp server

# The prefix for the output files is a command line argument. No checking
# is done to see whether or not the files already exist, SO BE CAREFUL.
 
# Algorithms:
# The database has the following form:
#     >string#string#string\tstring\tstring
#     PASTGC.....
#
#     >string#string#string\tstring\tstring
#     MPSTGCP.....

# ">" Indicates the name and further information about a protein sequence
# The following line/lines are the sequence itself. 

# Having read the protein information line, the program concatenates the 
# following lines, removing white space, to get the sequence itself.
# The number of P, S, A and T is counted and the precentage of the entire
# sequence computed. Appropriate information then goes to the output files.

# Programming notes:
#   Protein name: All characters from > to the first comma
#   Protein sequence: Can be spread over multiple lines so needs to be 
#                 found by concatentating all the lines and removing white
#                 space.
#  Program structure:
#  - count_amino : a subroutine to count number of occurrences of 
#    a given letter in a string.
#  - print_data  : a subroutine to output desired information to files. 
#    created to avoid repetitious print related code.
#  - Main program : Takes two command line arguments, 
#                   - protein database
#                   - a prefix for the output file names

# ==================================================================
# Subroutine: Get the number of occurrences of a particular amino 
# acid within a sequence
sub count_amino {
    my ($x) = $_[0];
    my ($seq) = $_[1];

    $tmp = $seq;
    $num_x = 0;
    while (index($tmp, $x) >= 0 ) {
        $num_x++;
        $in = index($tmp, $x);
        $tmp=substr($tmp,$in+1);
    } 

    return $num_x;
}


# ==================================================================
# Subroutine: print the sequence and summary information to two output files
# - full information
# - suitable for signalp server

sub print_data {
    my ($file1) = $_[0];
    my ($file2) = $_[1];

    print $file1 ">$name\n$sequence\n";
    print $file1 "Length     : $len\n";
    foreach $key (sort @a)
    {
        print $file1 sprintf("Number of %s: %3d, %6.2f\%\n", $key, $num{$key},$per{$key});
    }
    print $file1 sprintf("Total:            %6.2f\%\n", $count);
    print $file1 "\n\n";
    # only print out the first 75 chars so as not to hassle PSORT
    $temp = substr($name,0, 75);
    print $file2 ">$temp\n";
    print $file2 substr($sequence,0,50) . "\n\n";
}


# =========================================================================
# Main Program
# Go and get the input filename from the command line

    if ($#ARGV < 1) {
        print "Incorrect number of command line arguments specified\n";
        print "Usage - \n";
        print "    perl $0 [amino_acid_1 amino_acid_2 .... ] [threshold_value] protein_sequence_file output_file_prefix\n";
        print "where\n";
        print "      amino_acid_1, 2, .... are optional amino acid arguments (one upper case character each)\n";
        print "      threshold is an optional threshold value (sum of amino_acid percentages)\n";
        print "      protein_sequence_file is the file of protein sequence data\n";
        print "      output_file_prefix is a prefix to the two output files generated by $0\n";
        print "Exiting\n";
        exit;
    }

# Read in command line arguments
# Assume that first N-3 arguments are amino acids, given as one letter
# (upper case), separated by spaces.
# If there are less than 3 command line arguments, use defaul values specified later
# Assume that argument N-2 is the threshold percentage
# Assume that last two arguments are 
#      - protein sequence file
#      - prefix for files generated

    # Get the amino acids requred from the command line
    if ($#ARGV >= 2) {
        for ($i = 0; $i <= $#ARGV - 3; $i++) {
            $amino_acid = $ARGV[$i];
            $a[$i] = $amino_acid;
            print STDERR "Amino acid $i: $a[$i]\n";
        }
        $threshold = $ARGV[$#ARGV - 2];
        $num_amino_acids = $#ARGV - 3;
    }

    print "Threshold: $threshold\n";

    $file = $ARGV[$#ARGV - 1];
    print STDERR "Base: $file\n";

    open(INPUT, $file)
        or die "File $file not found - Exiting.\n";

    $base = $ARGV[$#ARGV];
    print STDERR "Prefix: $base\n";

    #exit();

# OUTPUT1 and OUTPUT3 are the full output files
# OUTPUT2 and OUTPUT4 are the output files in signalp server format
    open(OUTPUT1, "> $base\_long.txt");
    open(OUTPUT2, "> $base\_long\_50.txt");
    open(OUTPUT3, "> $base\_short.txt");
    open(OUTPUT4, "> $base\_short\_50.txt");

# Singalp server requires "EUK" on first line and "." on last line 
# of data files.
    print OUTPUT2 "EUK\n";
    print OUTPUT4 "EUK\n";

# Part 1: Get each of the protein sequences
# Variables:
#   $num_proteins - figures out the total number of proteins in the input stream
#   @protein_name - array of the names of each protein, 1 .. $num_proteins
#   @protein_sequence - array of sequences for each protein, 1 .. $num_proteins
#
# Other information:
#   - ">" is a marker in first position of a line to indicate that 
#     the protein name follows
#   - protein sequence could be spread over multiple lines
#   - white space needs to be removed


# Global variables (default values)

    $short_sequence_length_low = 55;
    $short_sequence_length_high = 75;

# Default values for amino acids and threshold if not specified on command line
    if ($#ARGV < 2) {
        $threshold = 50.0;
    }
    if ($#ARGV < 3) {
        @a[0]="T";
        @a[1]="P";
        @a[2]="A";
        @a[3]="S";
        $num_amino_acids = 4;
    }
    $name = "";
    
    # Go through each line of the database
    while ($line = <INPUT>) {
        chomp($line);

        # Is the line the name of a new sequence?
        $isname = index($line,">");
 
        # If it is, then finish the analysis of the previous sequence
        if ($isname eq 0) {

            # Save the previous sequence with white space omitted
            $sequence =~ s/\*//g;
            $len = length($sequence);

            # Now see if the seqence matches the criteria
            # $count is the total percentage of specified amino acids in the sequence
            # $num{$key} is the number of $key specified amino acids in the sequence
            # $per{$key} is the percentage of $key specified amino acids in the sequence
            $count = 0;
            if (($len > 0) && (substr ($sequence, 0, 1) eq "M")) {
                foreach $key (@a)
                {
                    $num{$key} = count_amino($key, $sequence);
                    $per{$key} = $num{$key}*100.0/$len;
                    $count += $per{$key};
                }

                # Two alternative criteria for saving a sequence
                # 1. Total percentage greater than or equal to threshold
                # 2. The sequence length is between the two bounds
                # Note that a sequence may meet both criteria
                # and hence be placed in both sets of output files
                if ( ($count >= $threshold)
                || ( (length($sequence) <= $short_sequence_length_high)
                && (length($sequence) >= $short_sequence_length_low) ) ) {

                    # If criteria 1 met, save to OUTPUT1 and OUTPUT2
                    if ($count >= $threshold) {
                            print_data (OUTPUT1, OUTPUT2);
                    }

                    # If criteria 2 met, save to OUTPUT1 and OUTPUT2
                    if ( (length($sequence) <= $short_sequence_length_high)
                        && (length($sequence) >= $short_sequence_length_low)) {
                        print_data (OUTPUT3, OUTPUT4);
                    }
                }
            }
            # The name of the sequence is everything after the ">"
            # Clear the sequence in preparation for reading the next one
            $name = substr($line, 1);
            $sequence = "";
        }
        else
        {
            # Add the next fragment of the sequence to that read in so far
            $subsequence = $line;
            $sequence = $sequence . $subsequence;
        }
    }
# and finish off the last sequence when we reach the end of the file
# Basically, repeat the above.
    $sequence =~ s/\*//g;
    $len = length($sequence);
    $count = 0;
    if (($len > 0) && (substr ($sequence, 0, 1) eq "M")) {
        foreach $key (@a)
        {
            $num{$key} = count_amino($key, $sequence);
            $per{$key} = $num{$key}*100.0/$len;
            $count += $per{$key};
        }

        if ( ($count >= $threshold)
        || ((length($sequence) < $short_sequence_length_high)
        && (length($sequence) > $short_sequence_length_low))) {

            if ($count >= $threshold) {
                print_data (OUTPUT1, OUTPUT2);
            }

            if ((length($sequence) <= $short_sequence_length_high)
            && (length($sequence) >= $short_sequence_length_low)) {
                print_data (OUTPUT3, OUTPUT4);
            }
        }
    }

    print OUTPUT2 ".\n";
    print OUTPUT4 ".\n";
    close (INPUT);