codon2aa.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

#!/usr/bin/env perl

use strict;
use warnings;

use Data::Dumper;
use IO::File;

# Translate protein-coding regions of a genome into one-letter amino acid
# sequences.
#
# Command line: <genome file> <protein list CSV>
#
#   * The genome file supplies the nucleotide string.  Only its first line is
#     read, so the whole genome must be on a single line.
#   * The protein list has a header line followed by "name,start,end" records,
#     where start and end are 1-indexed, inclusive positions in the genome.
#
# For every valid record a line "name,sequence" is printed to standard
# output.  Records that cannot be translated are reported on standard error
# and skipped.

# Lookup tables, read from the current working directory.
my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';

usage() unless @ARGV == 2;

my %codon2aa = csv2kvp($CODON_TO_AA_FILENAME);
my %aa2letter = csv2kvp($AA_TO_LETTER_FILENAME);

my $genomefh = IO::File->new($ARGV[0])
    or die "Couldn't open $ARGV[0] for reading: $!\n";
my $genome = <$genomefh>;
die "Genome file $ARGV[0] is empty.\n" unless defined $genome;
# Strip LF as well as CRLF line endings.
$genome =~ s/[\r\n]+\z//;
$genomefh->close;
my $genome_len = length $genome;

my $sequences = IO::File->new($ARGV[1])
    or die "Couldn't open $ARGV[1] for reading: $!\n";

# Discard the header line of the protein list.
<$sequences>;

while (my $record = <$sequences>) {
    $record =~ s/[\r\n]+\z//;

    # Tolerate blank lines (for example a trailing newline at end of file).
    next unless $record =~ /\S/;

    my ($name, $start, $end) = split /,/, $record;

    # Both coordinates must be present and be plain non-negative integers,
    # otherwise the arithmetic below would operate on garbage.
    unless (defined $end
        && $start =~ /\A\s*\d+\s*\z/
        && $end =~ /\A\s*\d+\s*\z/)
    {
        print STDERR "Skipping malformed line $.: $record\n";
        next;
    }

    # Lengths are inclusive, so add 1 to them.
    my $protein_len = $end - $start + 1;
    unless ($protein_len % 3 == 0) {
        print STDERR "Protein $name has improper length $protein_len.\n";
        next;
    }

    # A start below 1 would make substr count from the end of the genome and
    # an end past the genome would silently truncate the protein.
    if ($start < 1 || $end < $start || $end > $genome_len) {
        print STDERR "Protein $name has invalid range $start-$end "
            . "(genome length $genome_len).\n";
        next;
    }

    my $aaseq = '';
    # Begin at $start - 1 to change from the input's 1-indexed positions to
    # perl's 0-indexed strings; $end (inclusive, 1-indexed) then serves as
    # the exclusive 0-indexed upper bound.
    for (my $pos = $start - 1; $pos < $end; $pos += 3) {
        my $codon = substr $genome, $pos, 3;

        my $aa_name = $codon2aa{$codon};
        my $letter = defined $aa_name ? $aa2letter{$aa_name} : undef;

        # A codon without a one-letter code contributes nothing to the
        # output; say so explicitly rather than relying on perl's
        # "uninitialized value" warnings.
        unless (defined $letter) {
            print STDERR "Protein $name: codon $codon at position "
                . ($pos + 1) . " has no amino acid letter; omitted.\n";
            next;
        }

        $aaseq .= $letter;
    }
    print "$name,$aaseq\n";
}

$sequences->close;

# Print a one-line synopsis of the expected command-line arguments to
# standard error and terminate the program with exit status 1.  This
# subroutine never returns.
sub usage {
    my $synopsis = "Usage: $0 genome protein-sequence.csv\n";
    print {*STDERR} $synopsis;
    exit 1;
}

# Load a two-column CSV file into a key => value mapping.
#
# The first line of the file is treated as a header and discarded.  For each
# remaining line the first comma-separated field becomes the key and the
# second field becomes the value; any further fields are ignored.  When a key
# occurs more than once the last occurrence wins.  Blank lines are skipped.
#
# Parameters:
#   $filename - path of the CSV file to read.
#
# Returns the mapping as a flat key/value list, suitable for assigning
# directly to a hash.
#
# Dies if the file cannot be opened.
sub csv2kvp {
    my ($filename) = @_;

    my $fh = IO::File->new($filename)
        or die "Couldn't open $filename for reading: $!\n";

    # Drop the header line since we're returning a direct map of keys
    # to values.
    <$fh>;

    my %rc = ();
    # Read into a lexical rather than the global $_, so that a caller which
    # is itself iterating with $_ does not have its data overwritten.
    while (my $line = <$fh>) {
        # Strip LF as well as CRLF endings; a stray "\r" left on the value
        # would make later hash lookups keyed on it miss.
        $line =~ s/[\r\n]+\z//;

        my ($k, $v) = split /,/, $line;

        # Blank lines (and lines consisting only of commas) yield no key.
        next unless defined $k;

        $rc{$k} = $v;
    }

    $fh->close;

    return %rc;
}