blob: a9b9e8d683c371b3bd6745748c1e084fc283a60e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
|
#!/usr/bin/env perl
use strict;
use warnings;
use Data::Dumper;
use IO::File;
my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';
&usage unless $#ARGV == 1;
my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME);
my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME);
my $genomefh = IO::File->new($ARGV[0])
or die "Couldn't open $ARGV[0] for reading: $!\n";
my $genome = <$genomefh>;
chomp $genome;
my $sequences = IO::File->new($ARGV[1])
or die "Couldn't open $ARGV[1] for reading: $!\n";
$_ = <$sequences>;
while (<$sequences>) {
chomp;
my ($name, $start, $end) = split /,/;
# Lengths are inclusive, so add 1 to them.
my $protein_len = $end - $start + 1;
unless ($protein_len % 3 == 0) {
print STDERR "Protein $name has improper length $protein_len.\n";
next;
}
my $aaseq = '';
# Decrement $start to change from the input's 1-indexed array to
# perl's 0-indexed.
for ($start--; $start < $end; $start += 3) {
my $codon = substr $genome, $start, 3;
my $aa = $aa2letter{$codon2aa{$codon}};
$aaseq .= $aa;
}
print "$name,$aaseq\n";
}
sub usage {
print STDERR "Usage: $0 genome protein-sequence.csv\n";
exit 1;
}
sub csv2kvp {
my ($filename) = @_;
my $fh = IO::File->new($filename)
or die "Couldn't open $filename for reading: $!\n";
# Drop the header line since we're returning a direct map of keys
# to values.
$_ = <$fh>;
my %rc = ();
while (<$fh>) {
chomp;
my ($k, $v) = split /,/;
$rc{$k} = $v;
}
%rc
}
|