#!/usr/bin/env perl use strict; use warnings; use Data::Dumper; use IO::File; my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv'; my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv'; &usage unless $#ARGV == 1; my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME); my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME); my $genomefh = IO::File->new($ARGV[0]) or die "Couldn't open $ARGV[0] for reading: $!\n"; my $genome = <$genomefh>; chomp $genome; my $sequences = IO::File->new($ARGV[1]) or die "Couldn't open $ARGV[1] for reading: $!\n"; $_ = <$sequences>; while (<$sequences>) { chomp; my ($name, $start, $end) = split /,/; # Lengths are inclusive, so add 1 to them. my $protein_len = $end - $start + 1; unless ($protein_len % 3 == 0) { print STDERR "Protein $name has improper length $protein_len.\n"; next; } my $aaseq = ''; # Decrement $start to change from the input's 1-indexed array to # perl's 0-indexed. for ($start--; $start < $end; $start += 3) { my $codon = substr $genome, $start, 3; my $aa = $aa2letter{$codon2aa{$codon}}; $aaseq .= $aa; } print "$name,$aaseq\n"; } sub usage { print STDERR "Usage: $0 genome protein-sequence.csv\n"; exit 1; } sub csv2kvp { my ($filename) = @_; my $fh = IO::File->new($filename) or die "Couldn't open $filename for reading: $!\n"; # Drop the header line since we're returning a direct map of keys # to values. $_ = <$fh>; my %rc = (); while (<$fh>) { chomp; my ($k, $v) = split /,/; $rc{$k} = $v; } %rc }