From 630d01364bdb9128fed07a5efa8f3be8091b50f5 Mon Sep 17 00:00:00 2001 From: Brian Cully Date: Fri, 1 Jul 2022 15:51:44 -0400 Subject: Add script for printing a protein's codon sequence. --- codon2aa.pl | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100755 codon2aa.pl diff --git a/codon2aa.pl b/codon2aa.pl new file mode 100755 index 0000000..a9b9e8d --- /dev/null +++ b/codon2aa.pl @@ -0,0 +1,70 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use Data::Dumper; +use IO::File; + +my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv'; +my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv'; + +&usage unless $#ARGV == 1; + +my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME); +my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME); + +my $genomefh = IO::File->new($ARGV[0]) + or die "Couldn't open $ARGV[0] for reading: $!\n"; +my $genome = <$genomefh>; +chomp $genome; + +my $sequences = IO::File->new($ARGV[1]) + or die "Couldn't open $ARGV[1] for reading: $!\n"; +$_ = <$sequences>; +while (<$sequences>) { + chomp; + my ($name, $start, $end) = split /,/; + + # Lengths are inclusive, so add 1 to them. + my $protein_len = $end - $start + 1; + unless ($protein_len % 3 == 0) { + print STDERR "Protein $name has improper length $protein_len.\n"; + next; + } + + my $aaseq = ''; + # Decrement $start to change from the input's 1-indexed array to + # perl's 0-indexed. + for ($start--; $start < $end; $start += 3) { + my $codon = substr $genome, $start, 3; + my $aa = $aa2letter{$codon2aa{$codon}}; + $aaseq .= $aa; + } + print "$name,$aaseq\n"; +} + +sub usage { + print STDERR "Usage: $0 genome protein-sequence.csv\n"; + exit 1; +} + +sub csv2kvp { + my ($filename) = @_; + + my $fh = IO::File->new($filename) + or die "Couldn't open $filename for reading: $!\n"; + + # Drop the header line since we're returning a direct map of keys + # to values. + $_ = <$fh>; + + my %rc = (); + while (<$fh>) { + chomp; + my ($k, $v) = split /,/; + $rc{$k} = $v; + } + + %rc +} -- cgit v1.2.3