#!/usr/bin/env perl use strict; use warnings; use IO::File; use POSIX qw(floor); my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv'; my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv'; &usage unless $#ARGV == 2; my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME); my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME); my $genomefh = IO::File->new($ARGV[0]) or die "Couldn't open $ARGV[0] for reading: $!\n"; my $genome = <$genomefh>; chomp $genome; my $proteins = IO::File->new($ARGV[1]) or die "Couldn't open $ARGV[1] for reading: $!\n"; $_ = <$proteins>; my %protein2pos; while (<$proteins>) { chomp; my ($name, $start, $end) = split /,/; $protein2pos{$name} = [$start, $end]; } my $variants = IO::File->new($ARGV[2]) or die "Couldn't open $ARGV[2] for reading: $!\n"; my $name = 'unnamed-change'; while (<$variants>) { chomp; my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/; $name = $name_maybe if length($name_maybe) > 0; my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos); if (!defined($protein)) { print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n"; next; } # find start of codon relative to the start of the protein. my $start = $pos - $protein_start; # convert start of protein index to 0-based. $protein_start--; # convert position to the start of the codon and the offset of the # change within from the start. my $offset = $start % 3; $start -= $offset; # verify the nucleotide at $start is what's expected from the csv. my $check_nt = substr $genome, $start+$protein_start+$offset, 1; if ($check_nt ne $orig_nt) { print STDERR "warning: $name (line $.): nucleotide at position $pos is $check_nt, but expected $orig_nt.\n"; } my $codon = substr $genome, $start+$protein_start, 3; die "$. - couldn't find amino acid for codon '$codon'\n" unless ($codon2aa{$codon}); my $orig_aa = $aa2letter{$codon2aa{$codon}}; substr $codon, $offset, 1, $new_nt; my $new_aa = $aa2letter{$codon2aa{$codon}}; print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n"; } sub usage { print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n"; exit 1; } sub find_protein { my $pos = shift; my %protein2pos = @_; foreach (keys %protein2pos) { my ($start, $end) = @{$protein2pos{$_}}; if ($start <= $pos && $pos <= $end) { # Start counting from 1, like normal people. my $aa_index = floor(($pos - $start) / 3) + 1; return ($_, $start, $aa_index); } } return (undef, undef, undef); } sub csv2kvp { my ($filename) = @_; my $fh = IO::File->new($filename) or die "Couldn't open $filename for reading: $!\n"; # Drop the header line since we're returning a direct map of keys # to values. $_ = <$fh>; my %rc = (); while (<$fh>) { chomp; my ($k, $v) = split /,/; $rc{$k} = $v; } %rc }