1 files changed, 107 insertions, 0 deletions
diff --git a/varscan2codon.pl b/varscan2codon.pl
new file mode 100755
index 0000000..a7edad7
--- /dev/null
+++ b/varscan2codon.pl
@@ -0,0 +1,107 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use IO::File;
+use POSIX qw(floor);
+
+my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
+my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';
+
+&usage unless $#ARGV == 2;
+
+my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME);
+my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME);
+
+my $genomefh = IO::File->new($ARGV[0])
+    or die "Couldn't open $ARGV[0] for reading: $!\n";
+my $genome = <$genomefh>;
+chomp $genome;
+
+my $proteins = IO::File->new($ARGV[1])
+    or die "Couldn't open $ARGV[1] for reading: $!\n";
+$_ = <$proteins>;
+my %protein2pos;
+while (<$proteins>) {
+    chomp;
+    my ($name, $start, $end) = split /,/;
+    $protein2pos{$name} = [$start, $end];
+}
+
+my $variants = IO::File->new($ARGV[2])
+    or die "Couldn't open $ARGV[1] for reading: $!\n";
+my $name = 'unnamed-change';
+while (<$variants>) {
+    my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
+    $name = $name_maybe if length($name_maybe) > 0;
+
+    # genome is 0-indexed, so decrement position.
+    my $start = $pos - 1;
+
+    # verify the nucleotide at $start is what's expected from the csv.
+    my $check_nt = substr $genome, $start, 1;
+    if ($check_nt ne $orig_nt) {
+        die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
+    }
+
+    my ($protein, $aa_index) = &find_protein($pos, %protein2pos);
+    if (!defined($protein)) {
+        print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
+        next;
+    }
+
+    # convert position to the start of the codon and the offset of the
+    # change within from the start.
+    my $offset = $start % 3;
+    $start -= $offset;
+
+    my $codon = substr $genome, $start, 3;
+    my $check_aa = $aa2letter{$codon2aa{$codon}};
+
+    my $tmp = $codon;
+    my $orig_aa = $aa2letter{$codon2aa{$codon}};
+    substr $codon, $offset, 1, $new_nt;
+    my $new_aa = $aa2letter{$codon2aa{$codon}};
+    my $codon_pos = floor($pos / 3);
+    print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n";
+}
+
+sub usage {
+    print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n";
+    exit 1;
+}
+
+sub find_protein {
+    my $pos = shift;
+    my %protein2pos = @_;
+
+    foreach (keys %protein2pos) {
+        my ($start, $end) = @{$protein2pos{$_}};
+        if ($start <= $pos && $pos <= $end) {
+            my $aa_index = floor(($pos - $start) / 3);
+            return ($_, $aa_index);
+        }
+    }
+    return (undef, undef);
+}
+
+sub csv2kvp {
+    my ($filename) = @_;
+
+    my $fh = IO::File->new($filename)
+        or die "Couldn't open $filename for reading: $!\n";
+
+    # Drop the header line since we're returning a direct map of keys
+    # to values.
+    $_ = <$fh>;
+
+    my %rc = ();
+    while (<$fh>) {
+        chomp;
+        my ($k, $v) = split /,/;
+        $rc{$k} = $v;
+    }
+
+    %rc
+}