summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xvarscan2codon.pl107
1 files changed, 107 insertions, 0 deletions
diff --git a/varscan2codon.pl b/varscan2codon.pl
new file mode 100755
index 0000000..a7edad7
--- /dev/null
+++ b/varscan2codon.pl
@@ -0,0 +1,107 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+use IO::File;
+use POSIX qw(floor);
+
+my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
+my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';
+
+&usage unless $#ARGV == 2;
+
+my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME);
+my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME);
+
+my $genomefh = IO::File->new($ARGV[0])
+ or die "Couldn't open $ARGV[0] for reading: $!\n";
+my $genome = <$genomefh>;
+chomp $genome;
+
+my $proteins = IO::File->new($ARGV[1])
+ or die "Couldn't open $ARGV[1] for reading: $!\n";
+$_ = <$proteins>;
+my %protein2pos;
+while (<$proteins>) {
+ chomp;
+ my ($name, $start, $end) = split /,/;
+ $protein2pos{$name} = [$start, $end];
+}
+
+my $variants = IO::File->new($ARGV[2])
+ or die "Couldn't open $ARGV[1] for reading: $!\n";
+my $name = 'unnamed-change';
+while (<$variants>) {
+ my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
+ $name = $name_maybe if length($name_maybe) > 0;
+
+ # genome is 0-indexed, so decrement position.
+ my $start = $pos - 1;
+
+ # verify the nucleotide at $start is what's expected from the csv.
+ my $check_nt = substr $genome, $start, 1;
+ if ($check_nt ne $orig_nt) {
+ die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
+ }
+
+ my ($protein, $aa_index) = &find_protein($pos, %protein2pos);
+ if (!defined($protein)) {
+ print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
+ next;
+ }
+
+ # convert position to the start of the codon and the offset of the
+ # change within from the start.
+ my $offset = $start % 3;
+ $start -= $offset;
+
+ my $codon = substr $genome, $start, 3;
+ my $check_aa = $aa2letter{$codon2aa{$codon}};
+
+ my $tmp = $codon;
+ my $orig_aa = $aa2letter{$codon2aa{$codon}};
+ substr $codon, $offset, 1, $new_nt;
+ my $new_aa = $aa2letter{$codon2aa{$codon}};
+ my $codon_pos = floor($pos / 3);
+ print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n";
+}
+
+sub usage {
+ print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n";
+ exit 1;
+}
+
+sub find_protein {
+ my $pos = shift;
+ my %protein2pos = @_;
+
+ foreach (keys %protein2pos) {
+ my ($start, $end) = @{$protein2pos{$_}};
+ if ($start <= $pos && $pos <= $end) {
+ my $aa_index = floor(($pos - $start) / 3);
+ return ($_, $aa_index);
+ }
+ }
+ return (undef, undef);
+}
+
+sub csv2kvp {
+ my ($filename) = @_;
+
+ my $fh = IO::File->new($filename)
+ or die "Couldn't open $filename for reading: $!\n";
+
+ # Drop the header line since we're returning a direct map of keys
+ # to values.
+ $_ = <$fh>;
+
+ my %rc = ();
+ while (<$fh>) {
+ chomp;
+ my ($k, $v) = split /,/;
+ $rc{$k} = $v;
+ }
+
+ %rc
+}