diff options
-rwxr-xr-x | varscan2codon.pl | 107 |
1 files changed, 107 insertions, 0 deletions
diff --git a/varscan2codon.pl b/varscan2codon.pl new file mode 100755 index 0000000..a7edad7 --- /dev/null +++ b/varscan2codon.pl @@ -0,0 +1,107 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use IO::File; +use POSIX qw(floor); + +my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv'; +my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv'; + +&usage unless $#ARGV == 2; + +my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME); +my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME); + +my $genomefh = IO::File->new($ARGV[0]) + or die "Couldn't open $ARGV[0] for reading: $!\n"; +my $genome = <$genomefh>; +chomp $genome; + +my $proteins = IO::File->new($ARGV[1]) + or die "Couldn't open $ARGV[1] for reading: $!\n"; +$_ = <$proteins>; +my %protein2pos; +while (<$proteins>) { + chomp; + my ($name, $start, $end) = split /,/; + $protein2pos{$name} = [$start, $end]; +} + +my $variants = IO::File->new($ARGV[2]) + or die "Couldn't open $ARGV[1] for reading: $!\n"; +my $name = 'unnamed-change'; +while (<$variants>) { + my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/; + $name = $name_maybe if length($name_maybe) > 0; + + # genome is 0-indexed, so decrement position. + my $start = $pos - 1; + + # verify the nucleotide at $start is what's expected from the csv. + my $check_nt = substr $genome, $start, 1; + if ($check_nt ne $orig_nt) { + die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n"; + } + + my ($protein, $aa_index) = &find_protein($pos, %protein2pos); + if (!defined($protein)) { + print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n"; + next; + } + + # convert position to the start of the codon and the offset of the + # change within from the start. + my $offset = $start % 3; + $start -= $offset; + + my $codon = substr $genome, $start, 3; + my $check_aa = $aa2letter{$codon2aa{$codon}}; + + my $tmp = $codon; + my $orig_aa = $aa2letter{$codon2aa{$codon}}; + substr $codon, $offset, 1, $new_nt; + my $new_aa = $aa2letter{$codon2aa{$codon}}; + my $codon_pos = floor($pos / 3); + print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n"; +} + +sub usage { + print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n"; + exit 1; +} + +sub find_protein { + my $pos = shift; + my %protein2pos = @_; + + foreach (keys %protein2pos) { + my ($start, $end) = @{$protein2pos{$_}}; + if ($start <= $pos && $pos <= $end) { + my $aa_index = floor(($pos - $start) / 3); + return ($_, $aa_index); + } + } + return (undef, undef); +} + +sub csv2kvp { + my ($filename) = @_; + + my $fh = IO::File->new($filename) + or die "Couldn't open $filename for reading: $!\n"; + + # Drop the header line since we're returning a direct map of keys + # to values. + $_ = <$fh>; + + my %rc = (); + while (<$fh>) { + chomp; + my ($k, $v) = split /,/; + $rc{$k} = $v; + } + + %rc +} |