varscan2codon.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

#!/usr/bin/env perl

use strict;
use warnings;

use IO::File;
use POSIX qw(floor);

my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';

&usage unless $#ARGV == 2;

my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME);
my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME);

my $genomefh = IO::File->new($ARGV[0])
    or die "Couldn't open $ARGV[0] for reading: $!\n";
my $genome = <$genomefh>;
chomp $genome;

my $proteins = IO::File->new($ARGV[1])
    or die "Couldn't open $ARGV[1] for reading: $!\n";
$_ = <$proteins>;
my %protein2pos;
while (<$proteins>) {
    chomp;
    my ($name, $start, $end) = split /,/;
    $protein2pos{$name} = [$start, $end];
}

my $variants = IO::File->new($ARGV[2])
    or die "Couldn't open $ARGV[1] for reading: $!\n";
my $name = 'unnamed-change';
while (<$variants>) {
    chomp;
    my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
    $name = $name_maybe if length($name_maybe) > 0;

    my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos);
    if (!defined($protein)) {
        print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
        next;
    }

    # find start of codon relative to the start of the protein.
    my $start = $pos - $protein_start;

    # convert start of protein index to 0-based.
    $protein_start--;

    # convert position to the start of the codon and the offset of the
    # change within from the start.
    my $offset = $start % 3;
    $start -= $offset;

    # verify the nucleotide at $start is what's expected from the csv.
    my $check_nt = substr $genome, $start+$protein_start+$offset, 1;
    if ($check_nt ne $orig_nt) {
        print STDERR "warning: $name (line $.): nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
    }

    my $codon = substr $genome, $start+$protein_start, 3;
    die "$. - couldn't find amino acid for codon '$codon'\n" unless ($codon2aa{$codon});

    my $orig_aa = $aa2letter{$codon2aa{$codon}};
    substr $codon, $offset, 1, $new_nt;
    my $new_aa = $aa2letter{$codon2aa{$codon}};
    print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n";
}

sub usage {
    print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n";
    exit 1;
}

sub find_protein {
    my $pos = shift;
    my %protein2pos = @_;

    foreach (keys %protein2pos) {
        my ($start, $end) = @{$protein2pos{$_}};
        if ($start <= $pos && $pos <= $end) {
            # Start counting from 1, like normal people.
            my $aa_index = floor(($pos - $start) / 3) + 1;
            return ($_, $start, $aa_index);
        }
    }
    return (undef, undef, undef);
}

sub csv2kvp {
    my ($filename) = @_;

    my $fh = IO::File->new($filename)
        or die "Couldn't open $filename for reading: $!\n";

    # Drop the header line since we're returning a direct map of keys
    # to values.
    $_ = <$fh>;

    my %rc = ();
    while (<$fh>) {
        chomp;
        my ($k, $v) = split /,/;
        $rc{$k} = $v;
    }

    %rc
}