1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
#!/usr/bin/env perl
use strict;
use warnings;
use IO::File;
use POSIX qw(floor);
my $CODON_TO_AA_FILENAME = 'codon-amino-acid.csv';
my $AA_TO_LETTER_FILENAME = 'amino-acid-letters.csv';
&usage unless $#ARGV == 2;
my %codon2aa = &csv2kvp($CODON_TO_AA_FILENAME);
my %aa2letter = &csv2kvp($AA_TO_LETTER_FILENAME);
my $genomefh = IO::File->new($ARGV[0])
or die "Couldn't open $ARGV[0] for reading: $!\n";
my $genome = <$genomefh>;
chomp $genome;
my $proteins = IO::File->new($ARGV[1])
or die "Couldn't open $ARGV[1] for reading: $!\n";
$_ = <$proteins>;
my %protein2pos;
while (<$proteins>) {
chomp;
my ($name, $start, $end) = split /,/;
$protein2pos{$name} = [$start, $end];
}
my $variants = IO::File->new($ARGV[2])
or die "Couldn't open $ARGV[1] for reading: $!\n";
my $name = 'unnamed-change';
while (<$variants>) {
chomp;
my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
$name = $name_maybe if length($name_maybe) > 0;
my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos);
if (!defined($protein)) {
print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
next;
}
# find start of codon relative to the start of the protein.
my $start = $pos - $protein_start;
# convert start of protein index to 0-based.
$protein_start--;
# convert position to the start of the codon and the offset of the
# change within from the start.
my $offset = $start % 3;
$start -= $offset;
# verify the nucleotide at $start is what's expected from the csv.
my $check_nt = substr $genome, $start+$protein_start+$offset, 1;
if ($check_nt ne $orig_nt) {
print STDERR "warning: $name (line $.): nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
}
my $codon = substr $genome, $start+$protein_start, 3;
die "$. - couldn't find amino acid for codon '$codon'\n" unless ($codon2aa{$codon});
my $orig_aa = $aa2letter{$codon2aa{$codon}};
substr $codon, $offset, 1, $new_nt;
my $new_aa = $aa2letter{$codon2aa{$codon}};
print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\t$protein\t" . $orig_aa . $aa_index . $new_aa . "\n";
}
sub usage {
print STDERR "Usage: $0 genome protein-sequence.csv variants.csv\n";
exit 1;
}
sub find_protein {
my $pos = shift;
my %protein2pos = @_;
foreach (keys %protein2pos) {
my ($start, $end) = @{$protein2pos{$_}};
if ($start <= $pos && $pos <= $end) {
# Start counting from 1, like normal people.
my $aa_index = floor(($pos - $start) / 3) + 1;
return ($_, $start, $aa_index);
}
}
return (undef, undef, undef);
}
sub csv2kvp {
my ($filename) = @_;
my $fh = IO::File->new($filename)
or die "Couldn't open $filename for reading: $!\n";
# Drop the header line since we're returning a direct map of keys
# to values.
$_ = <$fh>;
my %rc = ();
while (<$fh>) {
chomp;
my ($k, $v) = split /,/;
$rc{$k} = $v;
}
%rc
}
|