summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Cully <bjc@kublai.com>2022-07-08 18:08:46 -0400
committerBrian Cully <bjc@kublai.com>2022-07-08 18:08:46 -0400
commit94a90b0c45344d07b7830a086c4286d73aab7798 (patch)
tree8ec088b906d23e3fa6dd258f4070a9054a6cccbf
parent79d5aa9716881932c225c911d775797f4a21f7f7 (diff)
downloadfav-94a90b0c45344d07b7830a086c4286d73aab7798.tar.gz
fav-94a90b0c45344d07b7830a086c4286d73aab7798.zip
Calculate codon starts from the start of the protein they're in.
Turns out that the start of the genome ain't necessarily the start of a coding region.
-rwxr-xr-xvarscan2codon.pl32
1 files changed, 18 insertions, 14 deletions
diff --git a/varscan2codon.pl b/varscan2codon.pl
index 2c490b6..e0eaaf8 100755
--- a/varscan2codon.pl
+++ b/varscan2codon.pl
@@ -36,27 +36,30 @@ while (<$variants>) {
my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
$name = $name_maybe if length($name_maybe) > 0;
- # genome is 0-indexed, so decrement position.
- my $start = $pos - 1;
-
- # verify the nucleotide at $start is what's expected from the csv.
- my $check_nt = substr $genome, $start, 1;
- if ($check_nt ne $orig_nt) {
- die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
- }
-
- my ($protein, $aa_index) = &find_protein($pos, %protein2pos);
+ my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos);
if (!defined($protein)) {
print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
next;
}
+ # find start of codon relative to the start of the protein.
+ my $start = $pos - $protein_start;
+
+ # convert start of protein index to 0-based.
+ $protein_start--;
+
# convert position to the start of the codon and the offset of the
# change within from the start.
my $offset = $start % 3;
$start -= $offset;
- my $codon = substr $genome, $start, 3;
+ # verify the nucleotide at $start is what's expected from the csv.
+ my $check_nt = substr $genome, $start+$protein_start+$offset, 1;
+ if ($check_nt ne $orig_nt) {
+ die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
+ }
+
+ my $codon = substr $genome, $start+$protein_start, 3;
my $orig_aa = $aa2letter{$codon2aa{$codon}};
substr $codon, $offset, 1, $new_nt;
my $new_aa = $aa2letter{$codon2aa{$codon}};
@@ -76,11 +79,12 @@ sub find_protein {
foreach (keys %protein2pos) {
my ($start, $end) = @{$protein2pos{$_}};
if ($start <= $pos && $pos <= $end) {
- my $aa_index = floor(($pos - $start) / 3);
- return ($_, $aa_index);
+ # Start counting from 1, like normal people.
+ my $aa_index = floor(($pos - $start) / 3) + 1;
+ return ($_, $start, $aa_index);
}
}
- return (undef, undef);
+ return (undef, undef, undef);
}
sub csv2kvp {