From 94a90b0c45344d07b7830a086c4286d73aab7798 Mon Sep 17 00:00:00 2001
From: Brian Cully <bjc@kublai.com>
Date: Fri, 8 Jul 2022 18:08:46 -0400
Subject: Calculate codon starts from the start of the protein they're in.

Turns out that the start of the genome ain't necessarily the start of
a coding region.
---
 varscan2codon.pl | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/varscan2codon.pl b/varscan2codon.pl
index 2c490b6..e0eaaf8 100755
--- a/varscan2codon.pl
+++ b/varscan2codon.pl
@@ -36,27 +36,30 @@ while (<$variants>) {
     my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/;
     $name = $name_maybe if length($name_maybe) > 0;
 
-    # genome is 0-indexed, so decrement position.
-    my $start = $pos - 1;
-
-    # verify the nucleotide at $start is what's expected from the csv.
-    my $check_nt = substr $genome, $start, 1;
-    if ($check_nt ne $orig_nt) {
-        die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
-    }
-
-    my ($protein, $aa_index) = &find_protein($pos, %protein2pos);
+    my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos);
     if (!defined($protein)) {
         print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n";
         next;
     }
 
+    # find start of codon relative to the start of the protein.
+    my $start = $pos - $protein_start;
+
+    # convert start of protein index to 0-based.
+    $protein_start--;
+
     # convert position to the start of the codon and the offset of the
     # change within from the start.
     my $offset = $start % 3;
     $start -= $offset;
 
-    my $codon = substr $genome, $start, 3;
+    # verify the nucleotide at $start is what's expected from the csv.
+    my $check_nt = substr $genome, $start+$protein_start+$offset, 1;
+    if ($check_nt ne $orig_nt) {
+        die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n";
+    }
+
+    my $codon = substr $genome, $start+$protein_start, 3;
     my $orig_aa = $aa2letter{$codon2aa{$codon}};
     substr $codon, $offset, 1, $new_nt;
     my $new_aa = $aa2letter{$codon2aa{$codon}};
@@ -76,11 +79,12 @@ sub find_protein {
     foreach (keys %protein2pos) {
         my ($start, $end) = @{$protein2pos{$_}};
         if ($start <= $pos && $pos <= $end) {
-            my $aa_index = floor(($pos - $start) / 3);
-            return ($_, $aa_index);
+            # Start counting from 1, like normal people.
+            my $aa_index = floor(($pos - $start) / 3) + 1;
+            return ($_, $start, $aa_index);
         }
     }
-    return (undef, undef);
+    return (undef, undef, undef);
 }
 
 sub csv2kvp {
-- 
cgit v1.2.3