From 94a90b0c45344d07b7830a086c4286d73aab7798 Mon Sep 17 00:00:00 2001 From: Brian Cully Date: Fri, 8 Jul 2022 18:08:46 -0400 Subject: Calculate codon starts from the start of the protein they're in. Turns out that the start of the genome ain't necessarily the start of a coding region. --- varscan2codon.pl | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/varscan2codon.pl b/varscan2codon.pl index 2c490b6..e0eaaf8 100755 --- a/varscan2codon.pl +++ b/varscan2codon.pl @@ -36,27 +36,30 @@ while (<$variants>) { my ($name_maybe, $ref, $pos, $orig_nt, $new_nt) = split /,/; $name = $name_maybe if length($name_maybe) > 0; - # genome is 0-indexed, so decrement position. - my $start = $pos - 1; - - # verify the nucleotide at $start is what's expected from the csv. - my $check_nt = substr $genome, $start, 1; - if ($check_nt ne $orig_nt) { - die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n"; - } - - my ($protein, $aa_index) = &find_protein($pos, %protein2pos); + my ($protein, $protein_start, $aa_index) = &find_protein($pos, %protein2pos); if (!defined($protein)) { print "$name\t" . lc($orig_nt) . $pos . lc($new_nt) . "\tnon-coding\n"; next; } + # find start of codon relative to the start of the protein. + my $start = $pos - $protein_start; + + # convert start of protein index to 0-based. + $protein_start--; + # convert position to the start of the codon and the offset of the # change within from the start. my $offset = $start % 3; $start -= $offset; - my $codon = substr $genome, $start, 3; + # verify the nucleotide at $start is what's expected from the csv. + my $check_nt = substr $genome, $start+$protein_start+$offset, 1; + if ($check_nt ne $orig_nt) { + die "$name: nucleotide at position $pos is $check_nt, but expected $orig_nt.\n"; + } + + my $codon = substr $genome, $start+$protein_start, 3; my $orig_aa = $aa2letter{$codon2aa{$codon}}; substr $codon, $offset, 1, $new_nt; my $new_aa = $aa2letter{$codon2aa{$codon}}; @@ -76,11 +79,12 @@ sub find_protein { foreach (keys %protein2pos) { my ($start, $end) = @{$protein2pos{$_}}; if ($start <= $pos && $pos <= $end) { - my $aa_index = floor(($pos - $start) / 3); - return ($_, $aa_index); + # Start counting from 1, like normal people. + my $aa_index = floor(($pos - $start) / 3) + 1; + return ($_, $start, $aa_index); } } - return (undef, undef); + return (undef, undef, undef); } sub csv2kvp { -- cgit v1.2.3