#!/home/erik/bin/python3
# classifier2.py -- beta version of the new classifier (CGI script).
#
# Provenance: commit 260907e495d84492a5fe930311f7dda836a7ffde by elavington
# <27739361+elavington@users.noreply.github.com>, Thu, 29 Mar 2018 19:50:35 -0400,
# "uploaded beta version of new classifier".
# Commit notes: all-new classifier and updated training data source. Two genera
# from Smacoviridae and one genus from Bacilladnaviridae are included;
# Gemyvongvirus is excluded. Genera were excluded based on sample size (< 5).

#%% Load libraries
import cgi
import cgitb
import html

from sklearn.externals import joblib
import pandas as pd
import numpy as np
from keras.models import Sequential, load_model
from keras.layers import Embedding, Conv1D, Dense, Flatten
from keras.initializers import RandomNormal
from keras.optimizers import RMSprop
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

# Fallback query used whenever the form carries no usable FASTA record.
DEFAULT_NAME = 'AAF97593.1'
DEFAULT_SEQ = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"


def _parse_fasta(text):
    """Split FASTA-formatted *text* into parallel lists (names, sequences).

    A record is the text between two ">" markers. Its first line is the
    sequence name; all following lines are concatenated into the sequence.
    Any line-ending convention is accepted and surrounding whitespace on
    each line is dropped. Empty records (e.g. from ">>") are skipped.
    """
    names = []
    seqs = []
    for record in text.split(">"):
        if not record.strip():
            continue  # leading empty chunk before the first ">" or a blank record
        lines = record.splitlines()
        names.append(lines[0].strip())
        seqs.append("".join(line.strip() for line in lines[1:]))
    return names, seqs


#----------------------------------------------
#  Parse the web-form information to variables
#----------------------------------------------
cgitb.enable(display=1, logdir="/var/www/html/bin/")
form = cgi.FieldStorage()

# A missing/blank 'fasta' field yields the default "" instead of None, so the
# string methods below cannot raise AttributeError.
alignment = form.getvalue('fasta', "")
if isinstance(alignment, list):
    alignment = alignment[0]  # field submitted more than once: use the first value
if isinstance(alignment, bytes):
    alignment = alignment.decode("utf-8", errors="replace")
alignment = alignment.strip()

nameList, seqList = [], []
if alignment.startswith(">"):  # naive check for FASTA format
    nameList, seqList = _parse_fasta(alignment)

if not seqList:  # no FASTA input, or no record found: fall back to the default
    nameList = [DEFAULT_NAME]
    seqList = [DEFAULT_SEQ]

# One length per sequence, always derived from the sequence that is actually
# classified (previously this was overwritten with the length of a single
# character of the raw form text).
lenList = [str(len(seq)) for seq in seqList]

# The names come straight from the request and are later formatted into the
# text/html results table, so neutralise any markup in them here.
nameList = [html.escape(name) for name in nameList]

#%% Constants for the models
max_len = 573

#%% load the models
GeneModel = load_model("./GeneModel.model")
# NOTE(review): the genus model is loaded from a file named
# "Gene_Rep_vs_Other.model" -- confirm this is the intended model file.
GenusModel = load_model("./Gene_Rep_vs_Other.model")
# WARNING: joblib.load unpickles the file; only load trusted files.
# `le` is not used in the visible part of this script.
le = joblib.load("./LabelEncoder.pkl")

#%% transform the input for the models
# Residues are space-separated so one_hot() treats each one as its own token.
# The encoding is applied exactly once: feeding its integer output back into
# one_hot() (as the duplicated line did) fails because one_hot() expects text.
# NOTE(review): one_hot() hashes tokens with Python's built-in hash() by
# default, which is salted per process for str -- confirm the encoding matches
# the one used at training time (e.g. a fixed PYTHONHASHSEED).
X = [one_hot(" ".join(seq), n=26, lower=False) for seq in seqList]
X = pad_sequences(X, maxlen=max_len, value=0, padding="post")

#%% make predictions
rep_pred = GeneModel.predict_classes(X)
genus_prob = [max(x) for x in GenusModel.predict(X)]
genus_pred = GenusModel.predict_classes(X)


#----------------------------------------------
#  Build HTML table of results
#----------------------------------------------
# Legacy one-line summary, kept commented out:
#results="

Entered Text Content Seq Name is {0} length {1}

".format(nameList,predictions) +resultsTable="" +resultsTable+=""" + + + + + + + + + +""" + + +for k in range(len(nameList)): + resultsTable+="".format(nameList[k],lenList[k],rep_pred[k],genus_pred[k],genus_prob[k]) + +resultsTable+="
Sequence NameLengthRep?Predicted GenusGenus Probability
{0}{1}{2}
" + + +#----------------------------------------------\ +# Build output page \ +# \_______________________________________________________ +# | +#build output page parts +#Header and CSS Style bits + +header="""Content-type:text/html + + + + +CRESS virus home + + + + +""" +#----------------------------------------------\ +# Build output page \ +# \_______________________________________________________ +# | +#build output page parts +#Header and CSS Style bits + +header="""Content-type:text/html + + + + + + +""" +#Page contents, first part +body1=""" + +

Welcome to CRESSdna.org

+ + + +
+

Home

+

Part of the National Science Foundation's Assembling the Tree of Life.

+ Sponsored with a Grant from the National Science Foundation +
+ +
+

Circoviridae

+

+
+Many animal-infecting CRESS-DNA viruses are classified into the Circoviridae family. There are two genera within the group, the older Circovirus and the more recently codified Cyclovirus, but both are well represented. At least one disease of economic importance is associated with circovirus infections: post-weaning multisystemic wasting syndrome in pigs (caused in part by porcine circovirus 2, which is now largely controlled through vaccination in commercial hog production). However, several worldwide veterinary diseases are due to circoviruses, including beak and feather disease and fatal acute diarrhea in dogs. +

+
+ missing +
Gastrointestinal system of dogs infected with dog circovirus (DogCV) with hemorrhaging in stomach and intestines. CC-BY Li et al. 2013
+
+
+ missing +
Immune electron microscopy image of PCV2 (porcine circovirus 2) particles. CC-BY Guo et al. 2011
+
+

+While some of the environmental isolates assigned to Circoviridae have genomes over 3,000 and 4,000 bases, it also contains some of the smallest genomes of CRESS-DNA viruses - some well-studied circoviruses have genomes about 1700nt long, and circularized putative genomes from metagenomics studies can be even smaller. Most analyzed sequences have two ORFs: the replication-associated protein (Rep, also referred to as the replication initiator protein) and capsid protein (Cp or Cap), with some isolates having had a third ORF experimentally verified, and some sequences having many hypothetical ORFs called that have not yet been studied in the lab. +

+

+Both cycloviruses and circoviruses have non-enveloped, icosahedral virions of 15-25nm encapsidating their circular, ssDNA genomes, but while members of Circovirus are found infecting or associated with mammals, birds and fish, cycloviruses have been found infecting or associated with mammals, birds and insects. Sequences assigned to Circovirus have ambisense genomes, with the Rep gene in sense; sequences in Cyclovirus are typically ambisense in the opposite orientation (Rep gene in anti-sense). +

+

+A great primer on Circoviridae +

+

+For more information about Circovirus: +
+ICTV report on circovirus
+ExPASy ViralZone summary of circovirus +Type species: Porcine circovirus 1 (NC_001792.2) +

+

+For more information about Cyclovirus: +
+ICTV report on cyclovirus
+ExPASy ViralZone summary of cyclovirus +Type species: Human-associated cyclovirus 8 (KF031466) +

+
+ +
+

Nanoviridae

+

+The plant infecting CRESS-DNA viruses with more than two genomic segments belong in the family Nanoviridae, which includes the genera Babuvirus and Nanovirus. One of the most economically important species in the family Nanoviridae is Banana bunchy top virus (BBTV), the type species of babuvirus. BBTV causes banana bunchy top disease, which is common in banana growing areas such as Southeast Asia, the South Pacific, India and Africa. This virus is transmitted by the banana aphid and causes plant crumpling, shrinking and chlorosis, which may develop into necrosis. +
+

+ missing +
Banana bunchy top, caused by Banana bunchy top virus (BBTV). CC-BY Scott Nelson 2014.
+
+ +

+Viruses in the family Nanoviridae have multipartite genomes consisting of 6 to 8 ~1000 nucleotide segments of circular ssDNA. Five of these DNA components are shared between babuviruses and nanoviruses (DNA-R, -N, -S, -C and -M). Nanoviruses infect dicots, have 8 genomic DNAs and may include three other DNA components with functions that have yet to be determined (DNA-U1, -U2 and -U4). Babuviruses infect monocots, have 6 genomic DNAs and may include another DNA component with an unknown function (DNA-U4). Each of these components encodes a single ORF that is transcribed in one direction, though a second putative ORF has been identified on one segment of BBTV (DNA-R). The virions are non-enveloped, sized 17-20nm in diameter and have one CP (coat protein). Additional DNA segments (alphasatellites) are also associated with many viruses in the family and can alter disease symptoms. +

+ +

+For more information about Nanovirus: +
+ICTV report on nanovirus. +
+ExPASy ViralZone summary of nanovirus +
+Type Species: Subterranean clover stunt virus (NC_003818.1) +

+ +

+For more information about Babuvirus: +
+ICTV report on babuvirus. +
+ExPASy ViralZone summary of babuvirus +
+Type Species: Banana bunchy top virus (NC_003479.1) +

+
+ +
+

Taxonomy

+

+ + +
+ + +
+

+

+
+
+

Contact

+

This site is under construction

+

Please be patient while we tidy up a bit!

+ +
+ +
+

Contributors

+

This site is under construction

+

Please be patient while we tidy up a bit!

+ +
+ +
+

Results

+

Results from Taxonomy prediction

+ +""" + +#Page contents, second part (results fit between body1 and body2) +body2="""



This classifier will return the best fit of the submitted sequence to the training data.
+Currently included in the training data:
+

  • Circoviridae
  • + +
  • Nanoviridae
  • + +
  • Genomoviridae
  • + +
  • Geminiviridae
  • + +
  • Smacoviridae
  • + +
  • Bacilladnaviridae
  • + +

    +

    +
    + + + +""" + +#close the Page +footer=""" +""" + +#build the output page +page=header+body1+results+body2+footer + +#build the output page +page=header+body1+home+aboutus+classifier+circovirus+contact+results1+resultsTable+results2+body2+footer + +#send the output as html +print (page) +quit() + -- cgit v1.2.3