#!/home/erik/bin/python3
"""CGI endpoint that predicts the genus of submitted Rep protein sequences.

The script reads the ``fasta`` field of the submitted web form, parses it as
FASTA text, counts amino-acid occurrences per sequence, scales the counts with
a previously saved scaler, classifies them with a previously saved classifier
and prints an HTML page containing a table of the predictions.

NOTE(review): the HTML markup below was rebuilt around the visible page text;
the original tags and style rules are not present in the source this was
restored from, so only minimal structural markup is emitted. Confirm against
the deployed page template.
"""
import html
import re
import warnings

# ----------------------------------------------
# Constants
# ----------------------------------------------

# Sequence used whenever the form supplies nothing that parses as FASTA.
DEMO_NAME = "Demo"
DEMO_SEQUENCE = (
    "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"
)

# List of amino acids used as the fixed vocabulary for the CountVectorizer.
AMINO_ACIDS = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
               'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# Saved classifier and scaler, resolved relative to the working directory.
CLASSIFIER_PATH = "./clf_11_21_2017.pkl"
SCALER_PATH = "./StSc_11_21_2017.pkl"

# Taxa listed on the page, in display order: (heading, entries beneath it).
TRAINED_TAXA = (
    ("Circoviridae", ("Circovirus", "Cyclovirus")),
    ("Nanoviridae", ("Babuvirus", "Nanovirus")),
    ("Genomoviridae", ("Gemycircularvirus", "Gemygorvirus", "Gemykibivirus",
                       "Gemykolovirus", "Gemykrogvirus", "Gemyvongvirus")),
    ("Geminiviridae", ("Becurtovirus", "Begomovirus", "Capulavirus",
                       "Curtovirus", "Eragrovirus", "Grablovirus",
                       "Mastrevirus", "Turncurtovirus")),
    ("Smacovirus", ()),
)


def _render_taxa_list(taxa):
    """Return a nested HTML list for ``taxa`` (pairs of heading, entries)."""
    parts = ["<ul>\n"]
    for heading, entries in taxa:
        parts.append("<li>{0}".format(html.escape(heading)))
        if entries:
            parts.append("\n<ul>\n")
            parts.extend(
                "<li>{0}</li>\n".format(html.escape(entry)) for entry in entries
            )
            parts.append("</ul>\n")
        parts.append("</li>\n")
    parts.append("</ul>\n")
    return "".join(parts)


TAXA_LIST_HTML = _render_taxa_list(TRAINED_TAXA)

# ----------------------------------------------
# Output page parts
# ----------------------------------------------

# CGI header line, the blank line that ends the headers, then the document head.
HEADER = """Content-type:text/html

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
</head>
"""

# Page contents, first part.
BODY1 = """<body>
<h1>Welcome to CRESSdna.org</h1>
<h2>Home</h2>
<p>Part of the National Science Foundation's Assembling the Tree of Life.</p>
<p>Sponsored with a Grant from the National Science Foundation</p>
<h2>Taxonomy</h2>
<p>This classifier requires Rep protein sequence to be:</p>
<ul>
<li>Complete</li>
<li>Unaligned</li>
<li>in FASTA format</li>
</ul>
<p>And has been trained on the following Genera:</p>
""" + TAXA_LIST_HTML + """<h2>Contact</h2>
<p>This site is under construction</p>
<p>Please be patient while we tidy up a bit!</p>
<h2>Results</h2>
<p>Results from Taxonomy prediction</p>
"""

# Page contents, second part (results fit between BODY1 and BODY2).
BODY2 = """<p>This classifier will return the best fit of the submitted sequence to the training data.<br>
Currently included in the training data:</p>
""" + TAXA_LIST_HTML

# Close the page.
FOOTER = """</body>
</html>
"""


# ----------------------------------------------
# Parse the web-form information
# ----------------------------------------------
def parse_fasta(text):
    """Split FASTA text into parallel lists of record names and sequences.

    Args:
        text: Raw form value. ``None``, an empty string, or text that does not
            start with ``>`` is treated as "no usable input".

    Returns:
        A ``(names, sequences)`` tuple of equally long lists. When no record
        can be extracted, the single demo record is returned instead.
    """
    names = []
    sequences = []
    # Naive check for FASTA format: the text must open with a header marker.
    if text and text.startswith(">"):
        for record in text.split(">"):
            # splitlines() accepts "\r\n", "\n" and "\r" alike, so input from
            # any platform is split into header and sequence lines.
            lines = record.splitlines()
            if not any(line.strip() for line in lines):
                # Nothing between two markers (or before the first one);
                # there is neither a name nor a sequence to report.
                continue
            names.append(lines[0].strip())
            sequences.append("".join(line.strip() for line in lines[1:]))
    if not sequences:
        return [DEMO_NAME], [DEMO_SEQUENCE]
    return names, sequences


# ----------------------------------------------
# Predict genus of input sequences
# ----------------------------------------------
def predict_genera(sequences):
    """Return the classifier's prediction for every sequence in ``sequences``.

    The scikit-learn imports live here, not at module level, so that
    ``main`` can enable ``cgitb`` before they run and so that the parsing and
    rendering helpers can be imported without the model stack.

    Args:
        sequences: List of amino-acid strings.

    Returns:
        Whatever ``clf.predict`` returns for the scaled count matrix, one
        entry per input sequence.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    # Kept from the original import list; presumably the saved estimators are
    # instances of these classes -- TODO confirm before removing.
    from sklearn.preprocessing import StandardScaler  # noqa: F401
    from sklearn.svm import SVC  # noqa: F401
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases no longer bundle joblib under
        # sklearn.externals; the standalone package is used in that case.
        import joblib

    # Load the classifier and scaler.
    clf = joblib.load(CLASSIFIER_PATH)
    scaler = joblib.load(SCALER_PATH)
    # Initialize the text data vectorizer: single-character counts over the
    # fixed amino-acid vocabulary.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1),
                                 vocabulary=AMINO_ACIDS)
    counts = vectorizer.transform(sequences)
    # Scale the data to the training set.
    scaled = scaler.transform(counts.astype("float64"))
    return clf.predict(scaled)


# ----------------------------------------------
# Build HTML table of results
# ----------------------------------------------
def build_results_table(names, sequences, predictions):
    """Render one table row per sequence: name, length and prediction.

    Every cell is HTML-escaped because the names come straight from the
    submitted form. The length is always computed from the sequence that is
    actually classified, including the demo fallback.

    Args:
        names: Record names, parallel to ``sequences``.
        sequences: Sequence strings.
        predictions: Predicted labels, parallel to ``sequences``.

    Returns:
        The complete ``<table>`` element as a string.
    """
    rows = [
        "<tr><td>{0}</td><td>{1}</td><td>{2}</td></tr>\n".format(
            html.escape(str(name)),
            len(sequence),
            html.escape(str(prediction)),
        )
        for name, sequence, prediction in zip(names, sequences, predictions)
    ]
    return (
        "<table>\n"
        "<tr><th>Sequence Name</th><th>Length</th><th>Prediction</th></tr>\n"
        + "".join(rows)
        + "</table>\n"
    )


# ----------------------------------------------
# Build output page
# ----------------------------------------------
def main():
    """Handle one CGI request and print the result page to stdout."""
    # Imported here so the helpers above stay importable on interpreters
    # that do not provide these modules.
    import cgi
    import cgitb

    # Enabled first so that later failures (imports, model loading) are
    # reported. NOTE(review): display=1 sends tracebacks to the browser.
    cgitb.enable(display=1, logdir="/var/www/html/bin/")
    # Ignore a joblib version warning.
    warnings.simplefilter("ignore", UserWarning)

    form = cgi.FieldStorage()
    # getfirst() yields a single value even if the field is repeated and
    # falls back to "" when the field is absent.
    alignment = form.getfirst('fasta', '')

    names, sequences = parse_fasta(alignment)
    predictions = predict_genera(sequences)
    results = build_results_table(names, sequences, predictions)

    # Send the output as html.
    print(HEADER + BODY1 + results + BODY2 + FOOTER)


if __name__ == "__main__":
    main()