#!/home/erik/bin/python3
"""CGI endpoint that predicts a taxonomic label for submitted protein sequences.

The script reads the ``fasta`` field of the submitted web form, parses it as
FASTA text, counts amino-acid letters per sequence, scales the counts with a
stored scaler, asks a stored classifier for one label per sequence, and prints
an HTML page containing a table of the results.
"""
import html
import re  # retained from the original import list
import warnings

# Demo record that is classified when the form carries no usable FASTA input.
DEMO_NAME = "Demo"
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"

# List of amino acids used as the fixed vocabulary for the CountVectorizer.
AMINO_ACIDS = [
    'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
    'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y',
]

# Pickled classifier and scaler, resolved relative to the working directory.
CLASSIFIER_PATH = "./clf_11_21_2017.pkl"
SCALER_PATH = "./StSc_11_21_2017.pkl"

# CGI response header; the blank line terminates the header section.
HEADER = "Content-type:text/html\n\n"

# NOTE(review): the markup below is deliberately minimal. Site styling (CSS)
# and navigation link targets are not recoverable from this script's text --
# confirm against the deployed page template before release.
PAGE_TOP = """<html>
<body>
<h1>Welcome to CRESSdna.org</h1>
<p>Home</p>
<p>Part of the National Science Foundation's Assembling the Tree of Life.</p>
<p>Sponsored with a Grant from the National Science Foundation</p>
<p>Taxonomy</p>
<p>Contact</p>
<p>This site is under construction</p>
<p>Please be patient while we tidy up a bit!</p>
<h2>Results</h2>
<h3>Results from Taxonomy prediction</h3>
"""

PAGE_BOTTOM = """<p>This classifier will return the best fit of the submitted sequence to the training data.<br>
Currently included in the training data:</p>
<ul>
<li>Circoviridae</li>
<li>Nanoviridae</li>
<li>Genomoviridae</li>
<li>Geminiviridae</li>
<li>Smacovirus</li>
</ul>
"""

FOOTER = """</body>
</html>
"""


def parse_fasta(text):
    """Split FASTA text into parallel lists of names, sequences and lengths.

    Args:
        text: Raw value of the ``fasta`` form field. Anything that is not a
            string starting (after leading whitespace) with ``>`` is treated
            as "no usable input".

    Returns:
        A ``(names, sequences, lengths)`` tuple of equally long lists.
        ``names`` holds each record's full header line, ``sequences`` the
        concatenated sequence lines, and ``lengths`` the sequence lengths as
        strings. When no record can be parsed, the single demo record is
        returned instead.
    """
    names = []
    sequences = []
    # Naive check for FASTA format: the text must begin with a '>' header.
    if isinstance(text, str) and text.lstrip().startswith(">"):
        for record in text.split(">"):
            if not record.strip():
                # Skips the empty piece before the first '>' and any
                # header-less record produced by consecutive '>' characters.
                continue
            # splitlines() accepts CRLF (browser form posts) as well as bare
            # LF line endings.
            lines = record.splitlines()
            names.append(lines[0])
            sequences.append("".join(line.strip() for line in lines[1:]))

    if not sequences:
        names = [DEMO_NAME]
        sequences = [DEMO_SEQUENCE]

    lengths = [str(len(sequence)) for sequence in sequences]
    return names, sequences, lengths


def predict_genus(sequences):
    """Return one predicted label per sequence from the stored classifier.

    Args:
        sequences: List of sequence strings.

    Returns:
        The result of ``clf.predict`` on the scaled amino-acid counts, one
        entry per input sequence.
    """
    # Third-party imports are deferred to the call so the pure-Python helpers
    # in this file stay importable without scikit-learn installed.
    # NOTE(review): SVC and StandardScaler are not referenced directly;
    # presumably imported for the pickled objects -- confirm before removing.
    from sklearn.svm import SVC  # noqa: F401
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import StandardScaler  # noqa: F401
    from sklearn.externals import joblib

    warnings.simplefilter("ignore", UserWarning)  # ignore a joblib version warning

    # Load the classifier and scaler.
    clf = joblib.load(CLASSIFIER_PATH)
    scaler = joblib.load(SCALER_PATH)

    # Per-character counts restricted to the amino-acid vocabulary.
    vectorizer = CountVectorizer(
        analyzer='char', ngram_range=(1, 1), vocabulary=AMINO_ACIDS
    )
    counts = vectorizer.transform(sequences)
    # Scale the data to the training set.
    scaled = scaler.transform(counts.astype("float64"))
    return clf.predict(scaled)


def build_results_table(names, lengths, predictions):
    """Render the per-sequence results as an HTML table.

    Args:
        names: Sequence names (user supplied, therefore HTML-escaped here).
        lengths: Sequence lengths.
        predictions: Predicted labels, aligned with ``names``.

    Returns:
        HTML text of a table with a header row and one row per sequence.
    """
    lines = [
        "<table>",
        "<tr><th>Sequence Name</th><th>Length</th><th>Prediction</th></tr>",
    ]
    for name, length, label in zip(names, lengths, predictions):
        lines.append(
            "<tr><td>{0}</td><td>{1}</td><td>{2}</td></tr>".format(
                html.escape(str(name)),
                html.escape(str(length)),
                html.escape(str(label)),
            )
        )
    lines.append("</table>")
    return "\n".join(lines) + "\n"


def build_page(results):
    """Assemble the full CGI response around the ``results`` HTML fragment."""
    # The results fit between the top and bottom parts of the page.
    return HEADER + PAGE_TOP + results + PAGE_BOTTOM + FOOTER


def main():
    """Handle one CGI request: parse the form, predict, print the page."""
    # Imported here rather than at module level: the cgi and cgitb modules
    # were removed from the standard library in Python 3.13, and only the
    # request handler needs them.
    import cgi
    import cgitb

    cgitb.enable(display=1, logdir="/var/www/html/bin/")
    form = cgi.FieldStorage()

    names, sequences, lengths = parse_fasta(form.getvalue('fasta'))
    predictions = predict_genus(sequences)

    # Send the output as html.
    print(build_page(build_results_table(names, lengths, predictions)))


if __name__ == "__main__":
    main()