#!/home/erik/bin/python3.6
"""CGI front end that predicts a genus for submitted Rep protein sequences.

The script reads the ``fasta`` field of the submitted web form, splits it into
named sequences, counts the amino-acid composition of every sequence, scales
those counts with a pre-fitted scaler, asks a pre-trained classifier for one
prediction per sequence and prints a result page.

When the submission is not usable FASTA a built-in demo sequence is classified
instead and the page shows an error message rather than a results table.
"""
import warnings
from html import escape
from typing import Dict, List, Sequence, Tuple

warnings.simplefilter("ignore", UserWarning)

# List of amino acids used as the fixed vocabulary of the CountVectorizer.
AAs = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
       'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# Files the classifier and the scaler are loaded from (relative to the cwd).
CLASSIFIER_PATH = "SVM_linear_aa_clf.pkl"
SCALER_PATH = "UniqRepsGemys_6089_StSCALER.pkl"

# Fallback record classified when the form did not contain usable FASTA.
DEMO_NAME = 'demo'
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"

# Shown instead of the results table whenever the demo fallback was used.
ERROR_MESSAGE = """

There seems to have been an error.
If you are expecting more than one prediction or do not see the name you entered please try the submission form again, making sure that the input is in FASTA format.
"""

# Pieces of the results table: one header row, then one row per sequence.
TABLE_OPEN = (
    "<table>\n"
    "<tr><th>Sequence Name</th><th>Length</th><th>Prediction</th></tr>\n"
)
TABLE_ROW = "<tr><td>{0}</td><td>{1}</td><td>{2}</td></tr>\n"
TABLE_CLOSE = "</table>\n"

# Header and CSS style bits.
# NOTE(review): no "Content-Type" header is visible anywhere in this script;
# confirm that the web server (or this header string) supplies one.
HEADER = """ """

# Page contents, first part.
# NOTE(review): the page is printed without being passed through str.format,
# so the "{0} {1} {2}" line below is emitted literally - confirm it is wanted.
BODY1 = """

Sequence Name	Length	Prediction
{0}	{1}	{2}

Welcome to CRESSdna.org

Home

Part of the National Science Foundation's Assembling the Tree of Life.

Sponsored with a Grant from the National Science Foundation

Taxonomy

This classifier requires Rep protein sequence to be:

Complete
Unaligned
in FASTA format

And has been trained on the following Genera:

Circoviridae

Circovirus
Cyclovirus

Nanoviridae

Babuvirus
Nanovirus

Genomoviridae

Gemycircularvirus
Gemygorvirus
Gemykibivirus
Gemykolovirus
Gemykrogvirus
Gemyvongvirus

Geminiviridae

Becurtovirus
Begomovirus
Capulavirus
Curtovirus
Eragrovirus
Grablovirus
Mastrevirus
Turncurtovirus

Smacovirus

Contact

Questions or comments? Send us an email:

email At domain Dot something

Results

Results from Taxonomy prediction

"""

# Page contents, second part (results fit between BODY1 and BODY2).
BODY2 = """

This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data:

Circoviridae

Circovirus
Cyclovirus

Nanoviridae

Babuvirus
Nanovirus

Genomoviridae

Gemycircularvirus
Gemygorvirus
Gemykibivirus
Gemykolovirus
Gemykrogvirus
Gemyvongvirus

Geminiviridae

Becurtovirus
Begomovirus
Capulavirus
Curtovirus
Eragrovirus
Grablovirus
Mastrevirus
Turncurtovirus

Smacovirus

"""

# Closes the page.
FOOTER = """ """


def parse_fasta(text: str) -> Dict[str, str]:
    """Split FASTA-formatted *text* into a ``{name: sequence}`` mapping.

    The name is the first space-delimited word of each header line and the
    sequence is the concatenation of every following line of that record.
    When a name occurs more than once the last record wins.
    """
    records = {}
    for chunk in text.split(">"):
        lines = chunk.splitlines()
        # The text before the first ">" (normally empty) carries no record;
        # popping a header from it is what used to raise IndexError.
        if not lines:
            continue
        name = lines[0].split(" ")[0]
        records[name] = "".join(lines[1:])
    return records


def collect_sequences(
        alignment: str) -> Tuple[List[str], List[str], List[str], bool]:
    """Turn the raw form value into parallel name/sequence/length lists.

    Returns ``(names, sequences, lengths, used_demo)``. ``lengths`` holds the
    sequence lengths as strings. ``used_demo`` is True when *alignment* did
    not yield any record and the built-in demo sequence was substituted.
    """
    # Tolerate whitespace pasted in front of the first header line.
    alignment = alignment.strip()
    # Naive check for FASTA format: the text has to start with ">".
    records = parse_fasta(alignment) if alignment.startswith(">") else {}
    if not records:
        return ([DEMO_NAME], [DEMO_SEQUENCE],
                [str(len(DEMO_SEQUENCE))], True)
    names = list(records)
    sequences = [records[name] for name in names]
    lengths = [str(len(sequence)) for sequence in sequences]
    return names, sequences, lengths, False


def predict_genera(sequences: Sequence[str]):
    """Return one classifier prediction per entry of *sequences*.

    Each sequence is converted to single-character counts over ``AAs``,
    scaled with the stored scaler and passed to the stored classifier.
    """
    # Imported here so the parsing/rendering helpers stay importable on a
    # machine without scikit-learn.
    # NOTE(review): ``sklearn.externals.joblib`` is not shipped by newer
    # scikit-learn releases; the pickles need a matching installation.
    from sklearn.svm import SVC  # noqa: F401  (import kept from the original)
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import StandardScaler  # noqa: F401
    from sklearn.externals import joblib

    # Load the classifier and scaler.
    clf = joblib.load(CLASSIFIER_PATH)
    scaler = joblib.load(SCALER_PATH)
    # Initialize the text data vectorizer with the fixed vocabulary.
    vectorizer = CountVectorizer(
        analyzer='char', ngram_range=(1, 1), vocabulary=AAs)
    counts = vectorizer.transform(sequences)
    # Scale the data to the training set.
    scaled = scaler.transform(counts.astype("float64"))
    return clf.predict(scaled)


def build_results_html(names: Sequence[str], lengths: Sequence[str],
                       predictions: Sequence[object],
                       used_demo: bool) -> str:
    """Render the results section of the page.

    Returns the error message when *used_demo* is true, otherwise an HTML
    table with one row per sequence. Every cell is HTML-escaped because the
    names come straight from the submitted form.
    """
    if used_demo:
        return ERROR_MESSAGE
    rows = [
        TABLE_ROW.format(escape(str(name)), escape(str(length)),
                         escape(str(prediction)))
        for name, length, prediction in zip(names, lengths, predictions)
    ]
    return TABLE_OPEN + "".join(rows) + TABLE_CLOSE


def main() -> None:
    """Handle one CGI request: parse the form, predict, print the page."""
    # Imported here because only a real CGI run needs these modules.
    import cgi
    import cgitb

    cgitb.enable()
    form = cgi.FieldStorage()
    # A missing field becomes the string "None", which fails the FASTA
    # check and therefore selects the demo fallback.
    alignment = str(form.getvalue('fasta'))

    names, sequences, lengths, used_demo = collect_sequences(alignment)
    predictions = predict_genera(sequences)
    results = build_results_html(names, lengths, predictions, used_demo)

    # Build the output page and send it.
    page = HEADER + BODY1 + results + BODY2 + FOOTER
    print(page)


if __name__ == "__main__":
    main()