#!/home/erik/bin/python3
"""CGI script that predicts a genus for protein sequences sent by a web form.

The script reads the ``fasta`` form field, splits it into FASTA records,
counts the occurrences of each of the 20 amino-acid letters per sequence,
scales those counts with a previously fitted scaler and feeds them to a
previously fitted classifier.  When the field is missing, is not FASTA
formatted, or contains no records, a built-in demo sequence is classified
instead.

Module-level results consumed by the page-building code further down:
``nameList`` (record names), ``lenList`` (sequence lengths as strings) and
``predictions`` (one classifier output per sequence).
"""
#import packages to be used
# NOTE: the ``cgi`` and ``cgitb`` modules were removed from the standard
# library in Python 3.13; this script needs an older interpreter.
import cgi
import cgitb
import re
import warnings

from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
try:
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn 0.23+ no longer bundles joblib; use the standalone package.
    import joblib

warnings.simplefilter("ignore", UserWarning)#ignore a joblib version warning

#Fallback record used whenever no usable FASTA input was submitted
DEMO_NAME = 'Demo'
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"


def _parse_fasta(text):
    """Split FASTA-formatted *text* into parallel lists of names and sequences.

    Each record is the text following a ``>``: its first line is the record
    name and all remaining lines are concatenated into the sequence.

    Args:
        text: Raw FASTA text as submitted by the form.

    Returns:
        A ``(names, sequences)`` tuple of equally long lists of strings.
    """
    names = []
    sequences = []
    for record in text.split(">"):
        # splitlines() accepts "\r\n" as well as bare "\n" line endings and
        # yields no trailing empty element, so no manual clean-up is needed.
        lines = record.splitlines()
        if not lines:
            # Text before the first ">" or an empty record such as ">>".
            continue
        names.append(lines[0])
        sequences.append("".join(lines[1:]))
    return names, sequences


#----------------------------------------------\
#  Parse the web-form information to variables  \
#                                                \_______________________________________________________
#                                                                                                        |
cgitb.enable(display=1, logdir="/var/www/html/bin/")
form=cgi.FieldStorage()
alignment = form.getvalue('fasta')

#getvalue() returns None when the field is absent; treat anything that is
#not a plain string the same way as non-FASTA input.
fastaText = alignment if isinstance(alignment, str) else ""

if fastaText.startswith(">"): #naive check for FASTA format
    nameList, seqList = _parse_fasta(fastaText)
else:
    nameList, seqList = [], []

if not seqList: #nothing usable was submitted: classify the demo sequence
    nameList = [DEMO_NAME]
    seqList = [DEMO_SEQUENCE]

#Lengths are derived from the sequences actually classified, so the demo
#record reports the demo sequence length rather than the raw input's.
lenList = [str(len(seq)) for seq in seqList]
#--------------------------------------------------------------------------------------------------------+

#----------------------------------------------\
#  predict genus of input sequences             \
#                                                \_______________________________________________________
#                                                                                                        |
#list of amino acids as vocabulary for the CountVectorizer
AAs=['a','c','d','e','f','g','h','i','k','l','m','n','p','q','r','s','t','v','w','y']

#load the classifier and scaler
#NOTE: joblib.load unpickles the files, so they must come from a trusted source.
clf=joblib.load("./clf_11_21_2017.pkl")
StSc=joblib.load("./StSc_11_21_2017.pkl")

#initialize text data vectorizer (single-character counts over AAs)
cv=CountVectorizer(analyzer='char',ngram_range=(1,1),vocabulary=AAs)
dataVect=cv.transform(seqList)

#Scale the data to the training set
X=StSc.transform(dataVect.astype("float64"))

#make predictions for the original dataset
predictions=clf.predict(X)

#----------------------------------------------\
#  Build HTML table of results                  \
#                                                \_______________________________________________________
#
#results="
Entered Text Content Seq Name is {0} length {1}
".format(nameList,predictions) results="" results+="""Sequence Name | Length | Prediction |
---|---|---|
{0} | {1} | {2} |
Welcome to CRESSdna.org
Part of the National Science Foundation's Assembling the Tree of Life.
And has been trained on the following Genera:
This site is under construction
Please be patient while we tidy up a bit!
Results from Taxonomy prediction
""" #Page contents, second part (results fit between body1 and body2) body2="""This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data: