#!/home/erik/bin/python3.6

# import packages to be used
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
import cgi
import cgitb
import warnings

warnings.simplefilter("ignore", UserWarning)

# Placeholder sequence used when the submission is missing, empty, or not in
# FASTA format. Its record name ('demo') marks the fallback case.
DEMO_SEQ = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"


def _parse_fasta(text):
    """Return a dict mapping record name -> sequence for FASTA *text*.

    The record name is the header line up to the first space. All remaining
    lines of the record are stripped of surrounding whitespace and joined
    into one sequence string. Records sharing a name overwrite each other
    (last one wins).
    """
    records = {}
    for chunk in text.split(">"):
        lines = chunk.splitlines()
        if not lines:
            # split(">") yields an empty chunk before the first ">";
            # popping its (non-existent) header line used to raise IndexError.
            continue
        name = lines[0].split(" ")[0]
        records[name] = "".join(line.strip() for line in lines[1:])
    return records


#----------------------------------------------\
#  Parse the web-form information to variables  \
#_______________________________________________\

cgitb.enable()
form = cgi.FieldStorage()
# Strip surrounding whitespace so a pasted blank line or leading space does
# not defeat the FASTA check below.
alignment = str(form.getvalue('fasta')).strip()

# naive check for FASTA format
book = _parse_fasta(alignment) if alignment.startswith(">") else {}

nameList = list(book)
seqList = [book[name] for name in nameList]
lenList = [str(len(seq)) for seq in seqList]

if not seqList:
    # Nothing usable was submitted: fall back to the demo record.
    seqList = [DEMO_SEQ]
    nameList = ['demo']
    # Length of the demo sequence itself (previously len(alignment[0]),
    # i.e. the length of a single character).
    lenList = [str(len(DEMO_SEQ))]

#----------------------------------------------\
#  predict genus of input sequences             \
#_______________________________________________\

# list of amino acids as vocabulary for the CountVectorizer
AAs = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
       'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# load the classifier and scaler
# NOTE(review): joblib.load unpickles these files; they must stay trusted,
# server-side artifacts and never come from user input.
clf = joblib.load("SVM_linear_aa_clf.pkl")
StSc = joblib.load("UniqRepsGemys_6089_StSCALER.pkl")
# Vectorize the sequences: count single characters, restricted to the
# amino-acid vocabulary in AAs.
cv = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 1),
    vocabulary=AAs,
)
dataVect = cv.transform(seqList)

# Scale the counts with the previously loaded scaler (StSc), after casting
# them to float64.
X = StSc.transform(dataVect.astype("float64"))

# Predict a label for every submitted sequence.
predictions = clf.predict(X)

#----------------------------------------------\
#  Build HTML table of results                  \
#_______________________________________________\

results = ""
if "demo" in nameList:
    # The demo placeholder is present, so show the error notice.
    results += """
There seems to have been an error.
If you are expecting more than one prediction or do not see the name you entered please try the submission form again, making sure that the input is in FASTA format.
"""
else:
results+="""
Sequence Name | Length | Prediction |
---|---|---|
{0} | {1} | {2} |
Welcome to CRESSdna.org
And has been trained on the following Genera:
Questions or comments? Send us an email:
email At domain Dot something
Results from Taxonomy prediction
""" #Page contents, second part (results fit between body1 and body2) body2="""This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data: