#!/home/erik/bin/python3.6
# CGI script: reads FASTA text from the 'fasta' web-form field, predicts a
# class label for every submitted sequence with a pre-trained SVM, and
# accumulates the result markup in `results`.

# import packages to be used
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
# NOTE(review): `sklearn.externals.joblib` is only shipped by older
# scikit-learn releases; confirm the deployed version still provides it.
from sklearn.externals import joblib
import cgi
import cgitb

# ----------------------------------------------
# Parse the web-form information to variables
# ----------------------------------------------
cgitb.enable()
form = cgi.FieldStorage()

# getvalue() returns None when the field is absent; normalise to "" so the
# format check below cannot raise AttributeError.
alignment = form.getvalue('fasta') or ""

# Sequence used whenever the submission yields nothing usable.
DEMO_SEQ = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"

book = {}      # record name -> concatenated sequence
seqList = []
lenList = []
nameList = []

if alignment.startswith(">"):  # naive check for FASTA format
    for record in alignment.split(">"):
        # Splitting on ">" always yields an empty first chunk (the text
        # before the first header); skip it and any other blank chunk
        # instead of crashing on pop(0).
        if not record.strip():
            continue
        tempList = record.splitlines()
        nameLine = tempList.pop(0)
        # Only the first space-delimited word of the header is the name.
        name = nameLine.split(" ")[0]
        book[name] = "".join(tempList)

    for name, seq in book.items():
        nameList.append(name)
        seqList.append(seq)
        lenList.append(str(len(seq)))

# Fall back to the demo sequence for non-FASTA input or an empty
# sequence list.
if not seqList:
    seqList = [DEMO_SEQ]
    nameList = ['demo']
    # Report the length of the demo sequence itself (the original took the
    # length of the first character of the user's input).
    lenList = [str(len(DEMO_SEQ))]

# ----------------------------------------------
# predict genus of input sequences
# ----------------------------------------------
# list of amino acids as vocabulary for the CountVectorizer
AAs = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
       'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# load the classifier and scaler
clf = joblib.load("./cgi-bin/SVM_linear_aa_clf.pkl")
StSc = joblib.load("./cgi-bin/UniqRepsGemys_6089_StSCALER.pkl")
cv = CountVectorizer(analyzer='char', ngram_range=(1, 1), vocabulary=AAs)

# initialize text data vectorizer
dataVect = cv.transform(seqList)

# Scale the data to the training set
X = StSc.transform(dataVect.astype("float64"))

# make predictions for the submitted sequences
predictions = clf.predict(X)

# ----------------------------------------------
# Build HTML table of results
# ----------------------------------------------
results = ""
# NOTE(review): the markup of this literal appears to have been lost; as
# written, the message below is appended once per sequence and nameList,
# lenList and predictions are never interpolated. Confirm against the
# deployed script before relying on this output.
for k in range(len(seqList)):
    results += """
There seems to have been an error.
If you are expecting more than one prediction or
do not see the name you entered please try the submission form again, making sure that the input is in FASTA format."""
#----------------------------------------------\
# Build output page \
# \_______________________________________________________
# |
#build output page parts
#Header and CSS Style bits
header="""
Welcome to CRESSdna.org
Part of the National Science Foundation's Assembling the Tree of Life.
Please enter only one word as the name(no space) and only one Rep sequence
And has been trained on the following Genera:
Questions or comments? Send us an email:
email At domain Dot something
Results from Taxonomy prediction
Sequence Name | Length | Prediction |
---|
This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data: