#!/home/erik/bin/python3.6
"""CGI script: classify submitted FASTA sequences and print a results page.

The script reads the ``fasta`` form field, counts amino-acid characters in
each sequence, scales the counts with a pickled scaler, asks a pickled
classifier for a prediction per sequence, and prints the assembled page.

When the submission is missing, is not FASTA-like (does not start with ``>``)
or contains no records, a built-in demo sequence is scored instead and an
error notice is appended to the results.

NOTE(review): the page templates below contain no HTML tags in this copy of
the file; they are reproduced exactly as found -- confirm against the
deployed page. No ``Content-Type`` header is printed here either; confirm
that the real header template or the server supplies one.
"""
import html

# ----------------------------------------------------------------------
# Constants
# ----------------------------------------------------------------------

# Sequence scored when the submission cannot be used.
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"
DEMO_NAME = 'demo'

# List of amino acids used as the vocabulary for the CountVectorizer.
AMINO_ACIDS = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
               'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# Pickled classifier and scaler, relative to the server's working directory.
CLASSIFIER_PATH = "./cgi-bin/SVM_linear_aa_clf.pkl"
SCALER_PATH = "./cgi-bin/UniqRepsGemys_6089_StSCALER.pkl"

# One result row: name, length, prediction.
ROW_TEMPLATE = """{0}{1}{2}"""

# Appended to the results when the demo sequence had to be used.
ERROR_NOTICE = """

There seems to have been an error.
If you are expecting more than one prediction or do not see the name you entered please try the submission form again, making sure that the input is in FASTA format."""

# Header and CSS style bits.
HEADER = """ """

# Page contents, first part.
BODY1 = """

Welcome to CRESSdna.org

Home

Part of the National Science Foundation's Assembling the Tree of Life.

Sponsored with a Grant from the National Science Foundation

Taxonomy

Please enter only one word as the name(no space) and only one Rep sequence




Contact

Questions or comments? Send us an email:

email At domain Dot something

Results

Results from Taxonomy prediction

"""

# Page contents, second part (results fit between BODY1 and BODY2).
BODY2 = """
Sequence Name Length Prediction

This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data:

  • Circoviridae
  • Nanoviridae
  • Genomoviridae
  • Geminiviridae
  • Smacovirus


  • """

# Closes the page.
FOOTER = """ """


# ----------------------------------------------------------------------
# Parse the web-form information
# ----------------------------------------------------------------------

def parse_fasta(text):
    """Return a ``{name: sequence}`` dict parsed from FASTA-formatted *text*.

    The name is the header line up to its first space; the sequence is the
    remaining lines of the record joined together, with surrounding
    whitespace removed from each line. A later record with the same name
    replaces an earlier one. Anything that is not a string starting with
    ``>`` (a naive FASTA check) yields an empty dict.
    """
    records = {}
    if not isinstance(text, str) or not text.startswith(">"):
        return records
    for chunk in text.split(">"):
        # Splitting on ">" always produces an empty leading chunk; skip it
        # (and any other blank record) instead of failing on it.
        if not chunk.strip():
            continue
        lines = chunk.splitlines()
        name = lines[0].split(" ")[0]
        records[name] = "".join(line.strip() for line in lines[1:])
    return records


def select_sequences(text):
    """Return ``(names, sequences, used_demo)`` for the submitted *text*.

    Falls back to the demo sequence, with ``used_demo`` set to True, when
    *text* yields no FASTA records.
    """
    records = parse_fasta(text)
    if records:
        return list(records), list(records.values()), False
    return [DEMO_NAME], [DEMO_SEQUENCE], True


# ----------------------------------------------------------------------
# Predict for the input sequences
# ----------------------------------------------------------------------

def predict(sequences):
    """Return the classifier's prediction for each sequence in *sequences*.

    scikit-learn is imported here rather than at module level so that the
    parsing and rendering helpers can be imported without it.
    """
    from sklearn.svm import SVC  # noqa: F401  (kept from the original imports)
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import StandardScaler  # noqa: F401  (kept from the original imports)
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases no longer bundle joblib.
        import joblib

    # Load the classifier and scaler.
    classifier = joblib.load(CLASSIFIER_PATH)
    scaler = joblib.load(SCALER_PATH)

    # Count single characters against the fixed amino-acid vocabulary.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1),
                                 vocabulary=AMINO_ACIDS)
    counts = vectorizer.transform(sequences)

    # Scale the data with the stored scaler, then predict.
    scaled = scaler.transform(counts.astype("float64"))
    return classifier.predict(scaled)


# ----------------------------------------------------------------------
# Build the output page
# ----------------------------------------------------------------------

def build_results(names, sequences, predictions, used_demo=False):
    """Return the results fragment: one row per sequence, plus the notice.

    Names and predictions are HTML-escaped because names come straight from
    the submitted form. The error notice is appended when *used_demo* is
    true.
    """
    rows = [
        ROW_TEMPLATE.format(html.escape(str(name)), len(sequence),
                            html.escape(str(prediction)))
        for name, sequence, prediction in zip(names, sequences, predictions)
    ]
    if used_demo:
        rows.append(ERROR_NOTICE)
    return "".join(rows)


def build_page(results):
    """Return the full page with *results* placed between the two body parts.

    Only the static templates go through ``str.format()`` (which turns any
    ``{{``/``}}`` escapes in them into single braces); *results* is inserted
    verbatim so braces in submitted names cannot break the page.
    """
    return (HEADER + BODY1).format() + results + (BODY2 + FOOTER).format()


def main():
    """Handle one CGI request: parse the form, predict, print the page."""
    import cgitb
    # Enabled first so that any later failure is reported in the browser.
    cgitb.enable()
    import cgi

    form = cgi.FieldStorage()
    names, sequences, used_demo = select_sequences(form.getvalue('fasta'))
    predictions = predict(sequences)
    print(build_page(build_results(names, sequences, predictions, used_demo)))


if __name__ == "__main__":
    main()