#!/home/erik/bin/python3.6
"""CGI script: predict a taxonomic label for submitted Rep protein sequences.

Reads FASTA text from the ``fasta`` form field, counts amino-acid
occurrences per sequence, scales the counts with a pre-fitted scaler,
classifies them with a pre-trained classifier and prints a results page.

Provenance: recovered from a cgit-generated patch
  commit  c5a53342b9be73b7ff11a44e22cfc39eb7af551c
  author  elavington <27739361+elavington@users.noreply.github.com>
  date    Thu, 31 Aug 2017 13:15:24 -0400
  subject Add files via upload
The same commit added two binary files next to this script:
  SVM_linear_aa_clf.pkl            (187597 bytes)
  UniqRepsGemys_6089_StSCALER.pkl  (980 bytes)

NOTE(review): the copy this file was recovered from had lost the HTML
markup (tags, CSS) inside the page-template strings; only their visible
text survives below. Restore the markup from the original classifier.py
before deploying.
"""

import html

# cgi/cgitb and the scikit-learn pieces are imported inside the functions
# that use them, so the pure parsing/formatting helpers stay importable
# (and testable) on interpreters where those modules are unavailable.

#----------------------------------------------\
#  Parse the web-form information to variables  \
#                                                \_______________________
#                                                                        |

# Sequence classified when the submission is missing or is not FASTA.
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"


def parse_fasta(text):
    """Return ``[(name, sequence), ...]`` parsed from FASTA-formatted text.

    The name is the first space-delimited word of each ``>`` header line;
    the sequence is the concatenation of the lines that follow it. A name
    that occurs more than once keeps its first position and its last
    sequence. Returns an empty list when *text* is empty/None or does not
    start with ``>`` (naive FASTA check).
    """
    if not text:
        return []
    text = text.lstrip()
    if not text.startswith(">"):
        return []

    records = {}
    for chunk in text.split(">"):
        lines = chunk.splitlines()
        if not lines:
            # The text before the first '>' (and any '>>') is an empty
            # chunk; there is no header line to read from it.
            continue
        name = lines[0].split(" ")[0]
        records[name] = "".join(line.strip() for line in lines[1:])
    return list(records.items())


def select_input(text):
    """Choose what to classify for the submitted *text*.

    Returns ``(names, sequences, lengths, used_demo)``. ``lengths`` holds
    the sequence lengths as strings. When *text* yields no FASTA records
    the built-in demo sequence is used and ``used_demo`` is True.
    """
    records = parse_fasta(text)
    if not records:
        return ["demo"], [DEMO_SEQUENCE], [str(len(DEMO_SEQUENCE))], True

    names = [name for name, _ in records]
    sequences = [seq for _, seq in records]
    lengths = [str(len(seq)) for seq in sequences]
    return names, sequences, lengths, False


#----------------------------------------------\
#  predict genus of input sequences             \
#                                                \_______________________
#                                                                        |

#list of amino acids as vocabulary for the CountVectorizer
AAs=['a','c','d','e','f','g','h','i','k','l','m','n','p','q','r','s','t','v','w','y']


def predict_genera(sequences):
    """Return the classifier's prediction for each sequence, in order.

    Loads the pickled classifier and scaler from ``./cgi-bin/`` (relative
    to the working directory), vectorizes each sequence as per-character
    counts over ``AAs``, scales the counts and calls ``predict``.
    """
    # SVC and StandardScaler are retained from the original import list
    # (presumably the types of the pickled estimators).
    from sklearn.svm import SVC  # noqa: F401
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import StandardScaler  # noqa: F401
    from sklearn.externals import joblib

    #load the classifier and scaler
    clf = joblib.load("./cgi-bin/SVM_linear_aa_clf.pkl")
    scaler = joblib.load("./cgi-bin/UniqRepsGemys_6089_StSCALER.pkl")
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1), vocabulary=AAs)

    counts = vectorizer.transform(sequences)
    #Scale the data to the training set
    scaled = scaler.transform(counts.astype("float64"))
    return clf.predict(scaled)


#----------------------------------------------\
#  Build HTML table of results                  \
#                                                \_______________________
#                                                                        |

# NOTE(review): the cell/row markup around the three fields was lost in
# the recovered copy; only the placeholders remain.
ROW_TEMPLATE = """{0}{1}{2}"""

# Appended to the results when the demo sequence had to be used.
ERROR_NOTE = """
There seems to have been an error.
If you are expecting more than one prediction or
 do not see the name you entered please try the submission form again, making sure that the input is in FASTA format."""


def build_result_rows(names, lengths, predictions):
    """Return one formatted row per sequence, concatenated.

    Names come straight from the submitted form and predictions from the
    model, so both are HTML-escaped before being placed in the page.
    """
    return "".join(
        ROW_TEMPLATE.format(html.escape(str(name)), length, html.escape(str(label)))
        for name, length, label in zip(names, lengths, predictions)
    )


#----------------------------------------------\
#  Build output page                            \
#                                                \_______________________
#                                                                        |
#build output page parts
#Header and CSS Style bits
# NOTE(review): only line breaks survive from the original header in the
# recovered copy.
header = "\n" * 7

#Page contents, first part
body1 = """
Welcome to CRESSdna.org
Home
Part of the National Science Foundation's Assembling the Tree of Life.
 Sponsored with a Grant from the National Science Foundation
Taxonomy
Please enter only one word as the name(no space) and only one Rep sequence
Contact
Questions or comments? Send us an email:
email At domain Dot something
Results
Results from Taxonomy prediction
"""

#Page contents, second part (results fit between body1 and body2)
# The list-item glyphs shown in the recovered copy are omitted here
# (presumably they were rendered from list markup, not literal text).
body2 = """
Sequence NameLengthPrediction
This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data:
Circoviridae
Nanoviridae
Genomoviridae
Geminiviridae
Smacovirus
"""

#close the Page
footer = """

"""


def render_page(results):
    """Return the complete output page with *results* placed in the middle.

    Only the static template parts are passed through ``str.format()``, as
    the original did for the whole page; *results* contains user-supplied
    text, and a stray ``{`` or ``}`` in it must not be read as a format
    field.
    """
    return (header + body1).format() + results + (body2 + footer).format()


def main():
    """Handle one CGI request: parse the form, predict, print the page."""
    import cgi
    import cgitb

    cgitb.enable()
    form = cgi.FieldStorage()
    # getfirst() always yields a single value (or the default), even when
    # the field is missing or was submitted more than once.
    names, sequences, lengths, used_demo = select_input(form.getfirst('fasta', ''))

    #make predictions for the submitted (or demo) sequences
    predictions = predict_genera(sequences)

    results = build_result_rows(names, lengths, predictions)
    if used_demo:
        results += ERROR_NOTE

    # NOTE(review): no Content-Type header is printed in the recovered
    # copy; confirm how the server expects the response to start.
    #send the output as html
    print(render_page(results))


if __name__ == "__main__":
    main()