From c5a53342b9be73b7ff11a44e22cfc39eb7af551c Mon Sep 17 00:00:00 2001
From: elavington <27739361+elavington@users.noreply.github.com>
Date: Thu, 31 Aug 2017 13:15:24 -0400
Subject: Add files via upload

---
 SVM_linear_aa_clf.pkl           | Bin 0 -> 187597 bytes
 UniqRepsGemys_6089_StSCALER.pkl | Bin 0 -> 980 bytes
 classifier.py                   | 313 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 313 insertions(+)
 create mode 100644 SVM_linear_aa_clf.pkl
 create mode 100644 UniqRepsGemys_6089_StSCALER.pkl
 create mode 100644 classifier.py

diff --git a/SVM_linear_aa_clf.pkl b/SVM_linear_aa_clf.pkl
new file mode 100644
index 0000000..1afce0a
Binary files /dev/null and b/SVM_linear_aa_clf.pkl differ
diff --git a/UniqRepsGemys_6089_StSCALER.pkl b/UniqRepsGemys_6089_StSCALER.pkl
new file mode 100644
index 0000000..3a098bd
Binary files /dev/null and b/UniqRepsGemys_6089_StSCALER.pkl differ
diff --git a/classifier.py b/classifier.py
new file mode 100644
index 0000000..ecf7c15
--- /dev/null
+++ b/classifier.py
@@ -0,0 +1,313 @@
+#!/home/erik/bin/python3.6
+
+#import packages to be used
+from sklearn.svm import SVC
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import StandardScaler
+from sklearn.externals import joblib
+import cgi, cgitb
+
+#----------------------------------------------\
+# Parse the web-form information to variables \
+# \_______________________________________________________
+# |
+#fallback record that is classified whenever the form supplies no usable FASTA text
+DEMO_NAME='demo'
+DEMO_SEQ="MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"
+
+
+def _parse_fasta(text):
+    """Return a dict mapping record name to sequence for FASTA *text*.
+
+    The name is the first space-delimited word of a record's header line and
+    the sequence is the record's remaining lines joined together. When a name
+    occurs more than once, the last sequence seen for it is kept.
+    """
+    book={}
+    for record in text.split(">"):
+        lines=record.splitlines()
+        if not lines: #nothing precedes the first ">", so that piece has no header line
+            continue
+        name=lines[0].split(" ")[0]
+        book[name]="".join(lines[1:])
+    return book
+
+
+cgitb.enable()
+form=cgi.FieldStorage()
+alignment=form.getvalue('fasta')
+
+#getvalue() gives None when the field is absent and a list when it is repeated,
+#so only a single string is handed to the parser
+if isinstance(alignment,str) and alignment.startswith(">"): #naive check for FASTA format
+    book=_parse_fasta(alignment)
+else:
+    book={}
+
+if not book: #missing, non-FASTA or empty submission: fall back to the demo record
+    book={DEMO_NAME:DEMO_SEQ}
+
+#parallel lists, one entry per record; lengths are kept as strings
+nameList=list(book)
+seqList=list(book.values())
+lenList=[str(len(seq)) for seq in seqList]
+
+#--------------------------------------------------------------------------------------------------------+
+
+#----------------------------------------------\
+# predict genus of input sequences \
+# \_______________________________________________________
+# |
+#list of amino acids as vocabulary for the CountVectorizer
+AAs=['a','c','d','e','f','g','h','i','k','l','m','n','p','q','r','s','t','v','w','y']
+
+#load the classifier and scaler
+clf=joblib.load("./cgi-bin/SVM_linear_aa_clf.pkl")
+StSc=joblib.load("./cgi-bin/UniqRepsGemys_6089_StSCALER.pkl")
+cv=CountVectorizer(analyzer='char',ngram_range=(1,1),vocabulary=AAs)
+
+#count the single-character occurrences of each vocabulary entry per sequence
+dataVect=cv.transform(seqList)
+
+#Scale the data to the training set
+X=StSc.transform(dataVect.astype("float64"))
+
+#make predictions for the submitted sequences
+predictions=clf.predict(X)
+
+
+#----------------------------------------------\
+# Build HTML table of results \
+# \_______________________________________________________
+# |
+results=""""""
+#NOTE(review): the markup inside the literal below appears to have been stripped
+#from this copy of the patch (the per-row table template and whatever guarded the
+#error message are not visible) - restore it from the original classifier.py
+for k in range(len(seqList)): #k indexes nameList, lenList and predictions in step
+    results+="""
There seems to have been an error.
If you are expecting more than one prediction or
+ do not see the name you entered please try the submission form again, making sure that the input is in FASTA format."""
+
+#----------------------------------------------\
+# Build output page \
+# \_______________________________________________________
+# |
+#build output page parts
+#Header and CSS Style bits
+header="""
+
+
+
+
Welcome to CRESSdna.org
+ +Part of the National Science Foundation's Assembling the Tree of Life.
+ +Please enter only one word as the name(no space) and only one Rep sequence
+ ++
And has been trained on the following Genera:
+Questions or comments? Send us an email:
+email At domain Dot something
+Results from Taxonomy prediction
+Sequence Name | +Length | +Prediction | +
---|
This classifier will return the best fit of the submitted sequence to the training data.
+Currently included in the training data:
+