#!/home/erik/bin/python3.6
"""CGI endpoint that predicts the genus of submitted Rep protein sequences.

The script reads the ``fasta`` field of the submitted web form, counts the
amino acids of every sequence, scales the counts with the stored scaler,
classifies them with the stored linear SVM and prints a results page.

Provenance: reconstructed from commit c5a53342b9be73b7ff11a44e22cfc39eb7af551c
("Add files via upload"), which also adds the two pickle files loaded below.

NOTE(review): the HTML/CSS markup of the page templates was not present in
the reviewed text of this file -- only the text content was.  The template
strings below reproduce that visible text; restore the markup from the
deployed ``classifier.py`` before relying on the rendered page.
NOTE(review): no ``Content-Type`` header is visibly emitted anywhere;
confirm that the real ``HEADER`` markup (or the web server) provides it.
"""

import html

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Amino acids used as the fixed vocabulary of the CountVectorizer.
AAs = ['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l',
       'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'y']

# Pickled model artefacts, relative to the working directory of the server.
CLASSIFIER_PATH = "./cgi-bin/SVM_linear_aa_clf.pkl"
SCALER_PATH = "./cgi-bin/UniqRepsGemys_6089_StSCALER.pkl"

# Fallback record used whenever the submission yields no usable sequence.
DEMO_NAME = "demo"
DEMO_SEQUENCE = "MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"

# ---------------------------------------------------------------------------
# Page templates
# ---------------------------------------------------------------------------

# One result row: {0} = sequence name, {1} = length, {2} = predicted genus.
# NOTE(review): the cell markup around the placeholders was not visible in
# the reviewed source; restore it together with the other templates.
ROW_TEMPLATE = """{0}{1}{2}"""

# Appended to the results when the demo sequence had to be used.
DEMO_NOTICE = """

There seems to have been an error.
If you are expecting more than one prediction or
do not see the name you entered please try the submission form again, making sure that the input is in FASTA format."""

# Header and CSS style bits (no text content was visible for this template).
HEADER = """






"""

# Page contents, first part.
# NOTE(review): the column-header line ("Sequence Name ...") closes this
# template so that result rows follow it.  In the reviewed text it appeared
# after the ``body2`` assignment, presumably an artefact of how the markup
# was stripped -- confirm against the deployed file.
BODY1 = """

Welcome to CRESSdna.org

Home

Part of the National Science Foundation's Assembling the Tree of Life.

Sponsored with a Grant from the National Science Foundation

Taxonomy

Please enter only one word as the name(no space) and only one Rep sequence

This classifier requires Rep protein sequence to be:

Complete
Unaligned
in FASTA format

And has been trained on the following Genera:

Circoviridae

Circovirus
Cyclovirus

Nanoviridae

Babuvirus
Nanovirus

Genomoviridae

Gemycircularvirus
Gemygorvirus
Gemykibivirus
Gemykolovirus
Gemykrogvirus
Gemyvongvirus

Geminiviridae

Becurtovirus
Begomovirus
Capulavirus
Curtovirus
Eragrovirus
Grablovirus
Mastrevirus
Turncurtovirus

Smacovirus

Contact

Questions or comments? Send us an email:

email At domain Dot something

Results

Results from Taxonomy prediction

Sequence Name	Length	Prediction
"""

# Page contents, second part (results fit between BODY1 and BODY2).
BODY2 = """

This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data:

Circoviridae

Circovirus
Cyclovirus

Nanoviridae

Babuvirus
Nanovirus

Genomoviridae

Gemycircularvirus
Gemygorvirus
Gemykibivirus
Gemykolovirus
Gemykrogvirus
Gemyvongvirus

Geminiviridae

Becurtovirus
Begomovirus
Capulavirus
Curtovirus
Eragrovirus
Grablovirus
Mastrevirus
Turncurtovirus

Smacovirus

"""

# Closes the page.
FOOTER = """

"""


# ---------------------------------------------------------------------------
# Parse the web-form information
# ---------------------------------------------------------------------------

def parse_fasta(text):
    """Split FASTA-formatted *text* into ``(name, sequence)`` pairs.

    The name is the first space-delimited word of each ``>`` header line and
    the sequence is the concatenation of the lines that follow it.  When a
    name occurs more than once the last sequence wins, as in a plain dict
    assignment.

    Args:
        text: Raw form input; may be ``None`` or empty.

    Returns:
        A list of ``(name, sequence)`` tuples, empty when *text* is missing
        or does not start with ``>`` (naive FASTA check).
    """
    if not text:
        return []
    text = text.strip()
    if not text.startswith(">"):
        return []

    records = {}
    for chunk in text.split(">"):
        lines = chunk.splitlines()
        if not lines:
            # The piece before the first '>' is always empty; the original
            # popped from it and raised IndexError on every valid input.
            continue
        name = lines[0].split(" ")[0]
        records[name] = "".join(line.strip() for line in lines[1:])
    return list(records.items())


def choose_records(text):
    """Return the records to classify and whether the demo fallback was used.

    Args:
        text: Raw value of the ``fasta`` form field.

    Returns:
        ``(records, used_demo)`` where *records* is a non-empty list of
        ``(name, sequence)`` tuples.  When *text* yields no record the single
        demo record is returned and *used_demo* is ``True``.
    """
    records = parse_fasta(text)
    if records:
        return records, False
    return [(DEMO_NAME, DEMO_SEQUENCE)], True


# ---------------------------------------------------------------------------
# Predict genus of input sequences
# ---------------------------------------------------------------------------

def predict_genera(sequences):
    """Predict one label per sequence with the pickled scaler and SVM.

    Args:
        sequences: List of protein sequence strings.

    Returns:
        Whatever ``clf.predict`` returns for the scaled count matrix (one
        entry per input sequence).
    """
    # Imported here so the pure helpers of this module stay importable (and
    # testable) on machines without scikit-learn.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import StandardScaler  # noqa: F401 (kept from original import list)
    from sklearn.svm import SVC  # noqa: F401 (kept from original import list)
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases no longer bundle joblib.
        import joblib

    # NOTE: joblib.load unpickles; only ever point these paths at trusted files.
    clf = joblib.load(CLASSIFIER_PATH)
    scaler = joblib.load(SCALER_PATH)

    # Single-character counts over the fixed amino-acid vocabulary.
    # CountVectorizer lowercases its input by default, which is why the
    # lowercase vocabulary matches upper-case sequences.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1),
                                 vocabulary=AAs)
    counts = vectorizer.transform(sequences)

    # Scale the data to the training set, then classify.
    scaled = scaler.transform(counts.astype("float64"))
    return clf.predict(scaled)


# ---------------------------------------------------------------------------
# Build output page
# ---------------------------------------------------------------------------

def build_results(records, predictions, used_demo):
    """Render the result rows (plus the error notice for the demo fallback).

    Args:
        records: List of ``(name, sequence)`` tuples.
        predictions: One predicted label per record, in the same order.
        used_demo: ``True`` when *records* is the demo fallback.

    Returns:
        The concatenated rows as a string.  Names and labels are
        HTML-escaped because the names come straight from the web form.
    """
    rows = [
        ROW_TEMPLATE.format(html.escape(name), len(sequence),
                            html.escape(str(label)))
        for (name, sequence), label in zip(records, predictions)
    ]
    if used_demo:
        rows.append(DEMO_NOTICE)
    return "".join(rows)


def render_page(results):
    """Assemble the full page around the already-rendered *results*.

    ``str.format()`` is applied to the static templates only: it is kept in
    case the real markup contains doubled braces (e.g. in CSS), but it must
    not see user-derived text, where a stray ``{`` would raise.
    """
    return (HEADER + BODY1).format() + results + (BODY2 + FOOTER).format()


def main():
    """Handle one CGI request and print the resulting page to stdout."""
    import cgi
    import cgitb

    cgitb.enable()
    form = cgi.FieldStorage()
    # getfirst() always yields a single string, even if the field is missing
    # or was submitted more than once.
    records, used_demo = choose_records(form.getfirst('fasta', ''))
    predictions = predict_genera([sequence for _, sequence in records])
    print(render_page(build_results(records, predictions, used_demo)))


if __name__ == "__main__":
    main()