From 06d4e734d88e04cff9069099ba47cfa9c506923e Mon Sep 17 00:00:00 2001 From: elavington Date: Thu, 12 Oct 2017 11:25:07 -0400 Subject: Functional classifier.py --- bin/classifier.py | 180 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 92 insertions(+), 88 deletions(-) (limited to 'bin') diff --git a/bin/classifier.py b/bin/classifier.py index e3150ee..2766f08 100755 --- a/bin/classifier.py +++ b/bin/classifier.py @@ -1,47 +1,52 @@ -#!/home/erik/bin/python3.6 +#!/home/erik/bin/python3 #import packages to be used +import cgi, cgitb +import warnings from sklearn.svm import SVC from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import StandardScaler from sklearn.externals import joblib -import cgi, cgitb -import warnings - -warnings.simplefilter("ignore", UserWarning) +import re +warnings.simplefilter("ignore", UserWarning)#ignore a joblib version warning #----------------------------------------------\ # Parse the web-form information to variables \ # \_______________________________________________________ # | -cgitb.enable() +cgitb.enable(display=1, logdir="/var/www/html/bin/") form=cgi.FieldStorage() -alignment = str(form.getvalue('fasta')) +alignment = form.getvalue('fasta') if alignment.startswith(">"): #naive check for FASTA format list=alignment.split(">") - book={} - for a in list: - tempList=a.splitlines() - nameLine=tempList.pop(0) - name=nameLine.split(" ")[0] - seq="".join(tempList) - book[name]=seq + if list[0] == "": + list.pop(0)#get rid of the leading empty string + seqList=[] lenList=[] nameList=[] - for i in book: - nameList.append(i) - seqList.append(book[i]) - lenList.append(str(len(book[i]))) - + + for a in list: + tempList=a.split("\r\n") + if tempList[-1]=="": + tempList.pop(-1)#get rid of the trailing empty string + + tempSeq="" + nameList.append(tempList[0]) + for element in tempList[1:]: + tempSeq+=element + + seqList.append(tempSeq) + lenList.append(str(len(tempSeq))) + if len(seqList)==0: #check for empty sequence list seqList = ["MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"] - nameList=['demo'] + nameList=['Demo'] lenList=[str(len(alignment[0]))] - + else: seqList = ["MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHIEKAKGTDQQNKEYCSKEGHILIECGAPRNQGKRSDLSTAYFDYQQSGPPGMVLLNCCPSCRSSLSEDYYFAILEDCWRTINGGTRRPI"] - nameList=['demo'] + nameList=['Demo'] lenList=[str(len(alignment[0]))] #--------------------------------------------------------------------------------------------------------+ @@ -54,41 +59,44 @@ else: AAs=['a','c','d','e','f','g','h','i','k','l','m','n','p','q','r','s','t','v','w','y'] #load the classifier and scaler -clf=joblib.load("SVM_linear_aa_clf.pkl") -StSc=joblib.load("UniqRepsGemys_6089_StSCALER.pkl") +clf=joblib.load("./SVM_linear_aa_clf.pkl") + +StSc=joblib.load("./UniqRepsGemys_6089_StSCALER.pkl") + cv=CountVectorizer(analyzer='char',ngram_range=(1,1),vocabulary=AAs) #initialize text data vectorizer dataVect=cv.transform(seqList) - + #Scale the data to the training set X=StSc.transform(dataVect.astype("float64")) #make predictions for the original dataset predictions=clf.predict(X) - + #----------------------------------------------\ # Build HTML table of results \ # \_______________________________________________________ -# | -results="""""" -if "demo" in nameList: - results+="""

There seems to have been an error.
If you are expecting more than one prediction or do not see the name you entered please try the submission form again, making sure that the input is in FASTA format.
""" -else: - results+=""" - - - - - - - """ - for k in len(seqList): - results+="""""".format(nameList[k],lenList[k],predictions[k]) - results+=""" -
Sequence NameLengthPrediction
{0}{1}{2}
- """ +# +#results="

Entered Text Content Seq Name is {0} length {1}

".format(nameList,predictions) +results="" +results+=""" + + + + + + + +""" + + +for k in range(len(nameList)): + results+="".format(nameList[k],lenList[k],predictions[k]) + +results+="
Sequence NameLengthPrediction
{0}{1}{2}
" + #----------------------------------------------\ # Build output page \ @@ -96,7 +104,8 @@ else: # | #build output page parts #Header and CSS Style bits -header=""" + +header="""Content-type:text/html @@ -105,66 +114,64 @@ header=""" body {font-family: "Lato", sans-serif;} /* Style the tab */ div.tab { - float: left; - border: 1px solid #ccc; - background-color: #f1f1f1; - width: 20%; - height: 250px; + float: left; + border: 1px solid #ccc; + background-color: #f1f1f1; + width: 20%; + height: 250px; } /* Style the buttons inside the tab */ div.tab button { - display: block; - background-color: inherit; - color: black; - padding: 22px 16px; - width: 100%; - border: none; - outline: none; - text-align: left; - cursor: pointer; - transition: 0.3s; - font-size: 17px; + display: block; + background-color: inherit; + color: black; + padding: 22px 16px; + width: 100%; + border: none; + outline: none; + text-align: left; + cursor: pointer; + transition: 0.3s; + font-size: 17px; } /* Change background color of buttons on hover */ div.tab button:hover { - background-color: #ddd; + background-color: #ddd; } /* Create an active/current "tab button" class */ div.tab button.active { - background-color: #1acefc; + background-color: #1acefc; } /* Style the tab content */ .tabcontent { - float: left; - padding: 0px 12px; - border: 1px solid #ccc; - width: 80%; - min-height: 250px; + float: left; + padding: 0px 12px; + border: 1px solid #ccc; + width: 80%; + min-height: 250px; } table { - border-collapse: collapse; - width: 80%; + border-collapse: collapse; + width: 80%; } th, td { - text-align: left; - padding: 8px; + text-align: left; + padding: 8px; } tr:nth-child(even){background-color: #f2f2f2} th { - background-color: #ff0000; - color: white; + background-color: #ff0000; + color: white; } - -""" +""" #Page contents, first part -body1=""" - +body1="""

Welcome to CRESSdna.org

@@ -173,7 +180,7 @@ body1=""" - +

Home

@@ -236,8 +243,9 @@ MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHI

Contact

-

Questions or comments? Send us an email:

-

email At domain Dot something

+

This site is under construction

+

Please be patient while we tidy up a bit!

+
@@ -247,8 +255,7 @@ MPSKKSGPQPHKRWVFTLNNPSEEEKNKIRELPISLFDYFVCGEEGLEEGRTAHLQGFANFAKKQTFNKVKWYFGARCHI """ #Page contents, second part (results fit between body1 and body2) -body2=""" -

This classifier will return the best fit of the submitted sequence to the training data.
+body2="""

This classifier will return the best fit of the submitted sequence to the training data.
Currently included in the training data:

  • Circoviridae