diff --git a/build_models.sh b/build_models.sh index 38cbd15..1853f5c 100644 --- a/build_models.sh +++ b/build_models.sh @@ -1,43 +1,46 @@ #!/bin/bash #Steps -#1 Download data from: +#1 Download data from: see GitHub README to download the plncpro_data.zip file #2 Setup a blastdb and replace ~/uniprotdb/prot.db with your blastdb #3 execute the required commands to build prediction models #NOTE: make sure all relative paths to files and data are correct +# Monocot plncpro build -p plncpro_data/plant_new_fasta/monocot/train/monocot_pct_train.fa -n plncpro_data/plant_new_fasta/monocot/train/monocot_lnct_train.fa -o monocot_model -m monocot.model -d ~/uniprotdb/prot.db -t 6 +# Amborella trichopoda plncpro build -p plncpro_data/plant_new_fasta/amt/train/amt_pct_train.fa -n plncpro_data/plant_new_fasta/amt/train/amt_lnct_train.fa -o amt_model -m amt.model -d ~/uniprotdb/prot.db -t 6 +# Arabidopsis thaliana plncpro build -p plncpro_data/plant_new_fasta/at/train/at_pct_train.fa -n plncpro_data/plant_new_fasta/at/train/at_lnct_train.fa -o at_model -m at.model -d ~/uniprotdb/prot.db -t 6 +# Chlamydomonas reinhardtii plncpro build -p plncpro_data/plant_new_fasta/cr/train/cr_pct_train.fa -n plncpro_data/plant_new_fasta/cr/train/cr_lnct_train.fa -o cr_model -m cr.model -d ~/uniprotdb/prot.db -t 6 - +# Glycine max plncpro build -p plncpro_data/plant_new_fasta/gm/train/gm_pct_train.fa -n plncpro_data/plant_new_fasta/gm/train/gm_lnct_train.fa -o gm_model -m gm.model -d ~/uniprotdb/prot.db -t 6 - +# Oryza sativa plncpro build -p plncpro_data/plant_new_fasta/os/train/os_pct_train.fa -n plncpro_data/plant_new_fasta/os/train/os_lnct_train.fa -o os_model -m os.model -d ~/uniprotdb/prot.db -t 6 - +# Physcomitrella patens plncpro build -p plncpro_data/plant_new_fasta/pp/train/pp_pct_train.fa -n plncpro_data/plant_new_fasta/pp/train/pp_lnct_train.fa -o pp_model -m pp.model -d ~/uniprotdb/prot.db -t 6 - +# Selaginella moellendorffii plncpro build -p 
plncpro_data/plant_new_fasta/sm/train/sm_pct_train.fa -n plncpro_data/plant_new_fasta/sm/train/sm_lnct_train.fa -o sm_model -m sm.model -d ~/uniprotdb/prot.db -t 6 - +# Solanum tuberosum plncpro build -p plncpro_data/plant_new_fasta/st/train/st_pct_train.fa -n plncpro_data/plant_new_fasta/st/train/st_lnct_train.fa -o st_model -m st.model -d ~/uniprotdb/prot.db -t 6 - - +# Vitis vinifera plncpro build -p plncpro_data/plant_new_fasta/vv/train/vv_pct_train.fa -n plncpro_data/plant_new_fasta/vv/train/vv_lnct_train.fa -o vv_model -m vv.model -d ~/uniprotdb/prot.db -t 6 - +# Zea mays plncpro build -p plncpro_data/plant_new_fasta/zm/train/zm_pct_train.fa -n plncpro_data/plant_new_fasta/zm/train/zm_lnct_train.fa -o zm_model -m zm.model -d ~/uniprotdb/prot.db -t 6 - +# Homo sapiens plncpro build -p plncpro_data/hg24/train/hg24_pct_train_5000.fa -n plncpro_data/hg24/train/hg24_lnct_train_5000.fa -o hg_model -m hg.model -d ~/uniprotdb/prot.db -t 6 - +# Mus musculus plncpro build -p plncpro_data/mm8/train/m8_pct_train_2500.fa -n plncpro_data/mm8/train/m8_lnct_train_2500.fa -o mm_model -m mm.model -d ~/uniprotdb/prot.db -t 6 diff --git a/plncpro/bin/blastparse_mt3.py b/plncpro/bin/blastparse_mt3.py index f20f01e..1221035 100644 --- a/plncpro/bin/blastparse_mt3.py +++ b/plncpro/bin/blastparse_mt3.py @@ -15,7 +15,8 @@ import sys import math import re -import threading +#import threading +import multiprocess as mp from collections import OrderedDict import queue import urllib.request, urllib.error, urllib.parse @@ -66,7 +67,8 @@ def getentropy(a,b,c): #print -1*((pa*getlog2(pa))+(pb*getlog2(pb))+(pc*getlog2(pc))) return -1*((pa*getlog2(pa))+(pb*getlog2(pb))+(pc*getlog2(pc))) -class BlastParse(threading.Thread): +#$class BlastParse(threading.Thread): +class BlastParse(mp.Process): def __init__(self,data,ids): super(BlastParse, self).__init__() self.data=data @@ -91,7 +93,7 @@ def run(self): qframe=int(line.split('\t')[8]) #print 'q qframe' #print q,str(qframe) - + ctr=ctr+1 
hitscore=hitscore+(-1*getlog10(evalue)) bitscore=bitscore+float(line.split('\t')[7]) @@ -101,14 +103,14 @@ def run(self): frame2=frame2+1 elif(qframe==3): frame3=frame3+1 - + #print q,':',str(ctr) #self.reslist.append(str(q)+'~'+str(ctr)) #hitscore=hitscore/ctr #self.reslist.append([str(q),ctr]) - #print q,str(frame1),str(frame2),str(frame3),str(qcov) - self.reslist.append([str(q),ctr,hitscore,frame1,frame2,frame3,bitscore]) - + #print q,str(frame1),str(frame2),str(frame3),str(qcov) + self.reslist.append([str(q),ctr,hitscore,frame1,frame2,frame3,bitscore]) + def joinresults(l): #print l @@ -117,9 +119,9 @@ def joinresults(l): for y in x: #print y[0] all_qids.append(y[0]) - + #remove duplicates - all_qids=list(OrderedDict.fromkeys(all_qids)) + all_qids=list(OrderedDict.fromkeys(all_qids)) #print all_qids numhits=[] hit_scores=[] @@ -139,7 +141,7 @@ def joinresults(l): for x in l: foundflag=0 for y in x: - + if q==y[0]: #print 'here' #print y[1] @@ -175,7 +177,9 @@ def joinresults(l): ##divide the blast file (content) and do multithreading #print len(content) #N=len(content)/500 -N=10 +N=1 +if len(sys.argv) > 2: + N=max(1,int(sys.argv[2])) c=split_list(content,N ) #extract queryids for each sublist i.e. 
c[i] qids=[] @@ -186,7 +190,7 @@ def joinresults(l): #remove duplicates q=list(OrderedDict.fromkeys(q)) qids.append(q) - + #print 'final qids' #print qids @@ -199,6 +203,7 @@ def joinresults(l): # (i) does not make a sequence, so (i,) #t = threading.Thread(target=countnumhits(c[i],qids[i]), args=(i,)) t=BlastParse(c[i],qids[i]) + #t=mp.Process(target=BlastParse,args=[c[i],qids[i]]) #print t # Sticks the thread in a list so that it remains accessible thread_list.append(t) diff --git a/plncpro/build.py b/plncpro/build.py index aadd670..bea0ba9 100644 --- a/plncpro/build.py +++ b/plncpro/build.py @@ -20,8 +20,8 @@ from Bio import SeqIO def printhelp(): - - + + @@ -30,34 +30,36 @@ def printhelp(): print("This script generates classification model from codin and non coding transcripts") print("Arguments:") print("-h print this message") + print("-f,--force overwrite output, by default if the file exists") print("-p,--pos path to file containing protein coding examples") print("-n,--neg path to file containing non coding examples") print("-m,--model output model name") - print("-o,--outdir output directory name to store all results") + print("-o,--outdir output directory name to store all results") print("-d path to blast database") print(" OPTIONAL") print("-t number of threads[default: 4]") print("-k number of trees[default: 1000]") print("-r clean up intermediate files") - print("-v show more messages") + print("-v show more messages") print("--min_len specifiy min_length to filter input files") print("--noblast Don't use blast features") print("--no_ff Don't use framefinder features") print("--qcov_hsp specify qcov parameter for blast[default:30]") print("--pos_blastres path to blast output for positive input file") print("--neg_blastres path to blast output for negative input file") - + def main(args = sys.argv,home=None): - + ###################################### ############Define variables############## pos_flag=False neg_flag=False db_flag=False + 
force_flag=False model_flag=False removefiles_flag=False noblast_flag=False @@ -86,7 +88,7 @@ def main(args = sys.argv,home=None): path_sep=os.pathsep ############################Set input options###################### try: - opts, args = getopt.getopt(sys.argv[2:],"ht:n:rm:p:o:d:k:v",["pos=","neg=","noblast","no_ff","qcov_hsp=","threads=","db=","outdir=","model=","num_trees=","remove_temp","pos_blastres=","neg_blastres=","min_len="]) + opts, args = getopt.getopt(sys.argv[2:],"hft:n:rm:p:o:d:k:v",["pos=","neg=","noblast","no_ff","qcov_hsp=","threads=","db=","outdir=","model=","num_trees=","remove_temp","pos_blastres=","neg_blastres=","min_len=","force"]) except getopt.GetoptError: printhelp() sys.exit(2) @@ -95,12 +97,14 @@ def main(args = sys.argv,home=None): if opt == '-h': printhelp() sys.exit() - + elif opt in ("-p", "--pos"): #print 'pos found' pos_file=os.path.abspath(arg) #print (pos_file) #print arg + elif opt in ("-f", "--force"): + force_flag=True elif opt in ("-n", "--neg"): #print 'neg found' neg_file=os.path.abspath(arg) @@ -179,12 +183,12 @@ def main(args = sys.argv,home=None): if (os.path.isfile(model_file) ): print(('Error... 
model file already exists: '+model_file+'\nExiting...')) sys.exit(0) - + #check pos,neg exists if not (os.path.isfile(pos_file) ): print(('Please check pos file...Error file:'+pos_file+ ' doesn\'t exist')) sys.exit(0) - + if not (os.path.isfile(neg_file) ): print(('Please check neg file...Error file:'+neg_file+' doesn\'t exist')) sys.exit(0) @@ -195,14 +199,14 @@ def main(args = sys.argv,home=None): if db_flag==False: print('Please specify blast database...Error') sys.exit(0) - + ##check for blastres files if pos_blastres_flag==True: if not (os.path.isfile(pos_blastres_file) ): print(('Please check pos_blastres file...Error file: '+pos_blastres_file+ ' doesn\'t exist')) sys.exit(0) - - + + if neg_blastres_flag==True: if not (os.path.isfile(neg_blastres_file) ): print(('Please check neg_blastres file...Error file: '+neg_blastres_file+ ' doesn\'t exist')) @@ -245,12 +249,14 @@ def main(args = sys.argv,home=None): print('New files with filtered sequences:') print(neg_file) print(pos_file) - + ############################################Read neg file############################################################ if vflag: print('Reading Negative File...\nExtracting Features...') - os.system("python "+home+"/bin/extractfeatures.py "+neg_file+" 0") - + okfile=outdir+"/neg_features.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/extractfeatures.py "+neg_file+" 0") + with open(okfile, mode="w"): pass ########Run framefinder ff_featurefile_neg=neg_file+"_ffout_framefinderfeatures" if no_ff_flag==True: @@ -258,21 +264,33 @@ def main(args = sys.argv,home=None): os.system("echo '' > "+neg_file+"_ffout") else: - os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+neg_file+" > "+neg_file+"_ffout") + okfile=outdir+"/neg_ffout.ok" + if not os.path.isfile(okfile) or force_flag : + os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+neg_file+" > 
"+neg_file+"_ffout") + with open(okfile, mode="w"): pass #parse framefinder results and write framefinder feature files if vflag: print('Extracting Framefinder Features...') - os.system("python "+home+"/bin/ffparse.py "+neg_file+"_ffout") + okfile=outdir+"/neg_ff_features.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/ffparse.py "+neg_file+"_ffout") + with open(okfile, mode="w"): pass ######Run BLASTX######## if noblast_flag==False: if neg_blastres_flag==True: if vflag: print('Parsing Negative Blast results...\nExtracting Features...') - os.system("python "+home+"/bin/blastparse_mt3.py "+neg_blastres_file) + okfile=outdir+"/neg_blastres.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py "+neg_blastres_file+" "+str(num_threads)) + with open(okfile, mode="w"): pass if vflag: print('Merging all Features...') - os.system("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+neg_blastres_file+"_blastfeatures "+" "+neg_file+"_all_features") + okfile=outdir+"/neg_blast_merge.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+neg_blastres_file+"_blastfeatures "+" "+neg_file+"_all_features") + with open(okfile, mode="w"): pass else: #filename for blastres blastres_neg=neg_file+"_blastres" @@ -283,19 +301,26 @@ def main(args = sys.argv,home=None): os.system(str(bcommand)) if vflag: print('Parsing Blast Results...') - print("python "+home+"/bin/blastparse_mt3.py "+blastres_neg) + print("python "+home+"/bin/blastparse_mt3.py "+blastres_neg+" "+str(num_threads)) - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_neg) + okfile=outdir+"/neg_blastres.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_neg+" 
"+str(num_threads)) + with open(okfile, mode="w"): pass print('Merging all Features...') print("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+blastres_neg+"_blastfeatures "+" "+neg_file+"_all_features") - os.system("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+blastres_neg+"_blastfeatures "+" "+neg_file+"_all_features") + + okfile=outdir+"/neg_blast_merge.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+blastres_neg+"_blastfeatures "+" "+neg_file+"_all_features") + with open(okfile, mode="w"): pass else: if vflag: print('Skipping Blast...') blastres_neg=neg_file+"_blastres" os.system("echo 'X X 0 0 0 0 0 0 0 0' > "+blastres_neg) - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_neg) + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_neg+" "+str(num_threads)) print('Merging all Features...') os.system("python "+home+"/bin/mergefeatures.py "+neg_file+"_features "+no_ff_flagval+" "+ff_featurefile_neg+" "+noblast_flag_val+" "+blastres_neg+"_blastfeatures "+" "+neg_file+"_all_features") @@ -303,7 +328,11 @@ def main(args = sys.argv,home=None): ############################################Read pos file############################################################### if vflag: print('Reading Positive File...\nExtracting Features...') - os.system("python "+home+"/bin/extractfeatures.py "+pos_file+" 1") + + okfile=outdir+"/pos_features.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/extractfeatures.py "+pos_file+" 1") + with open(okfile, mode="w"): pass ########Run framefinder ff_featurefile_pos=pos_file+"_ffout_framefinderfeatures" @@ -311,21 +340,33 @@ def main(args = sys.argv,home=None): print('Skipping framefinder...') 
os.system("echo '' > "+pos_file+"_ffout") else: - os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+pos_file+" > "+pos_file+"_ffout") + okfile=outdir+"/pos_ffout.ok" + if not os.path.isfile(okfile) or force_flag : + os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+pos_file+" > "+pos_file+"_ffout") + with open(okfile, mode="w"): pass #parse framefinder results and write framefinder feature files if vflag: print('Extracting Framefinder Features...') - os.system("python "+home+"/bin/ffparse.py "+pos_file+"_ffout") + okfile=outdir+"/pos_ff_features.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/ffparse.py "+pos_file+"_ffout") + with open(okfile, mode="w"): pass ######Run BLASTX######## if noblast_flag==False: if pos_blastres_flag==True: if vflag: print('Parsing Blast results...\nExtracting Features...') - os.system("python "+home+"/bin/blastparse_mt3.py "+pos_blastres_file) + okfile=outdir+"/pos_blastres.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py "+pos_blastres_file+" "+str(num_threads)) + with open(okfile, mode="w"): pass if vflag: print('Merging all Features...') - os.system("python "+home+"/bin/mergefeatures.py "+pos_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+pos_blastres_file+"_blastfeatures "+" "+pos_file+"_all_features") + okfile=outdir+"/pos_blast_merge.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+pos_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+pos_blastres_file+"_blastfeatures "+" "+pos_file+"_all_features") + with open(okfile, mode="w"): pass else: #filename for blastres blastres_pos=pos_file+"_blastres" @@ -336,17 +377,22 @@ def main(args = sys.argv,home=None): os.system(str(bcommand)) if vflag: print('Parsing Blast Results...') - 
- os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos) + okfile=outdir+"/pos_blastres.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos+" "+str(num_threads)) + with open(okfile, mode="w"): pass print('Merging all Features...') - os.system("python "+home+"/bin/mergefeatures.py "+pos_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+pos_file+"_all_features") + okfile=outdir+"/pos_blast_merge.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+pos_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+pos_file+"_all_features") + with open(okfile, mode="w"): pass else: if vflag: print('Skipping Blast...') blastres_pos=pos_file+"_blastres" os.system("echo 'X X 0 0 0 0 0 0 0 0' > "+blastres_pos) - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos) + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos+" "+str(num_threads)) print('Merging all Features...') os.system("python "+home+"/bin/mergefeatures.py "+pos_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+pos_file+"_all_features") @@ -362,41 +408,51 @@ def main(args = sys.argv,home=None): if vflag: print('Building Model...') - os.system("python "+home+"/bin/rf/buildmodel.py "+neg_file+"_final_features "+model_name+" "+str(num_trees)+" "+str(num_threads)) + okfile=outdir+"/build.ok" + if not os.path.isfile(okfile) or force_flag : + os.system("python "+home+"/bin/rf/buildmodel.py "+neg_file+"_final_features "+model_name+" "+str(num_trees)+" "+str(num_threads)) + with open(okfile, mode="w"): pass ################Remove Temp Files################## if removefiles_flag==True: print('Removing temp files...') #print "rm -f "+pos_file+"_ffout_framefinderfeatures" os.system("rm -f 
"+pos_file+"_ffout_framefinderfeatures") - os.system("rm -f "+pos_file+"_blastres_blastfeatures") os.system("rm -f "+pos_file+"_features") os.system("rm -f "+pos_file+"_ffout") - + if not pos_blastres_flag : + os.system("rm -f "+pos_file+"_blastres_blastfeatures") + os.system("rm -f "+neg_file+"_ffout_framefinderfeatures") - os.system("rm -f "+neg_file+"_blastres_blastfeatures") os.system("rm -f "+neg_file+"_features") os.system("rm -f "+neg_file+"_ffout") - if min_len_flag==True: + if not neg_blastres_flag : + os.system("rm -f "+neg_file+"_blastres_blastfeatures") + + if min_len_flag==True: os.system("rm -f "+neg_file) - os.system("rm -f "+pos_file) + os.system("rm -f "+pos_file) #########################################Move files to out_dir########################################## neg_files_dir=os.path.dirname(os.path.realpath(neg_file)) if not os.path.exists(outdir): os.makedirs(outdir) os.system("mv "+neg_file+"_all_features"+" "+outdir+"/") - os.system("mv "+neg_file+"_blastres"+" "+outdir) + if not neg_blastres_flag : + os.system("mv "+neg_file+"_blastres"+" "+outdir) os.system("mv "+neg_file+"_final_features"+" "+outdir) os.system("mv "+pos_file+"_all_features"+" "+outdir+"/") - os.system("mv "+pos_file+"_blastres"+" "+outdir) + if not pos_blastres_flag : + os.system("mv "+pos_file+"_blastres"+" "+outdir) if removefiles_flag==False: - os.system("mv "+pos_file+"_blastres_blastfeatures"+" "+outdir) + if not pos_blastres_flag : + os.system("mv "+pos_file+"_blastres_blastfeatures"+" "+outdir) os.system("mv "+pos_file+"_features"+" "+outdir) os.system("mv "+pos_file+"_ffout"+" "+outdir) os.system("mv "+pos_file+"_ffout_framefinderfeatures"+" "+outdir) - os.system("mv "+neg_file+"_blastres_blastfeatures"+" "+outdir) + if not neg_blastres_flag : + os.system("mv "+neg_file+"_blastres_blastfeatures"+" "+outdir) os.system("mv "+neg_file+"_features"+" "+outdir) os.system("mv "+neg_file+"_ffout"+" "+outdir) os.system("mv "+neg_file+"_ffout_framefinderfeatures"+" 
"+outdir) @@ -405,6 +461,6 @@ def main(args = sys.argv,home=None): print(('All outputs saved to: '+ outdir)) print('END') - + if __name__ == "__main__": main() diff --git a/plncpro/prediction.py b/plncpro/prediction.py index b61fe38..6541cc4 100644 --- a/plncpro/prediction.py +++ b/plncpro/prediction.py @@ -10,7 +10,7 @@ Author : Urminder Singh email: urmind13_sit@jnu.ac.in UrMi 21/4/16 -''' +''' import sys, getopt import os from Bio import SeqIO @@ -22,6 +22,7 @@ def printhelp(): print ("This script classifies transcripts as coding or non coding transcripts") print ("Arguments:") print ("-h print this message") + print ("-f overwrite existing results") print ("-p output file name to store prediction results") print ("-i path to file containing input sequences") print ("-m path to the model file") @@ -31,22 +32,23 @@ def printhelp(): print ("-t number of threads[default: 4]") print ("-l path to the files containg labels(this outputs performance of the classifier)") print ("-r clean up intermediate files") - print ("-v show more messages") + print ("-v show more messages") print ("--min_len specifiy min_length to filter input files") print ("--noblast Don't use blast features") print ("--no_ff Don't use framefinder features") print ("--qcov_hsp specify qcov parameter for blast[default:30]") print ("--blastres path to blast output for input file") - - + + def main(args = sys.argv,home=None): - - + + ###################################### ############Define variables############## in_flag=False out_flag=False db_flag=False + force_flag=False model_flag=False removefiles_flag=False noblast_flag=False @@ -74,7 +76,7 @@ def main(args = sys.argv,home=None): label_file="" ##################################### try: - opts, args = getopt.getopt(sys.argv[2:],"ht:i:rm:p:o:d:vl:",["prediction_out=","infile=","noblast","no_ff","qcov_hsp=","threads=","db=","outdir=","model=","remove_temp","blastres=","labels=","min_len="]) + opts, args = 
getopt.getopt(sys.argv[2:],"fht:i:rm:p:o:d:vl:",["prediction_out=","infile=","noblast","no_ff","qcov_hsp=","threads=","db=","outdir=","model=","remove_temp","force","blastres=","labels=","min_len="]) except getopt.GetoptError: printhelp() sys.exit(2) @@ -89,6 +91,8 @@ def main(args = sys.argv,home=None): out_file_flag=True #print (out_file) #print arg + elif opt in ("-f", "--force"): + force_flag=True elif opt in ("-i", "--infile"): #print 'neg found' in_file=os.path.abspath(arg) @@ -146,7 +150,7 @@ def main(args = sys.argv,home=None): #print 'ml found' min_length=int(arg) min_len_flag=True - + ###Check all necessary inputs @@ -210,13 +214,13 @@ def main(args = sys.argv,home=None): #print 'writing',record.id else: short_ctr=short_ctr+1 - + SeqIO.write(temp_rec, output_handle, "fasta") output_handle.close() #important else gives errors - + print(('Short Sequences <',str(min_length),str(short_ctr),' removed,',str(ctr),' retained')) in_file=in_file+'_temp_'+str(min_length) - print ('New file with filtered sequences:') + print ('New file with filtered sequences:') print (in_file) @@ -224,8 +228,10 @@ def main(args = sys.argv,home=None): ############################################Read in file############################################################### if vflag: print ('Reading Input File...\nExtracting Features...') - - os.system("python "+home+"/bin/extractfeatures.py "+in_file+" 1") + okfile=outdir+"/features.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/extractfeatures.py "+in_file+" 1") + with open(okfile, mode="w"): pass ########Run framefinder ff_featurefile_pos=in_file+"_ffout_framefinderfeatures" @@ -235,25 +241,35 @@ def main(args = sys.argv,home=None): else: #print "lib/framefinder/framefinder -r False -w lib/framefinder/framefinder.model "+in_file+" > "+in_file+"_ffout" - os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+in_file+" > "+in_file+"_ffout") + 
okfile=outdir+"/ffout.ok" + if not os.path.exists(okfile) or force_flag : + os.system(home+"/lib/framefinder/framefinder -r False -w "+home+"/lib/framefinder/framefinder.model "+in_file+" > "+in_file+"_ffout") + with open(okfile, mode="w"): pass + #parse framefinder results and write framefinder feature files if vflag: print ('Extracting Framefinder Features...') - os.system("python "+home+"/bin/ffparse.py "+in_file+"_ffout") + okfile=outdir+"/ffout_features.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/ffparse.py "+in_file+"_ffout") + with open(okfile, mode="w"): pass ######Run BLASTX######## if noblast_flag==False: if blastres_flag==True: if vflag: print ('Parsing Blast results...\nExtracting Features...') - - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_file) - + okfile=outdir+"/blast_features.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_file+" "+str(num_threads)) + with open(okfile, mode="w"): pass if vflag: print ('Merging all Features...') - - os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_file+"_blastfeatures "+" "+in_file+"_all_features") - + okfile=outdir+"/blast_merge.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_file+"_blastfeatures "+" "+in_file+"_all_features") + with open(okfile, mode="w"): pass + else: #filename for blastres blastres_pos=in_file+"_blastres" @@ -264,38 +280,52 @@ def main(args = sys.argv,home=None): os.system(str(bcommand)) if vflag: print ('Parsing Blast Results...') - - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos) + okfile=outdir+"/blast_features.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/blastparse_mt3.py 
"+blastres_pos+" "+str(num_threads)) + with open(okfile, mode="w"): pass print ('Merging all Features...') - os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+in_file+"_all_features") + + okfile=outdir+"/blast_merge.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+in_file+"_all_features") + with open(okfile, mode="w"): pass else: if vflag: print ('Skipping Blast...') blastres_pos=in_file+"_blastres" os.system("echo 'X X 0 0 0 0 0 0 0 0' > "+blastres_pos) - os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos) + os.system("python "+home+"/bin/blastparse_mt3.py "+blastres_pos+" "+str(num_threads)) print ('Merging all Features...') - os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+in_file+"_all_features") + okfile=outdir+"/blast_merge.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/mergefeatures.py "+in_file+"_features "+no_ff_flagval+" "+ff_featurefile_pos+" "+noblast_flag_val+" "+blastres_pos+"_blastfeatures "+" "+in_file+"_all_features") + with open(okfile, mode="w"): pass + ######################################################################################################################### ##############################################Start prediction########################################################### ##mergeboth files in one if vflag: print ('Predicting...') - + print((str("python "+home+"/bin/rf/predict.py "+in_file+"_all_features "+model_file+" "+out_file+" "+lflag_val+" "+label_file))) - os.system("python "+home+"/bin/rf/predict.py "+in_file+"_all_features "+model_file+" "+out_file+" "+lflag_val+" 
"+label_file) + okfile=outdir+"/prediction.ok" + if not os.path.exists(okfile) or force_flag : + os.system("python "+home+"/bin/rf/predict.py "+in_file+"_all_features "+model_file+" "+out_file+" "+lflag_val+" "+label_file) + with open(okfile, mode="w"): pass ################Remove Temp Files################## if removefiles_flag==True: print ('Removing temp files...') #print "rm -f "+in_file+"_ffout_framefinderfeatures" os.system("rm -f "+in_file+"_ffout_framefinderfeatures") - os.system("rm -f "+in_file+"_blastres_blastfeatures") + if not blastres_flag : + os.system("rm -f "+in_file+"_blastres_blastfeatures") os.system("rm -f "+in_file+"_features") os.system("rm -f "+in_file+"_ffout") - if min_len_flag==True: + if min_len_flag==True: os.system("rm -f "+in_file) #########################################Move files to out_dir########################################## @@ -303,16 +333,16 @@ def main(args = sys.argv,home=None): if not os.path.exists(outdir): os.makedirs(outdir) os.system("mv "+in_file+"_all_features"+" "+outdir+"/") - os.system("mv "+in_file+"_blastres"+" "+outdir) + if not blastres_flag : + os.system("mv "+in_file+"_blastres"+" "+outdir) os.system("mv "+out_file+" "+outdir) print(('All outputs saved to: '+ outdir)) print ('END') - - - - + + + + if __name__ == "__main__": main() - - \ No newline at end of file + diff --git a/requirements.txt b/requirements.txt index c096600..773d367 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ biopython +multiprocess regex -sklearn \ No newline at end of file +sklearn