#!/usr/bin/python # program: validate2rr.py # convert the lwnn_validate output into a format that we can use # options: # -c : chain id (required). Used in output also to get sequence, pairs, etc. # -p : percent of sequence size to limit predictions (default is .5) import getopt, sys, os, string def usage(): print """usage: validate2rr.py [options] -c converts the lwnn_validate output into a format that we can use -t, -target : chain id (required). Used in output also to get sequence, etc. -n, -network : the name of the network used to generate the predictions. -b, -abbrv : an abbreviated name for the network. options: -s, --start : starting residue index. Used for domains. -p, --percent : percent of sequence size to limit predictions (default is .5) -l, --limit : limit predictions to this number -f, --format : format of stdin input: lwnn: lightweight format. requires supplemental file with column pair data. (default) raw: simple format consisting of columns i,j and the raw score. No extra files needed. -a, --author : id given on the author line (default is XXXX-XXXX-XXXX) """ percentage = 1.0 id = "" limit = 0 author = "XXXX-XXXX-XXXX" doConstraints = 0 network = "???.net" gStart = 0 format = 'lwnn' network_id = None sticks = "wireframe" def getargs(): global id global network global network_id global doConstrains global limit global gLimit global gStart global format global author try: opts, args = getopt.getopt(sys.argv[1:],"hct:n:s:b:l:p:a:f:", ["help", "constraints", "target=", "network=", "start=", "abbrv=", "limit=", "percent=", "author=", "format="]) except getopt.GetoptError: usage() sys.exit(2) for o,a in opts: if o in ("-l","--limit"): try: limit = int(a) except ValueError: print "-l needs an integer value" sys.exit(5) if o in ("-p","--percent"): try: percentage = float(a) except ValueError: print "-p needs a floating point value" sys.exit(5) if o in ("-f","--format"): if a!='raw' and a!='lwnn': print "-f has limited acceptable inputs" usage() sys.exit(5) format = a if o in ("-s","--start"): try: gStart = int(a) except ValueError: print "-s needs an integer value" if o in ("-t","--target"): id = a if o in ("-a","--author"): author = a if o in ("-n","--network"): network = a if o in ("-b","--abbrv"): network_id = a if o in ("-c","--constraints"): doConstraints = 1 if o in ("-h","--help"): usage() sys.exit(3) if not id: usage() sys.exit(4) if gStart<0: gStart=0 if gStart: gStart -= 1 if not network_id: network_id = network[:8] # assume the stdin is the validation file, NOT inputlist = sys.stdin.readlines() # inputsize = len(string.split(inputlist[0]))-3 def prologue(sequence): global id global network seq = sequence[:] # make a copy! print "PFRMAT RR" print "TARGET "+id # next is the id for Group SAM-T04-hand print "AUTHOR "+author print """\ REMARK From group SAM-TO6 under the direction of REMARK Kevin Karplus. REMARK Residue-residue contact predictions REMARK by George Shackelford and Kevin Karplus REMARK The value given is an approximation of the probability REMARK as provided by the neural network. REMARK The predictions are limited to a score >= .18 which REMARK is an approximation of the probability. REMARK METHOD The predictor is an artificial neural network. METHOD NN: %s METHOD Inputs may include separation and sequence length, METHOD e-value statistic based on mutual information values, METHOD a statistic based on propensity of residues in contact. METHOD Other inputs are distributions from alignments and METHOD secondary predictions from SAM-t04 and/or SAM-t06 of METHOD the University of California, Santa Cruz MODEL 1""" % (network) while seq: print seq[:50] seq = seq[50:] # readin pairs, readin sequence(?) pairs = [] sequence = "" def getdata(): global pairs, sequence, format if format=='lwnn': f = open(id+".pairs") pairs = f.readlines() f.close() f = open(id+".a2m") f.readline() lines = f.readlines() sequence = "" for line in lines: sequence += string.strip(line) f.close() def colsort(a,b): if a[1]!=b[1]: return a[1]-b[1] return a[2]-b[2] # ============== main part ====================== getargs() getdata() # get a prediction predictions = [] inputIndex = 0 if format=='lwnn': # we need to process the pairs to get the predictions if len(inputlist)-5 != len(pairs): print "WARNING! pairs do not match predictions" print "pairs",len(pairs)," preds",len(inputlist) for pair in pairs: p = inputlist[inputIndex] inputIndex += 1 p = string.strip(p) if not p: break start = p.find("->") + 2 end = p.find(" ",start) predict = float(p[start:end]) i,j = map(int,string.split(pair)) predictions.append([predict,i,j]) elif format=='raw': # the simpler format. We just need to rearrange for line in inputlist: if line[0]=='#': continue fields = string.split(line) predict = float(fields[2]) i,j = map(int,fields[0:2]) predictions.append([predict,i,j]) lim = int(percentage*len(sequence)) predictions.sort() predictions.reverse() submitted = predictions[0:lim] def makeatom(i): global sequence, gStart result = sequence[i-1]+str(i+gStart)+".C" if sequence[i-1]=='G': result += "A " else: result += "B " return result if 1: f = open(id+"."+network_id+".rr.constraints",'w') f.write("""\ # file generated by validate2rr # Neural Net: # %s # The score is currently uncalibrated. """% (network) ) # predct = 0 for l in predictions: # predct += 1 if l[0]<.3: continue s = "Constraint "+makeatom(l[1])+makeatom(l[2])+"-10. 7.0 14.0 "+str(l[0]) if l[0]<.6: s += ' bonus' f.write(s+'\n') f.close() def writeRR(predictions,distinctPairs=0): global gStart colors = ["Red","Orange","Yellow","Green","Cyan",\ "Blue","Violet","Brown","Magenta","Grey","DarkRed","DarkGreen"] colored={} if distinctPairs: filename = id+"."+network_id+".rr.distinct.rasmol" else: filename = id+"."+network_id+".rr.rasmol" f = open(filename,'w') f.write("select none\n") predct = 1 for l in predictions: if predct > len(colors): break unassigned = 0 if colored.has_key(l[1]): if distinctPairs: continue color = colored[l[1]] unassigned = 2 if colored.has_key(l[2]): if distinctPairs: continue if unassigned: # oh! both have been assigned colors! # then just skip on continue color = colored[l[2]] unassigned = 1 # if neither has been assigned, get a new color if not (unassigned): color = colors[predct-1] predct += 1 wireframe = max(.15,l[0]) if not unassigned: colored[l[1]] = color colored[l[2]] = color selected = "("+str(l[1]+gStart)+" or "+str(l[2]+gStart)+")" else: colored[l[unassigned]] = color selected = str(l[unassigned]+gStart) f.write("select "+selected+" and not (*.N or *.C or *.O)\n") f.write(sticks+" "+str(wireframe)+"\n") f.write("color "+color+"\n") f.write("select *\n") f.close() writeRR(predictions) writeRR(predictions,1) submitted.sort(colsort) prologue(sequence) for l in submitted: prob = l[0] prob *= prob print l[1]+gStart,l[2]+gStart,0,8,prob print "END"