#!/usr/bin/python # program: validate2rr.py # convert the lwnn_validate output into a format that we can use import getopt, sys, os, string def errout(s,e): sys.stderr.write(s+"\n") sys.exit(e) def usage(e=0): errout("""usage: validate2rr.py [options] -c converts the lwnn_validate output into a format that we can use -t, -target : chain id (required). Used in output also to get sequence, etc. -n, -network : the name of the network used to generate the predictions. -b, -abbrv : an abbreviated name for the network. options: -s, --start : starting residue index. Used for domains. -p, --percent : percent of sequence size to limit predictions (default is 2.0) -l, --limit : limit predictions to this number -f, --format : format of stdin input: lwnn: lightweight format. requires supplemental file with column pair data. (default) raw: simple format consisting of columns i,j and the raw score. No extra files needed. -a, --author : id given on the author line (default is XXXX-XXXX-XXXX) -c, --constraints : do generate constraint file -m, --factor : float value This is used to multiply that actual probabilities for constraints -x, --cutoffs , Sets the probabililty values used for limiting the constraints output. The bonus cutoff is the probability below which all constraints are marked as bonuses. """,e) percentage = 10.0 id = "" limit = 0 author = "XXXX-XXXX-XXXX" doConstraints = 0 network = "???.net" gStart = 0 format = 'lwnn' network_id = None calibration = [0.901505,0.587057,0.895140,0.552388,0.890689,0.530832,0.887221,0.511753,0.884140,0.496146,0.881355,\ 0.479445,0.878879,0.468282,0.876518,0.461361,0.874404,0.449889,0.872447,0.441566,0.870562,0.433217,\ 0.868811,0.425543,0.867114,0.419948,0.865470,0.413106,0.863890,0.404564,0.862475,0.398285,0.861061,\ 0.395066,0.859650,0.390012,0.858294,0.384890,0.857008,0.380233,0.855770,0.376065,0.854535,0.371154,\ 0.853353,0.366361,0.852163,0.362842,0.851068,0.359566,0.849954,0.355986,0.848861,0.353492,0.847794,\ 0.350110,0.846755,0.346448,0.845740,0.343475,0.844735,0.339749,0.843746,0.337092,0.842788,0.333972,\ 0.841808,0.330643,0.840893,0.328106,0.839953,0.325724,0.839032,0.323082,0.838151,0.320227,0.837264,\ 0.318060,0.836366,0.315666,0.835497,0.313602,0.834640,0.311527,0.833795,0.308986,0.832962,0.306584,\ 0.832130,0.304883,0.831303,0.302842,0.830518,0.300604,0.829716,0.298159,0.828927,0.296391,0.828130,\ 0.294838,0.827371,0.292499,0.826576,0.290556,0.825806,0.288532,0.825057,0.286934,0.824306,0.285106,\ 0.823592,0.283323,0.822835,0.281351,0.822127,0.279721,0.821396,0.277773,0.820694,0.275872,0.819968,\ 0.273874,0.819261,0.272488,0.818552,0.270834,0.817856,0.269201,0.817161,0.267601,0.816484,0.266020,\ 0.816139,0.265492] factor = 1.0 cutoffs = [0.4,0.5] sticks = "wireframe" def getargs(): global id global network global network_id global cutoffs global limit global gLimit global gStart global format global author global doConstraints global calibration global factor try: opts, args = getopt.getopt(sys.argv[1:],"hcx:t:n:s:b:l:p:a:f:r:m:", ["help", "constraints", "cutoffs=", "target=", "network=", "start=", "abbrv=", "limit=", "percent=", "author=", "format=","calibrate=","factor="]) except getopt.GetoptError: usage(2) for o,a in opts: if o in ("-l","--limit"): try: limit = int(a) except ValueError: errout("-l needs an integer value",5) if o in ("-p","--percent"): try: percentage = float(a) except ValueError: errout("-p needs a floating point value",5) if o in ("-f","--format"): if a!='raw' and a!='lwnn': print "-f has limited acceptable inputs" usage(5) format = a if o in ("-s","--start"): try: gStart = int(a) except ValueError: errout("-s needs an integer value",5) if o in ("-t","--target"): id = a if o in ("-a","--author"): author = a if o in ("-c","--constraints"): doConstraints = 1 if o in ("-n","--network"): network = a if o in ("-b","--abbrv"): network_id = a if o in ("-x","--cutoffs"): ok = True try: cutoffs = map(float,string.split(a,',')) except ValueError: ok = False ok = (ok and len(cutoffs)==2) if not ok: errout("-c, --cutoffs needs a list of two floats",5) if o in ("-r","--calibrate"): try: calibration = map(float,string.split(a,',')) except ValueError: errout("-r,--calibrate needs a list of floats",5) if len(calibration)<4: errout("-r,--calibrate must be a list greater than three",5) if o in ("-m","--factor"): try: factor = float(a) except ValueError: errout("-m,--factor needs a floating point value",5) if o in ("-h","--help"): usage(0) if not id: usage(4) if gStart<0: gStart=0 if gStart: gStart -= 1 if not network_id: network_id = network[:8] # assume the stdin is the validation file, NOT inputlist = sys.stdin.readlines() # inputsize = len(string.split(inputlist[0]))-3 def prologue(sequence): global id global network seq = sequence[:] # make a copy! print "PFRMAT RR" print "TARGET "+id # next is the id for Group SAM-T04-hand print "AUTHOR "+author print """\ REMARK From group SAM-TO6 under the direction of REMARK Kevin Karplus. REMARK Residue-residue contact predictions REMARK by George Shackelford and Kevin Karplus REMARK The value given is an approximation of the probability REMARK as provided by the neural network. REMARK The predictions are limited to a score >= .18 which REMARK is an approximation of the probability. REMARK METHOD The predictor is an artificial neural network. METHOD NN: %s METHOD Inputs may include separation and sequence length, METHOD e-value statistic based on mutual information values, METHOD a statistic based on propensity of residues in contact. METHOD Other inputs are distributions from alignments and METHOD secondary predictions from SAM-t04 and/or SAM-t06 of METHOD the University of California, Santa Cruz MODEL 1""" % (network) while seq: print seq[:50] seq = seq[50:] # readin pairs, readin sequence(?) pairs = [] sequence = "" def getdata(): global pairs, sequence, format if format=='lwnn': f = open(id+".pairs") pairs = f.readlines() f.close() f = open(id+".a2m") f.readline() lines = f.readlines() sequence = "" for line in lines: for c in line: if (c>='a' and c<='z') or (c>='A' and c<='Z'): sequence += c f.close() def colsort(a,b): if a[1]!=b[1]: return a[1]-b[1] return a[2]-b[2] # ============== main part ====================== def calibrate(v): global calibration if not calibration: return v # interpolate if v<=calibration[-2]: return v*calibration[-1]/calibration[-2] l = len(calibration)-2 for i in range(0,l,2): if v>calibration[i+2] or i==l-2: newv= calibration[i+1]-(calibration[i]-v)/\ (calibration[i]-calibration[i+2])*\ (calibration[i+1]-calibration[i+3]) break return newv getargs() getdata() # get a prediction predictions = [] inputIndex = 0 if format=='lwnn': # we need to process the pairs to get the predictions if len(inputlist)-5 != len(pairs): print "WARNING! pairs do not match predictions" print "pairs",len(pairs)," preds",len(inputlist) for pair in pairs: p = inputlist[inputIndex] inputIndex += 1 p = string.strip(p) if not p: break start = p.find("->") + 2 end = p.find(" ",start) predict = float(p[start:end]) i,j = map(int,string.split(pair)) predictions.append([calibrate(predict),i,j]) elif format=='raw': # the simpler format. We just need to rearrange for line in inputlist: if line[0]=='#': continue fields = string.split(line) predict = float(fields[2]) i,j = map(int,fields[0:2]) predictions.append([calibrate(predict),i,j]) lim = int(percentage*len(sequence)) predictions.sort() predictions.reverse() submitted = predictions[0:lim] def makeatom(i): global sequence, gStart result = sequence[i-1]+str(i+gStart)+".C" if sequence[i-1]=='G': result += "A " else: result += "B " return result if 1: f = open(id+"."+network_id+".rr.constraints",'w') f.write("""\ # file generated by validate2rr # Neural Net: # %s # """% (network) ) # predct = 0 for l in predictions: # predct += 1 prob = l[0]*factor if prob len(colors): break assignedNode = None vertexNum = 0 if colored.has_key(l[1]): if distinctPairs: continue (color,vertexNum) = colored[l[1]] assignedNode = 1 if colored.has_key(l[2]): if distinctPairs: continue if assignedNode: # oh! both have been assigned colors! # then just skip on continue (color,vetexNum) = colored[l[2]] assignedNode = 2 wireframe = max(.15,l[0]) # if neither has been assigned, get a new color if not (assignedNode): color = colors[predct-1] predct += 1 colored[l[1]] = (color,2) colored[l[2]] = (color,2) selected = "("+str(l[1]+gStart)+" or "+str(l[2]+gStart)+")" elif vertexNum>1: unassignedNode = 3-assignedNode # tricky. colored[l[unassignedNode]] = (color,vertexNum-1) selected = str(l[unassignedNode]+gStart) else: continue f.write("select "+selected+" and not (*.N or *.C or *.O)\n") f.write(sticks+" "+str(wireframe)+"\n") f.write("color "+color+"\n") f.write("select *\n") f.close() writeRR(predictions) writeRR(predictions,1) submitted.sort(colsort) prologue(sequence) for l in submitted: prob = l[0] print l[1]+gStart,l[2]+gStart,0,8,prob print "END"