#!/usr/bin/env python2.6 """ structureHarvester version 0.6.9 dent earl, dearl (a) soe ucsc edu Dec 2011 http://users.soe.ucsc.edu/~dearl/software/struct_harvest/ http://taylor0.biology.ucla.edu/struct_harvest/ ############################## CITATION Earl, Dent A. and vonHoldt, Bridgett M. (2011) STRUCTURE HARVESTER: a website and program for visualizing STRUCTURE output and implementing the Evanno method. Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7 ############################## REFERENCES Evanno et al., 2005. Detecting the number of clusters of individuals using the software STRUCTURE: a simulation study. Molecular Ecology 14, 2611-2620. Jakobsson M., Rosenberg N. 2007. CLUMPP: a cluster matching and permutation program for dealing with label switching and multimodality in analysis of population structure. Bioinformatics 23(14): 1801-1806. Pritchard J., Stephens M., Donnelly. P. 2000. Genetics 155:945-959. ############################## LICENSE Copyright (C) 2007-2011 by Dent Earl (dearl@soe.ucsc.edu, dentearl@gmail.com) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import glob import math from optparse import OptionParser import os import re import sys import time EPSILON = 0.0000001 # for determining if a stdev ~ 0 __version__ = 'v0.6.91 February 2012' try: eval("enumerate([1, 2, 3], 1)") except TypeError: raise ImportError('This script requires Python version 2.5 <= v < 3.0. ' 'This version %d.%d' % (sys.version_info[0], sys.version_info[1])) class Data: """ Data will be used to store variables in a fashion that allows for easy access between functions. """ pass class RunRecord: """ Stores the output of a single structure run. """ def __init__(self): self.name = '' self.k = -1 self.indivs = -1 self.loci = -1 self.burnin = -1 self.reps = -1 self.estLnProb = -1 self.meanLlh = -1 self.varLlh = -1 self.runNumber = -1 def initOptions(parser): """initOptions() """ parser.add_option('--dir',dest='resultsDir', type ='string', help='The structure Results/ directory.') parser.add_option('--out',dest='outDir', type ='string', help='The out directory. If it does not exist, it will be created. Output written to summary.txt') parser.add_option('--evanno',dest='evanno', action='store_true', default=False, help='If possible, performs the Evanno 2005 method. Written to evanno.txt. default=%default') parser.add_option('--clumpp',dest='clumpp', action='store_true', default=False, help='Generates one K*.indfile for each value of K run, for use with CLUMPP. default=%default') def checkOptions(parser, options): """checkOptions() """ if not options.resultsDir: parser.error('Error, specify --dir \n') if not os.path.exists(options.resultsDir): parser.error('Error, --dir %s does not exist!\n' % options.resultsDir) if not os.path.isdir(options.resultsDir): parser.error('Error, --dir %s is not a directory!\n' % options.resultsDir) options.resultsDir = os.path.abspath(options.resultsDir) if not options.outDir: parser.error('Error, specify --out \n') if os.path.exists(options.outDir) and not os.path.isdir(options.outDir): parser.error('Error, --out %s already exists but is not a directory!\n' % options.outDir) if not os.path.exists(options.outDir): os.mkdir(options.outDir) def addAttribute(p, value, run , data): if p == 'indivs': run.indivs = int(value) elif p == 'loci': run.loci = int(value) elif p == 'k': run.k = int(value) elif p == 'burnin': run.burnin = int(value) elif p == 'reps': run.reps = int(value) elif p == 'lnprob': run.estLnProb = float(value) elif p == 'meanln': run.meanLlh = float(value) elif p == 'varln': run.varLlh = float(value) else: sys.stderr.write('Error, unknown pattern type %s\n' % (p)) def readFile(filename, data): run = RunRecord() run.name = os.path.basename(filename) m = re.search('.*_(\d+)_f$', run.name) if m is not None: run.runNumber = m.group(1) file = open(filename, 'r') regExs = {'indivs' : '^(\d+) individuals$', 'loci' : '^(\d+) loci$', 'k' : '^(\d+) populations assumed$', 'burnin' : '^(\d+) Burn\-in period$', 'reps' : '^(\d+) Reps$', 'lnprob' : '^Estimated Ln Prob of Data\s+=\s+(\S+)$', 'meanln' : '^Mean value of ln likelihood\s+=\s+(\S+)$', 'varln' : '^Variance of ln likelihood\s+=\s+(\S+)$', } pats = {} for r in regExs: pats[r] = re.compile(regExs[r]) isDone = False for line in file: if isDone: break line = line.strip() for p in pats: m = re.search(pats[p], line) if m is not None: addAttribute(p, m.group(1), run, data) if p == 'varln': isDone = True return validateRecord(run) def validateRecord(run): if run.name == '': return None i = 0 for r in [run.k, run.indivs, run.loci, run.burnin, run.reps, run.estLnProb, run.meanLlh, run.varLlh]: if r == -1: return None i += 1 return run def harvestFiles(data, options): files = glob.glob(os.path.join(options.resultsDir, '*_f')) if len(files) < 1: sys.stderr.write('Error, unable to locate any _f files in ' 'the results directory %s' % options.resultsDir) sys.exit(1) data.records = {} # key is K, value is an array for f in files: run = readFile(f, data) if run is not None: if run.k not in data.records: data.records[run.k] = [] data.records[run.k].append(run) else: sys.stderr.write('Error, unable to extract results from file %s.\n' % f) file.write('Did you remember to check the box \'Compute the probability of the data (for estimating K)\' in STRUCTURE?\n') file.close() sys.exit(0) data.sortedKs = data.records.keys() data.sortedKs.sort() def writeRawOutputToFile(data, options): file = open(os.path.join(options.outDir, 'summary.txt'), 'w') file.write('# This document produced by structureHarvester.py %s\n' % __version__) file.write('# http://users.soe.ucsc.edu/~dearl/struct_harvest\n') file.write('# http://taylor0.biology.ucla.edu/struct_harvest\n') file.write('# Written by Dent Earl, dearl (a) soe ucsc edu.\n') file.write('# CITATION:\n# Earl, Dent A. and vonHoldt, Bridgett M. (2011)\n' '# STRUCTURE HARVESTER: a website and program for visualizing\n' '# STRUCTURE output and implementing the Evanno method.\n' '# Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n' '# Version: %s\n' % __version__) file.write('# File generated at %s\n' % (time.strftime('%Y-%b-%d %H:%M:%S %Z', time.localtime()))) file.write('#\n') file.write('\n##########\n') file.write('# K\tReps\t' 'mean est. LnP(Data)\t' 'stdev est. LnP(Data)\n') for i in xrange(0, len(data.sortedKs)): k = data.sortedKs[i] file.write('%d\t%d\t%f\t%f\n' % (k, len(data.records[k]), data.estLnProbMeans[k], data.estLnProbStdevs[k])) file.write('\n##########\n') file.write('# File name\tRun #\t' 'K\tEst. Ln prob. of data\t' 'Mean value of Ln likelihood\tVariance of Ln likelihood\n') for k in data.records: for r in data.records[k]: if r.runNumber != -1: numStr = r.runNumber else: numStr = '' file.write('%s\t%s\t%d\t%.1f\t%.1f\t%.1f\n' % (r.name, numStr, r.k, r.estLnProb, r.meanLlh, r.varLlh)) file.write('\nSoftware written by Dent Earl while at EEB Dept, UCLA, BME Dept UCSC\n' 'dearl (a) soe ucsc edu') file.write('CITATION:\nEarl, Dent A. and vonHoldt, Bridgett M. (2011)\n' 'STRUCTURE HARVESTER: a website and program for visualizing\n' 'STRUCTURE output and implementing the Evanno method.\n' 'Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n' 'Version: %s' % __version__) file.close() def clumppGeneration(data, options): if not options.clumpp: return minK = data.sortedKs[0] maxK = data.sortedKs[-1] regex1 = r'''^(\d+) # individual number \s+(\S+) # Label \s+(\S+) # %Miss \s+([\-\d]+)\s+: # Pop, can be -9 \s+(.*?)$ # Inferred clusters''' regex2 = r'''^(\d+) # individual number \s+(\S+) # Label \s+(\S+)\s+: # %Miss. if only 1 inferred cluster, then 1 is not printed \s+(.*?)$ # Inferred clusters''' # regex2 also covers the new style of confidence intervals following the standard values regex3 = r'''^(\d+) # individual number \s+(\S+)\s+: # %Miss \s+([\d. ]*?)$ # Inferred clusters''' regexIntervals = '\(\S+,\S+\)' # used to trim off the confidence intervals pats = [] for r in [regex1, regex2, regex3]: pats.append(re.compile(r, re.X)) patInterval = re.compile(regexIntervals) for k in data.sortedKs: f = open (os.path.join(options.outDir, 'K%d.indfile' % k), 'w') for r in data.records[k]: inFile = open (os.path.join(options.resultsDir, r.name), 'r') printing = False for lineno, line in enumerate(inFile, 1): line = line.strip() if line == '': printing = False continue d = line.split(' ') if d[0] == 'Label' and d[1] == '(%Miss)' or d[0] == '(%Miss)': printing = True continue # find all confidence intervals and remove them m = re.findall(patInterval, line) for i in m: line = line.replace(i, '') line = line.strip() if printing: m1 = re.match(pats[0], line) m2 = re.match(pats[1], line) m3 = re.match(pats[2], line) if m1 is None and m2 is None and m3 is None: sys.stderr.write('Error clumpp generation, %s failed ' 'to match with regexs "%s", "%s", "%s" on lineno %d [%s]\n' % (r.name, regex1, regex2, regex3, lineno, line)) sys.exit(1) else: if m1 is not None: f.write('%3d %3d %s %2d : %s\n' % (int(m1.group(1)), int(m1.group(1)), m1.group(3), int(m1.group(4)), m1.group(5))) elif m2 is not None: f.write('%3d %3d %s 1 : %s\n' % (int(m2.group(1)), int(m2.group(1)), m2.group(3), m2.group(4))) elif m3 is not None: f.write('%3d %3d %s 1 : %s\n' % (int(m3.group(1)), int(m3.group(1)), m3.group(2), m3.group(3))) f.write('\n') f.close() clumppPopFile(data, options) def clumppPopFile(data, options): regex1 = r'''^(\d+): # Given Pop \s+([\d. ]*?) # All Inferred Cluster columns \s+(\d+)$ # # Number of Individuals''' f = None pats = [] for r in [regex1]: pats.append(re.compile(r, re.X)) for k in data.sortedKs: for r in data.records[k]: inFile = open (os.path.join(options.resultsDir, r.name), 'r') printing = False skipLine = False workComplete = False for lineno, line in enumerate(inFile, 1): if workComplete: continue line = line.strip() d = line.split() if printing == True and line.startswith('--------------------'): workComplete = True printing = False continue if len(d) < 3: continue if d[0] == 'Given' and d[1] == 'Inferred' and d[2] == 'Clusters': printing = True f = open (os.path.join(options.outDir, 'K%d.popfile' % k), 'a') skipLine = True continue if skipLine: skipLine = False continue if printing: m1 = re.match(pats[0], line) if m1 is None: sys.stderr.write('Error, clumpp popfile generation failed\n' 'Your file %s has unexpected line structure!\n' 'Job failed ' 'to match regex\n%s\non this line:\n%s\n' 'Please verify your input file\'s characters ' 'and try again after making the correction.\n\n' % (r.name, regex1, line)) else: f.write('%3d:' % (int(m1.group(1)))) for i in xrange(2, len(m1.groups()) + 1): f.write('\t%s' % m1.group(i)) f.write('\n') if f is not None: f.write('\n') f.close() f = None def evannoMethod(data, options): if not options.evanno: return value = evannoTests(data) if value is not None: sys.stderr.write('Unable to perform Evanno method for the follow reason(s):\n') sys.stderr.write(value) sys.exit(1) calculatePrimesDoublePrimesDeltaK(data) writeEvannoTableToFile(data, options) def writeEvannoTableToFile(data, options): file = open(os.path.join(options.outDir, 'evanno.txt'), 'w') file.write('# This document produced by structureHarvester.py %s\n' % __version__) file.write('# http://users.soe.ucsc.edu/~dearl/struct_harvest\n') file.write('# http://taylor0.biology.ucla.edu/struct_harvest\n') file.write('# Written by Dent Earl, dearl (a) soe ucsc edu.\n') file.write('# CITATION:\n# Earl, Dent A. and vonHoldt, Bridgett M. (2011)\n' '# STRUCTURE HARVESTER: a website and program for visualizing\n' '# STRUCTURE output and implementing the Evanno method.\n' '# Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n' '# Version: %s\n' % __version__) file.write('# File generated at %s\n' % (time.strftime('%Y-%b-%d %H:%M:%S %Z', time.localtime()))) file.write('#\n') file.write('\n##########\n') file.write('# K\tReps\t' 'Mean LnP(K)\tStdev LnP(K)\t' 'Ln\'(K)\t|Ln\'\'(K)|\tDelta K\n') for i in xrange(0, len(data.sortedKs)): k = data.sortedKs[i] if k in data.LnPK: LnPKstr = '%f' % data.LnPK[k] else: LnPKstr = 'NA' if k in data.LnPPK: LnPPKstr = '%f' % data.LnPPK[k] else: LnPPKstr = 'NA' if k in data.deltaK: deltaKstr = '%f' % data.deltaK[k] else: deltaKstr = 'NA' file.write('%d\t' '%d\t%.4f\t%.4f\t' '%s\t%s\t%s\n' % (k, len(data.records[k]), data.estLnProbMeans[k], data.estLnProbStdevs[k], LnPKstr, LnPPKstr, deltaKstr)) file.close() def evannoTests(data): """ sucess returns None, failure returns an html formatted string which can be directly printed in to the .html output. """ fail = False numRuns = 0 for k in data.records: numRuns += len(data.records[k]) numReps = numRuns / float(len(data.records)) out = 'Stats: number of runs = %d, number of replicates = %.2f, ' % (numRuns, numReps) out += 'minimum K tested = %d, maximum K tested = %d.\n' % (data.sortedKs[0], data.sortedKs[-1]) # Must test at least three values of K if len(data.sortedKs) >= 3: out += 'Test: You must test at least 3 values of K. PASS\n' else: fail = True out += 'Test: You must test at least 3 values of K. FAIL\n' # K values must be sequential if ((data.sortedKs[-1] + 1) - data.sortedKs[0]) == len(data.sortedKs): out += 'Test: K values must be sequential. PASS\n' else: fail = True out += 'Test: K values must be sequential. FAIL\n' out += ' * ' prevK = data.sortedKs[0] - 1 i = 0 for k in data.sortedKs: i += 1 if i == len(data.sortedKs): spacer = '' else: spacer = ', ' if k != prevK + 1: out += 'MISSING: %d%s' % (k, spacer) else: out += '%d%s' % (k, spacer) prevK = k out += '\n' # Number of replicates must be greater than 2 (stdev) if numReps > 1: out += 'Test: The number of replicates per K > 1. PASS\n' else: fail = True out += 'Test: The number of replicates per K > 1. FAIL\n' # Standard Devation for a K (but not the first or last K) is zero for i in xrange(1, len(data.sortedKs) - 1): k = data.sortedKs[i] if data.estLnProbStdevs[ k ] < EPSILON: # our epsilon fail = True out += ('Test: Standard devation of est. Ln Pr(Data) ' 'is less than 0.0000001. FAIL for K = %d. The Evanno test requires division ' 'by the standard deviation of the est. Ln Pr(Data) values for all K between ' 'the first and last K value, and thus when the standard deviation is within ' 'Epsilon (%f) of zero we cannot perform the test. (Try running more ' 'replicates to hopefully increase the standard deviation for that value of ' 'K.)\n' % (k, EPSILON) ) # Standard Devation for one K is zero for k in data.sortedKs: if data.estLnProbStdevs[k] < EPSILON: # our epsilon fail = True out += ('Test: Standard devation of est. Ln Pr(Data) ' ' < 0.0000001. FAIL on K = %d. (Try running more replicates.)\n' % k) if fail == True: return out return None def calculateMeansAndSds(data, options): data.estLnProbMeans = {} data.estLnProbStdevs = {} for k in data.records: data.estLnProbMeans[k] = 0 data.estLnProbStdevs[k] = 0 for r in data.records[k]: data.estLnProbMeans[k] += r.estLnProb data.estLnProbMeans[k] /= len(data.records[k]) if len(data.records[k]) > 1: for r in data.records[k]: data.estLnProbStdevs[k] += (r.estLnProb - data.estLnProbMeans[k]) ** 2 data.estLnProbStdevs[k] /= (len(data.records[k]) - 1) data.estLnProbStdevs[k] = math.sqrt(data.estLnProbStdevs[k]) def calculatePrimesDoublePrimesDeltaK(data): data.LnPK = {} data.LnPPK = {} data.deltaK = {} for i in xrange(1, len(data.sortedKs)): thisK = data.sortedKs[i] prevK = data.sortedKs[i - 1] data.LnPK[thisK] = data.estLnProbMeans[thisK] - data.estLnProbMeans[prevK] for i in xrange(1, len(data.sortedKs) - 1): prevK = data.sortedKs[i - 1] thisK = data.sortedKs[i] nextK = data.sortedKs[i + 1] data.LnPPK[thisK] = abs(data.LnPK[nextK] - data.LnPK[thisK]) data.deltaK[thisK] = abs(data.estLnProbMeans[nextK] - 2.0 * data.estLnProbMeans[thisK] + data.estLnProbMeans[prevK]) / float(data.estLnProbStdevs[thisK]) def main(): usage = ('usage: %prog --dir=path/to/dir/ --out=path/to/dir/ [options]\n\n' '%prog takes a STRUCTURE results directory (--dir) and an\n' 'output directory (--out will be created if it does not exist) and then\n' 'depending on the other options selected harvests data from the results\n' 'directory and performs the selected analyses') data = Data() parser = OptionParser(usage = usage, version = '%prog '+ __version__) initOptions(parser) (options, args) = parser.parse_args() checkOptions(parser, options) harvestFiles(data, options) calculateMeansAndSds(data, options) clumppGeneration(data, options) evannoMethod(data, options) writeRawOutputToFile(data, options) if __name__ == '__main__': main()