#!/usr/bin/env python2.6
""" structureHarvester version 0.6.9
dent earl, dearl (a) soe ucsc edu
Dec 2011

http://users.soe.ucsc.edu/~dearl/software/struct_harvest/
http://taylor0.biology.ucla.edu/struct_harvest/

##############################
CITATION
Earl, Dent A. and vonHoldt, Bridgett M. (2011)                
STRUCTURE HARVESTER: a website and program for visualizing    
STRUCTURE output and implementing the Evanno method.          
Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7

##############################
REFERENCES

Evanno et al., 2005.  Detecting the number of clusters of individuals using 
  the software STRUCTURE: a simulation study. Molecular Ecology 14, 2611-2620.

Jakobsson M., Rosenberg N. 2007. CLUMPP: a cluster matching and permutation 
  program for dealing with label switching and multimodality in analysis 
  of population structure. Bioinformatics 23(14): 1801-1806.

Pritchard J., Stephens M., Donnelly. P. 2000. Genetics 155:945-959.


##############################
LICENSE

Copyright (C) 2007-2011 by 
Dent Earl (dearl@soe.ucsc.edu, dentearl@gmail.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

"""
import glob
import math
from optparse import OptionParser
import os
import re
import sys
import time

EPSILON = 0.0000001 # for determining if a stdev ~ 0
__version__ = 'v0.6.91 February 2012'

try:
    eval("enumerate([1, 2, 3], 1)")
except TypeError:
    raise ImportError('This script requires Python version 2.5 <= v < 3.0. '
                      'This version %d.%d' % (sys.version_info[0], sys.version_info[1]))

class Data:
    """ Data will be used to store variables in a fashion
    that allows for easy access between functions.
    """
    pass

class RunRecord:
    """ Stores the output of a single structure run.
    """
    def __init__(self):
        self.name        = ''
        self.k           = -1
        self.indivs      = -1
        self.loci        = -1
        self.burnin      = -1
        self.reps        = -1
        self.estLnProb   = -1
        self.meanLlh     = -1
        self.varLlh      = -1
        self.runNumber   = -1

def initOptions(parser):
   """initOptions()
   """
   parser.add_option('--dir',dest='resultsDir',
                     type ='string', help='The structure Results/ directory.')
   parser.add_option('--out',dest='outDir',
                     type ='string', help='The out directory. If it does not exist, it will be created. Output written to summary.txt')
   parser.add_option('--evanno',dest='evanno', action='store_true', default=False,
                     help='If possible, performs the Evanno 2005 method. Written to evanno.txt. default=%default')
   parser.add_option('--clumpp',dest='clumpp', action='store_true', default=False,
                     help='Generates one K*.indfile for each value of K run, for use with CLUMPP. default=%default')

def checkOptions(parser, options):
   """checkOptions()
   """
   if not options.resultsDir:
       parser.error('Error, specify --dir \n')
   if not os.path.exists(options.resultsDir):
       parser.error('Error, --dir %s does not exist!\n' % options.resultsDir)
   if not os.path.isdir(options.resultsDir):
       parser.error('Error, --dir %s is not a directory!\n' % options.resultsDir)
   options.resultsDir = os.path.abspath(options.resultsDir)
   if not options.outDir:
       parser.error('Error, specify --out \n')
   if os.path.exists(options.outDir) and not os.path.isdir(options.outDir):
       parser.error('Error, --out %s already exists but is not a directory!\n' % options.outDir)
   if not os.path.exists(options.outDir):
       os.mkdir(options.outDir)
   
def addAttribute(p, value, run , data):
    if p == 'indivs':
        run.indivs = int(value)
    elif p == 'loci':
        run.loci = int(value)
    elif p == 'k':
        run.k = int(value)
    elif p == 'burnin':
        run.burnin = int(value)
    elif p == 'reps':
        run.reps = int(value)
    elif p == 'lnprob':
        run.estLnProb = float(value)
    elif p == 'meanln':
        run.meanLlh = float(value)
    elif p == 'varln':
        run.varLlh = float(value)
    else:
        sys.stderr.write('Error, unknown pattern type %s\n' % (p))

def readFile(filename, data):
    run = RunRecord()
    run.name = os.path.basename(filename)
    m = re.search('.*_(\d+)_f$', run.name)
    if m is not None:
        run.runNumber = m.group(1)
    file = open(filename, 'r')
    regExs = {'indivs' : '^(\d+) individuals$',
              'loci'   : '^(\d+) loci$',
              'k'      : '^(\d+) populations assumed$',
              'burnin' : '^(\d+) Burn\-in period$',
              'reps'   : '^(\d+) Reps$',
              'lnprob' : '^Estimated Ln Prob of Data\s+=\s+(\S+)$',
              'meanln' : '^Mean value of ln likelihood\s+=\s+(\S+)$',
              'varln'  : '^Variance of ln likelihood\s+=\s+(\S+)$',
              }
    pats = {}
    for r in regExs:
        pats[r] = re.compile(regExs[r])
    isDone = False
    for line in file:
        if isDone:
            break
        line = line.strip()
        for p in pats:
            m = re.search(pats[p], line)
            if m is not None:
                addAttribute(p, m.group(1), run, data)
                if p == 'varln':
                    isDone = True
    return validateRecord(run)

def validateRecord(run):
    if run.name == '':
        return None
    i = 0
    for r in [run.k, run.indivs, run.loci, run.burnin, run.reps,
               run.estLnProb, run.meanLlh, run.varLlh]:
        if r == -1:
            return None
        i += 1
    return run
    
def harvestFiles(data, options):
    files = glob.glob(os.path.join(options.resultsDir, '*_f'))
    if len(files) < 1:
        sys.stderr.write('Error, unable to locate any _f files in '
                         'the results directory %s' % options.resultsDir)
        sys.exit(1)
    data.records = {} # key is K, value is an array
    for f in files:
        run = readFile(f, data)
        if run is not None:
            if run.k not in data.records:
                data.records[run.k] = []
            data.records[run.k].append(run)
        else:
            sys.stderr.write('Error, unable to extract results from file %s.\n' % f)
            file.write('Did you remember to check the box \'Compute the probability of the data (for estimating K)\' in STRUCTURE?\n')
            file.close()
            sys.exit(0)
    data.sortedKs = data.records.keys()
    data.sortedKs.sort()

def writeRawOutputToFile(data, options):
    file = open(os.path.join(options.outDir, 'summary.txt'), 'w')
    file.write('# This document produced by structureHarvester.py %s\n' % __version__)
    file.write('# http://users.soe.ucsc.edu/~dearl/struct_harvest\n')
    file.write('# http://taylor0.biology.ucla.edu/struct_harvest\n')
    file.write('# Written by Dent Earl, dearl (a) soe ucsc edu.\n')
    file.write('# CITATION:\n# Earl, Dent A. and vonHoldt, Bridgett M. (2011)\n'
               '# STRUCTURE HARVESTER: a website and program for visualizing\n'
               '# STRUCTURE output and implementing the Evanno method.\n'
               '# Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n'
               '# Version: %s\n'
               % __version__)
    file.write('# File generated at %s\n' % (time.strftime('%Y-%b-%d %H:%M:%S %Z', time.localtime())))
    file.write('#\n')
    file.write('\n##########\n')
    file.write('# K\tReps\t'
                'mean est. LnP(Data)\t'
                'stdev est. LnP(Data)\n')

    for i in xrange(0, len(data.sortedKs)):
        k = data.sortedKs[i]
        file.write('%d\t%d\t%f\t%f\n' % (k, len(data.records[k]), data.estLnProbMeans[k],
                                           data.estLnProbStdevs[k]))
    file.write('\n##########\n')
    file.write('# File name\tRun #\t'
                'K\tEst. Ln prob. of data\t'
                'Mean value of Ln likelihood\tVariance of Ln likelihood\n')

    for k in data.records:
        for r in data.records[k]:
            if r.runNumber != -1:
                numStr = r.runNumber
            else:
                numStr = ''
            file.write('%s\t%s\t%d\t%.1f\t%.1f\t%.1f\n' % (r.name, numStr, r.k, r.estLnProb,
                                                            r.meanLlh, r.varLlh))
    file.write('\nSoftware written by Dent Earl while at EEB Dept, UCLA, BME Dept UCSC\n'
               'dearl (a) soe ucsc edu')
    file.write('CITATION:\nEarl, Dent A. and vonHoldt, Bridgett M. (2011)\n'
               'STRUCTURE HARVESTER: a website and program for visualizing\n'
               'STRUCTURE output and implementing the Evanno method.\n'
               'Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n'
               'Version: %s' % __version__)
    file.close()

def clumppGeneration(data, options):
    if not options.clumpp:
        return
    minK = data.sortedKs[0]
    maxK = data.sortedKs[-1]
    regex1 = r'''^(\d+) # individual number
                 \s+(\S+) # Label
                 \s+(\S+) # %Miss 
                 \s+([\-\d]+)\s+: # Pop, can be -9
                 \s+(.*?)$ # Inferred clusters'''
    regex2 = r'''^(\d+) # individual number
                 \s+(\S+) # Label
                 \s+(\S+)\s+: # %Miss. if only 1 inferred cluster, then 1 is not printed
                 \s+(.*?)$ # Inferred clusters'''
    # regex2 also covers the new style of confidence intervals following the standard values
    regex3 = r'''^(\d+) # individual number
                 \s+(\S+)\s+: # %Miss
                 \s+([\d. ]*?)$ # Inferred clusters'''
    regexIntervals = '\(\S+,\S+\)' # used to trim off the confidence intervals
    pats = []
    for r in [regex1, regex2, regex3]:
        pats.append(re.compile(r, re.X))
    patInterval = re.compile(regexIntervals)
    for k in data.sortedKs:
        f = open (os.path.join(options.outDir, 'K%d.indfile' % k), 'w')
        for r in data.records[k]:
            inFile = open (os.path.join(options.resultsDir, r.name), 'r')
            printing = False
            for lineno, line in enumerate(inFile, 1):
                line = line.strip()
                if line == '':
                    printing = False
                    continue
                d = line.split(' ')
                if d[0] == 'Label' and d[1] == '(%Miss)' or d[0] == '(%Miss)':
                    printing = True
                    continue
                # find all confidence intervals and remove them
                m = re.findall(patInterval, line)
                for i in m:
                    line = line.replace(i, '')
                line = line.strip()
                if printing:
                    m1 = re.match(pats[0], line)
                    m2 = re.match(pats[1], line)
                    m3 = re.match(pats[2], line)
                    if m1 is None and m2 is None and m3 is None:
                        sys.stderr.write('Error clumpp generation, %s failed '
                                         'to match with regexs "%s", "%s", "%s" on lineno %d [%s]\n' 
                                         % (r.name, regex1, regex2, regex3, lineno, line))
                        sys.exit(1)
                    else:
                        if m1 is not None:
                            f.write('%3d %3d %s %2d : %s\n' % (int(m1.group(1)), int(m1.group(1)), m1.group(3),
                                                                int(m1.group(4)), m1.group(5)))
                        elif m2 is not None:
                            f.write('%3d %3d %s 1 : %s\n' % (int(m2.group(1)), int(m2.group(1)),
                                                              m2.group(3), m2.group(4)))
                        elif m3 is not None:
                            f.write('%3d %3d %s 1 : %s\n' % (int(m3.group(1)), int(m3.group(1)),
                                                              m3.group(2), m3.group(3)))
            f.write('\n')
        f.close()
    clumppPopFile(data, options)

def clumppPopFile(data, options):
    regex1 = r'''^(\d+): # Given Pop                         
                 \s+([\d. ]*?) # All Inferred Cluster columns
                 \s+(\d+)$ # # Number of Individuals'''
    f = None
    pats = []
    for r in [regex1]:
        pats.append(re.compile(r, re.X))
    for k in data.sortedKs:
        for r in data.records[k]:
            inFile = open (os.path.join(options.resultsDir, r.name), 'r')
            printing = False
            skipLine = False
            workComplete = False
            for lineno, line in enumerate(inFile, 1):
                if workComplete:
                    continue
                line = line.strip()
                d = line.split()
                if printing == True and line.startswith('--------------------'):
                    workComplete = True
                    printing = False
                    continue
                if len(d) < 3:
                    continue
                if d[0] == 'Given' and d[1] == 'Inferred' and d[2] == 'Clusters':
                    printing = True
                    f = open (os.path.join(options.outDir, 'K%d.popfile' % k), 'a')
                    skipLine = True
                    continue
                if skipLine:
                    skipLine = False
                    continue
                if printing:
                    m1 = re.match(pats[0], line)
                    if m1 is None:
                        sys.stderr.write('Error, clumpp popfile generation failed\n'
                                         'Your file %s has unexpected line structure!\n'
                                         'Job failed '
                                         'to match regex\n%s\non this line:\n%s\n'
                                         'Please verify your input file\'s characters '
                                         'and try again after making the correction.\n\n' % 
                                         (r.name, regex1, line))
                    else:
                        f.write('%3d:' % (int(m1.group(1))))
                        for i in xrange(2, len(m1.groups()) + 1):
                            f.write('\t%s' % m1.group(i))
                        f.write('\n')
            if f is not None:
                f.write('\n')
                f.close()
                f = None

def evannoMethod(data, options):
    if not options.evanno:
        return
    value = evannoTests(data)
    if value is not None:
        sys.stderr.write('Unable to perform Evanno method for the follow reason(s):\n')
        sys.stderr.write(value)
        sys.exit(1)
    calculatePrimesDoublePrimesDeltaK(data)
    writeEvannoTableToFile(data, options)

def writeEvannoTableToFile(data, options):
    file = open(os.path.join(options.outDir, 'evanno.txt'), 'w')
    file.write('# This document produced by structureHarvester.py %s\n' % __version__)
    file.write('# http://users.soe.ucsc.edu/~dearl/struct_harvest\n')
    file.write('# http://taylor0.biology.ucla.edu/struct_harvest\n')
    file.write('# Written by Dent Earl, dearl (a) soe ucsc edu.\n')
    file.write('# CITATION:\n# Earl, Dent A. and vonHoldt, Bridgett M. (2011)\n'
               '# STRUCTURE HARVESTER: a website and program for visualizing\n'
               '# STRUCTURE output and implementing the Evanno method.\n'
               '# Conservation Genetics Resources DOI: 10.1007/s12686-011-9548-7\n'
               '# Version: %s\n'
               % __version__)
    file.write('# File generated at %s\n' % (time.strftime('%Y-%b-%d %H:%M:%S %Z', time.localtime())))
    file.write('#\n')
    file.write('\n##########\n')
    file.write('# K\tReps\t'
                'Mean LnP(K)\tStdev LnP(K)\t'
                'Ln\'(K)\t|Ln\'\'(K)|\tDelta K\n')
    for i in xrange(0, len(data.sortedKs)):
        k = data.sortedKs[i]
        if k in data.LnPK:
            LnPKstr = '%f' % data.LnPK[k]
        else:
            LnPKstr = 'NA'
        if k in data.LnPPK:
            LnPPKstr = '%f' % data.LnPPK[k]
        else:
            LnPPKstr = 'NA'
        if k in data.deltaK:
            deltaKstr = '%f' % data.deltaK[k]
        else:
            deltaKstr = 'NA'
        file.write('%d\t'
                    '%d\t%.4f\t%.4f\t'
                    '%s\t%s\t%s\n' % (k, len(data.records[k]),
                                       data.estLnProbMeans[k],
                                       data.estLnProbStdevs[k],
                                       LnPKstr, LnPPKstr, deltaKstr))
    file.close()

def evannoTests(data):
    """ sucess returns None, failure returns
    an html formatted string which can be directly
    printed in to the .html output.
    """

    fail = False
    numRuns = 0
    for k in data.records:
        numRuns += len(data.records[k])
    numReps = numRuns / float(len(data.records))
    out  = 'Stats: number of runs = %d, number of replicates = %.2f, ' % (numRuns, numReps)
    out += 'minimum K tested = %d, maximum K tested = %d.\n' % (data.sortedKs[0],
                                                                 data.sortedKs[-1])
    # Must test at least three values of K
    if len(data.sortedKs) >= 3:
        out += 'Test: You must test at least 3 values of K. PASS\n'
    else:
        fail = True
        out += 'Test: You must test at least 3 values of K. FAIL\n'
    
    # K values must be sequential
    if ((data.sortedKs[-1] + 1) - data.sortedKs[0]) == len(data.sortedKs):
        out += 'Test: K values must be sequential. PASS\n'
    else:
        fail = True
        out += 'Test: K values must be sequential. FAIL\n'
        out += ' *  '
        prevK = data.sortedKs[0] - 1
        i = 0
        for k in data.sortedKs:
            i += 1
            if i == len(data.sortedKs):
                spacer = ''
            else:
                spacer = ', '
            if k != prevK + 1:
                out += 'MISSING: %d%s' % (k, spacer)
            else:
                out += '%d%s' % (k, spacer)
            prevK = k
        out += '\n'

    # Number of replicates must be greater than 2 (stdev)
    if numReps > 1:
        out += 'Test: The number of replicates per K > 1. PASS\n'
    else:
        fail = True
        out += 'Test: The number of replicates per K > 1. FAIL\n'

    # Standard Devation for a K (but not the first or last K) is zero                                       
    for i in xrange(1, len(data.sortedKs) - 1):                                                             
        k = data.sortedKs[i]                                                                                
        if data.estLnProbStdevs[ k ] < EPSILON: # our epsilon                                               
            fail = True                                                                                     
            out += ('Test: Standard devation of est. Ln Pr(Data) '
                    'is less than 0.0000001. FAIL for K = %d. The Evanno test requires division '
                    'by the standard deviation of the est. Ln Pr(Data) values for all K between '
                    'the first and last K value, and thus when the standard deviation is within '
                    'Epsilon (%f) of zero we cannot perform the test. (Try running more '
                    'replicates to hopefully increase the standard deviation for that value of '
                    'K.)\n' % (k, EPSILON) )

    # Standard Devation for one K is zero
    for k in data.sortedKs:
        if data.estLnProbStdevs[k] < EPSILON: # our epsilon
            fail = True
            out += ('Test: Standard devation of est. Ln Pr(Data) '
                    ' < 0.0000001. FAIL on K = %d. (Try running more replicates.)\n' % k)
    if fail == True:
        return out
    return None

def calculateMeansAndSds(data, options):
    data.estLnProbMeans  = {}
    data.estLnProbStdevs = {}
    for k in data.records:
        data.estLnProbMeans[k] = 0
        data.estLnProbStdevs[k] = 0
        for r in data.records[k]:
            data.estLnProbMeans[k] += r.estLnProb
        data.estLnProbMeans[k] /= len(data.records[k])
        if len(data.records[k]) > 1:
            for r in data.records[k]:
                data.estLnProbStdevs[k] += (r.estLnProb - data.estLnProbMeans[k]) ** 2
            data.estLnProbStdevs[k] /= (len(data.records[k]) - 1)
            data.estLnProbStdevs[k] = math.sqrt(data.estLnProbStdevs[k])

def calculatePrimesDoublePrimesDeltaK(data):
    data.LnPK   = {} 
    data.LnPPK  = {}
    data.deltaK = {}
    for i in xrange(1, len(data.sortedKs)):
        thisK = data.sortedKs[i]
        prevK = data.sortedKs[i - 1]
        data.LnPK[thisK] = data.estLnProbMeans[thisK] - data.estLnProbMeans[prevK]
    for i in xrange(1, len(data.sortedKs) - 1):
        prevK = data.sortedKs[i - 1]
        thisK = data.sortedKs[i]
        nextK = data.sortedKs[i + 1]
        data.LnPPK[thisK] = abs(data.LnPK[nextK] - data.LnPK[thisK])
        data.deltaK[thisK] = abs(data.estLnProbMeans[nextK] -
                                     2.0 * data.estLnProbMeans[thisK] +
                                     data.estLnProbMeans[prevK]) / float(data.estLnProbStdevs[thisK])
def main():
    usage = ('usage: %prog --dir=path/to/dir/ --out=path/to/dir/ [options]\n\n'
              '%prog takes a STRUCTURE results directory (--dir) and an\n'
              'output directory (--out will be created if it does not exist) and then\n'
              'depending on the other options selected harvests data from the results\n'
              'directory and performs the selected analyses')
    data = Data()
    parser = OptionParser(usage = usage, version = '%prog '+ __version__)
    initOptions(parser)
    (options, args) = parser.parse_args()
    checkOptions(parser, options)

    harvestFiles(data, options)
    calculateMeansAndSds(data, options)
    clumppGeneration(data, options)
    evannoMethod(data, options)
    writeRawOutputToFile(data, options)

if __name__ == '__main__':
    main()