tct.pyΒΆ

#! /usr/bin/env python
"""
tct - Tree Congruence Test(er)

Cymon J. Cox <cymon.cox@googlemail.com>

Reads two rooted trees (NEXUS and/or PHYLIP format), reciprocally prunes each
tree of missing taxa (automatic), deletes any taxa passed to the programme (via
-d), and tests topological congruence among the remaining clades supported by a
value greater than the set threshold.  """

VERSION = 1.7
DATE = '10 Jan 2010'

import sys
import types
import ast
import exceptions
from optparse import OptionParser
import cStringIO
import p4


class TCTError(exceptions.Exception):
    def __init__(self, message):
        self.message = message
        Exception.__init__(self)
        return
    def __str__(self):
        print "\n\n\tTCTError: %s" % self.message
    
def _makeAllCladesList(self):
    """
    Provides a list of lists of all taxa in all clades in the tree
    t = p4.Tree() instance
    """
    all_clades = []
    #for node in internal_nodes:
    for node in self.root.iterInternals():
        all_clades.append([[n.name for n in node.iterLeaves()], node.name,
            node.nodeNum])
    return all_clades

p4.Tree._makeAllCladesList = _makeAllCladesList   

def _compareSubTrees(t1, t2, threshold, verbose, root_taxon):
    tree1Clades = t1._makeAllCladesList()
    tree2Clades = t2._makeAllCladesList()
    # If tregqe has been re-rooted then need to delete first clade which is the
    # root taxon / rest of taxa split
    if root_taxon != None:
        del tree1Clades[0]
        del tree2Clades[0]
    conf = []
    number_of_conflicts = 0
    for subClade1 in [x for x in tree1Clades if x[1] != None and float(x[1]) >=
                threshold]:
        for subClade2 in [y for y in tree2Clades if y[1] != None and float(y[1])
                >= threshold]:
            intersect, notinl1, notinl2 = _checkSubtreeConflict(subClade1[0], 
                        subClade2[0])
            if intersect:
                sys.stderr.write('.')
                number_of_conflicts += 1
                conf.append("\n%s" % (60*"#"))
                conf.append("\nConflict # %s" % number_of_conflicts)
                conf.append("\n\nTree1 (%s):" % t1.name)
                conf.append("\n\tClade (Node # %s) Support = %2.2f, # of Taxa = %s:\n\t"
                        % (str(subClade1[2]), float(subClade1[1]), len(subClade1[0])))
                if verbose or len(subClade1[0]) <= 7:
                    conf.append(', '.join(subClade1[0]))
                else:
                    conf.append(', '.join(subClade1[0][:3]) + '   [...]   ' + 
                                ', '.join(subClade1[0][-3:]))
                conf.append("\nTree2 (%s): " % t2.name)
                conf.append("\n\tClade (Node # %s) Support = %2.2f, # of Taxa = %s:\n\t"
                        % (str(subClade2[2]), float(subClade2[1]), len(subClade2[0])))
                if verbose or len(subClade2[0]) <= 7:                    
                    conf.append(', '.join(subClade2[0]))
                else:
                    conf.append(', '.join(subClade2[0][:3]) + '   [...]   ' + 
                                ', '.join(subClade2[0][-3:]))
                conf.append("\n\nShared taxa in both subclades, # of Taxa = %s : " 
                                % len(intersect))
                if verbose or len(intersect) <= 7:
                    conf.append(', '.join(intersect))
                else:
                    conf.append(', '.join(intersect[:3]) + '   [...]   ' + 
                                ', '.join(intersect[-3:]))
                conf.append("\nTaxa not in %s, # of Taxa = %s : " 
                                % (t1.name, len(notinl1)))
                if verbose or len(notinl1) <= 7:
                    conf.append(', '.join(notinl1))
                else:
                    conf.append(', '.join(notinl1[:3]) + '   [...]   ' + 
                                ', '.join(notinl1[-3:]))
                conf.append("\nTaxa not in %s, # of Taxa = %s : " 
                                % (t2.name, len(notinl2)))
                if verbose or len(notinl2) <= 7:
                    conf.append(', '.join(notinl2))
                else:
                    conf.append(', '.join(notinl2[:3])  + '   [...]   ' + 
                                ', '.join(notinl2[-3:]))
                conf.append("\n")
    return number_of_conflicts, ''.join(conf)

def _checkSubtreeConflict(l1,l2):
    l1.sort()
    l2.sort()
    intersect = [taxon for taxon in l1 if taxon in l2]
    notinl1   = [taxon for taxon in l2 if taxon not in l1]
    notinl2   = [taxon for taxon in l1 if taxon not in l2]
    if not (intersect == [] or intersect == l1 or intersect == l2 or l1 == l2):
        return intersect, notinl1, notinl2
    else:
        return None, None, None
        
def _pruneTreesToComplement(tree1, tree2, root_taxon, delete, outfile, verbose):
    # Delete out the missing taxa from trees so that they intersect:
    tree1_taxa = [n.name for n in tree1.nodes if n.isLeaf]
    tree2_taxa = [n.name for n in tree2.nodes if n.isLeaf]
    delete_from_tree1 = set([t for t in tree1_taxa if t not in tree2_taxa])
    delete_from_tree2 = set([t for t in tree2_taxa if t not in tree1_taxa])
    for t in delete_from_tree1:
        tree1.removeNode(t)
    for t in delete_from_tree2:
        tree2.removeNode(t)
    # If a delete list is passed delete them!
    if len(delete) != 0:
        t1t2d = delete_from_tree1 | delete_from_tree2
        for t in delete:
            if t not in t1t2d and t != '':
                tree1.removeNode(t)
                tree2.removeNode(t)
    # Change root if necessary
    if root_taxon != None:
        try:
            tree1.reRoot(root_taxon)
            tree2.reRoot(root_taxon)
        except AttributeError:
            raise TCTError, 'Re-rooting to %s: Taxon not found in tree' % \
                root_taxon
    if len([n.name for n in tree1.nodes if n.isLeaf]) != \
            len([n.name for n in tree2.nodes if n.isLeaf]):
        raise TCTError, 'Something unexpected went wrong with the taxon ' + \
                'pruning. After pruning trees have different number of taxa.'

    if verbose:
        # Document deletions
        outfile.write('\nNumber of taxa in tree1 ==> %s \n' % len(tree1_taxa))
        outfile.write('Number of taxa in tree2 ==> %s \n' % len(tree2_taxa))
        outfile.write('%s missing taxa deleted from tree1 ==> %s \n' %
                (len(delete_from_tree1), ', '.join(delete_from_tree1)))
        outfile.write('%s missing taxa deleted from tree2 ==> %s \n' %
                (len(delete_from_tree2), ', '.join(delete_from_tree2)))
        if len(delete) != 0:
            outfile.write('Taxa deleted via -d option ==> %s \n' % 
                ', '.join(delete))
    return tree1, tree2, delete_from_tree1 | delete, delete_from_tree2 | delete

def _parseTreesInput(trees):
    """If run as __main__ trees must be filenames, else if run as module they
       can be either tree filenames or p4.Tree instances
    """
    if len(trees) != 2:
        if __name__ == '__main__':
            raise TCTError, 'You must supply two tree file names, each file ' + \
                    'containing a single tree'
        else:
            raise TCTError, 'You must supply either the names of two tree ' + \
                    'files OR two p4.Tree() instances'
    if isinstance(trees[0], p4.Tree):
        if not isinstance(trees[1], p4.Tree):
            raise TCTError, 'You must supply two p4.Tree() instance if you ' + \
                    'supply one'
        else:
            tree1 = trees[0]
            tree2 = trees[1]
    else:
        # Should be dealing with file names
        if isinstance(trees[0], str):
            if not isinstance(trees[1], str):
                raise TCTError, 'You must supply two filenames if you supply one'
        else:
            msg = '%s does not appear to be either a filename or a ' + \
                    'p4.Tree() instance'
            raise TCTError, msg % str(trees[0])
        # Try reading the files
        try:
            p4.read(trees[0])
        except IOError:
            raise TCTError, 'Unable to read tree file: %s' % trees[0]
        if len(p4.var.trees) != 1:
            print p4.var.trees
            raise TCTError, '%s contains more that one tree' % trees[0]
        try:
            p4.read(trees[1])
        except IOError:
            raise TCTError, 'Unable to read tree file: %s' % trees[1]
        if len(p4.var.trees) != 2:
            raise TCTError, '%s contains more that one tree' % trees[1]
        tree1 = p4.var.trees[0]
        tree2 = p4.var.trees[1]
        #Check that support values are valid - ie 0.0-1.0
        for tree in [tree1, tree2]:
            found = False
            for node in tree.iterInternalsNoRoot():
                if not node.name:
                    continue
                else:
                    value = ast.literal_eval(node.name)
                    if not isinstance(value, float):
                        msg = 'A node name (%s) in tree %s is not a floating ' + \
                              'point number. Names on nodes should be decimal ' + \
                              'numbers (0.0-1.0) representing the support ' + \
                              'probability.'
                        raise TCTError,  msg % (node.name, tree.fName)
                    else:
                        if value > 1.0:
                            msg = 'Node support value > 1.0 in tree %s'
                            raise TCTError, msg % tree.fName
                        found = True
            if not found:
                msg = 'No valid support values (0.0-1.0) were found in tree %s'
                raise TCTError, msg % tree.fName
    return tree1, tree2

def _parseSpecifiedDeleteTaxa(delete):
    """If run as __main__ delete should be a filename, else if run interactively
    may be either a filename or a list object
    """
    if isinstance(delete, list):
        delete = set(delete)
    else:
        try:
            delete = set([t.strip() for t in open(delete, 'rU').readlines()])
        except IOError:
            raise TCTError, 'Could not open file %s' % delete
    return delete

def _writePrunedTrees(t1, t2, d1, d2):
    message = '\n\tTree Congruence Test (tct.py): Tree used to conduct the ' + \
              'tree congruence\n\ttest after pruning of taxa missing from ' + \
              'comparison tree (%s)\n\tand any taxa deleted via the -d ' + \
              'option.\n\n\tNEXUS taxset of deleted taxa:\n\tbegin sets; ' + \
              '\n\t\ttaxset delete = %s;\n\tend;\n'

    t1.writeNexusFile(outFileName='%s-TCT1.nex' % t1.name, message=message % 
            (t2.name, ' '.join(d1)))
    t2.writeNexusFile(outFileName='%s-TCT2.nex' % t2.name, message=message % 
            (t1.name, ' '.join(d2)))
    
def treeCongruenceTest(trees=[], root_taxon=None, threshold=0.95, verbose=False,
                        delete=None, write_trees=False, outfile=None):
    """
    If running as module this function accepts the following options:

    trees=[]          -> list of 2 tree filenames or list of 2 p4.Tree() 
                         instances
    root_taxon=None   -> name of taxon to re-root the two trees (Optional)
    threshold=0.95    -> threshold support value for conflict
    verbose=False     -> when True gives long description of conflicts
    delete=None       -> either a filename of file with taxa to be deleted 
                         from both trees, or a list of such taxa
    write_trees=False -> if true, the pruned trees used to test incongruence 
                         are written in (alt)NEXUS format
    outfile=None      -> filename to write conflict descriptions to, else is 
                         printed to stdout
    """
    # Deal with output direction
    if outfile == None:
        outfile = sys.stdout
    else:
        try:
            outfile = open(outfile, 'w')
        except IOError:
            raise TCTError, 'Could not write outfile to disk'
    tree1, tree2 = _parseTreesInput(trees)
    if threshold > 1:
        raise TCTError, 'Incorrect support threshold: %f' % threshold
    if delete != None:
        sys.stderr.write('\tNote: Deleting taxa from file %s ... \n' % delete)
        delete = _parseSpecifiedDeleteTaxa(delete)
    else:
        delete = set([])
    if verbose:
        outfile.write('Tree Congruence Test(er) - Version %s\n\n' + \
                '\t***** %s vs %s *****\n' % (VERSION, tree1.name, tree2.name))
        sys.stderr.write('\tNote: Pruning trees to complement...\n')
        sys.stderr.write('\tNote: Searching for conflicts:')
    t1, t2, d1, d2 = _pruneTreesToComplement(tree1, tree2, root_taxon, delete, 
            outfile, verbose)
    number_of_conflicts, description = _compareSubTrees(t1, t2, threshold, 
            verbose, root_taxon)
    # Finish up
    if write_trees:
        _writePrunedTrees(t1, t2, d1, d2)
    if number_of_conflicts == 0:
        outfile.write('\n ***** No conflict detected at %s threshold ' + \
                '*****' % threshold)
        pass
    else:
        outfile.write('\n%s\n' % (60*'#'))
        outfile.write('\n%s%s CONFLICTS DETECTED!\n' % (' '*8,
                number_of_conflicts))
        outfile.write('\n%s\n' % (60*'#'))
        outfile.write(description)
    sys.stderr.write('\nDone.\n')
    outfile.close()
            
if __name__=='__main__':
    msg   = 'usage: Tree Congruence Tester [options] treefile1 treefile2 ' + \
            '\n\nTree Congruence Test(er): reads two rooted trees (NEXUS and/or ' + \
            'PHYLIP format),\nreciprocally prunes each tree of missing taxa ' + \
            '(automatic), deletes any taxa\npassed to the programme (via -d), ' + \
            'and tests topological congruence among the\nremaining clades ' + \
            'supported by a value greater than the set threshold.\n\n' + \
            'Version  %s %s \nCymon J. Cox <cymon.cox@googlemail.com> & ' + \
            'Frank Kauff'
    usage = msg % (VERSION, DATE)

    parser=OptionParser(usage)
    parser.add_option('-t','--threshold', dest='threshold',
                        help='Support threshold - values must be between ' + \
                        '0.0-1.0 (Default: 0.95)', default=.95, type='float')
    parser.add_option('-r','--root-with', dest='root_taxon', 
                        help='Specify taxon to root trees. (Default: None)', default=None)
    parser.add_option('-d', '--delete-taxa', dest='delete', default=None,
                        help='Taxa specified in a file will be deleted from the ' + \
                        'congruence test. The input file should be a plain ASCII ' + \
                        'text file with one taxon on each line, as they appear ' + \
                        'in the tree description, and without punctuation. Use ' + \
                        'this to iterate over taxon deletion until no conflicts ' + \
                        'occur.')
    parser.add_option('-o', '--outfile-name', dest='outfile', default=None,
                        help='Write results to named file. (Default: False)')
    parser.add_option('-w', '--write-tree-files', dest='write_trees', 
                        action='store_true', default=False,
                        help='Write the pruned trees used for the tree ' + \
                        'congruence test in (alt)NEXUS format. File names are ' + \
                        'the tree name from the input NEXUS tree file + TCT(1 ' + \
                        'or 2).nex. A NEXUS sets block containing a taxset of ' + \
                        'taxa deleted from the tree is included in a comment ' + \
                        'in the written tree file. (Default: False)')
    parser.add_option('-v','--verbose', dest='verbose', 
                        action='store_true', default=False,
                        help='If set, all taxa in a clade are printed. ' + \
                        '(Default: False)')
    (options,args) = parser.parse_args(sys.argv[1:])
    treeCongruenceTest(args, options.root_taxon, options.threshold, options.verbose,
                        options.delete, options.write_trees, options.outfile)