combineNexus.pyΒΆ

#! /usr/bin/env python2.7

"""
combineNexus.py - Combine Nexus formatted matrices

Cymon J. Cox <cymon.cox@googlemail.com>

Reads Nexus formatted matrices and combines them into a single matrix with
blank sequences for genes that are missing from individual matrices.

"""

VERSION = 1.0
DATE = '10 Jan 2010'

import os
import sys
from optparse import OptionParser
from collections import Counter
import p4

def Matrix(self):
    return dict([(seq.name, seq.sequence) for seq in self.sequences])
p4.Alignment.Matrix = property(Matrix)

def main(matrix_paths, outfile_name):
    for path in matrix_paths:
        try:
            p4.read(path)
        except p4.Glitch, e:
            if "Alignment.checkForAllGapColumns" in e.__str__():
                error = e.__str__().split("\n")
                filename = error[3].split("=")[1]
                sites = error[6].split("-")[2]
                raise RuntimeError, "\t All gap column(s) in file %s at site(s): %s" \
                        % (filename, sites)
            else:
                raise
    data_types = Counter()
    for a in p4.var.alignments:
        if not isinstance(a, p4.Alignment):
            raise RuntimeError, "\t Alignments must be a list of p4.Alignment objects"
        data_types[a.dataType] += 1
    if len(data_types) > 1:
        raise RuntimeError, "\t Matrices have different data types: %s" \
                % ", ".join(list(data_types.elements()))
    else:
        datatype = list(data_types.elements())[0]
    matrices = [(a.Matrix, a.nChar) for a in p4.var.alignments]
    nChars  = sum([m[1] for m in matrices])
    taxa = []
    for a in p4.var.alignments:
        for taxon in a.taxNames:
            if taxon not in taxa:
                taxa.append(taxon)
    NTax = len(taxa)
    if not taxa:
        raise RuntimeError, "\t TaxNames must be set in the alignments"
    of = open(outfile_name, "w")
    s = "#NEXUS\n\nbegin data;\ndimensions ntax=%s nChar=%s;\nformat " + \
        "datatype=%s gap=- missing=?;\nmatrix\n" 
    of.write(s % (NTax, nChars, datatype))
    for taxon in taxa:
        of.write("\t%-70s%s\n" % (taxon, 
            "".join([matrix.get(taxon, "-"*nChar) for matrix, nChar in matrices])))
    of.write("\t;\nend;\n\nbegin sets;\n\n")
    #Print out the alignment lengths
    count = 0
    for a in p4.var.alignments:
        #print "%s: %s = %s" % (os.path.basename(a.fName), a.nChar, count)
        of.write("\tcharset %s = %s-%s [len: %s]\n" % (os.path.basename(a.fName),
                    count+1, count+a.nChar, a.nChar))
        count += a.nChar
    of.write("\nend;\n")
    of.close()

if __name__ == "__main__":
    msg = """\n\tcombineNexus.py [options] <Nexus matrix 1> <Nexus matrix 2> etc,


    Reads Nexus formatted matrices and combines them into a single matrix with
    blank sequences for genes that are missing from individual matrices.

    Version  %s %s
    Cymon J. Cox <cymon.cox@googlemail.com>
    """
    usage = msg % (VERSION, DATE)
    parser=OptionParser(usage)
    parser.add_option("-o", "--outfile", dest="outfile",
            help="Name of output file (Default: combinedNexus.nex)",
            type="string", default="combinedNexus.nex")
    parser.add_option("-c", "--check-matrices", dest="check",
            help="Check for all gap columns (Default: False)",
            action="store_true", default=False)
    (options,args) = parser.parse_args(sys.argv[1:])
    #Override any Config variables
    p4.var.doCheckForDuplicateSequences = False #Should be off
    if options.check:
        p4.var.doCheckForAllGapColumns = True
    else:
        p4.var.doCheckForAllGapColumns = False
    if os.path.exists(options.outfile):
        parser.error("Ouput file '%s' already exists" % options.outfile)
    if len(args) == 0:
        parser.error("No matrices indicated")
    if len(args) < 2:
        parser.error("Only one Nexus matrix indicated")
    else:
        for mat in args:
            if not os.path.exists(mat):
                parser.error("Couldn't find matrix '%s'" % mat)
    main(args, options.outfile)