ACEAssemblySplitter.pyΒΆ

#! /usr/bin/env python2.7

desc = """

Split an ACE format assembly file into multiple single ACE files each with a
    single contig. The resulting ACE contig files are formatted so that they can
    be read by CodonCode (http://www.codoncode.com).

$ ACEAssemblySplitter.py test.ace -s 2 -e 3 
  - prints the second and third contigs in the test.ace file

Version 1.0
Cymon J. Cox
Nov 2011
"""

import sys
try:
    import argparse
except ImportError:
    print "Requires Python >= 2.7."
    sys.exit()
import os
import argparse
import textwrap
import subprocess

def count(filename, verbose=True):

    cmd = "grep -c '^CO ' %s" % filename
    child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    stdout, stderr = child.communicate()
    if not child.returncode == 0:
        print "grep returned error %s" % child.returncode
        print "stderr: %s\n" %stderr
        sys.exit(1)
    else:
        if verbose:
            print "Number of contigs in %s: %s" % (filename, stdout)
    return int(stdout)


def main(filename, count_only, start, end, verbose):

    if not os.path.exists(filename):
        print "Cannot find file %s" % filename
        sys.exit(1)

    if count_only:
        count(filename)
        sys.exit(0)
    else:
        if start or end:
            n_contigs = count(filename, verbose=False)
        if start and end:
            if start > end:
                msg = "Cannot start at %i and end at %i"
                print msg (start, end)
                sys.exit(1)
        if start > n_contigs:
            msg = "Only %i contigs in %s - cannot start at %i"
            print msg % (n_contigs, filename, start)
            sys.exit(1)
        if end > n_contigs:
            msg = "Only %i contigs in %s - cannot end at %i"
            print msg % (n_contigs, filename, end)
            sys.exit(1)
    
    if not end:
        end = n_contigs

    isPreviousAF = False
    inRT = False
    inCT = False
    skipNextLine = False
    inHeader = True
    foundStart = False
    lines = []
    counter = 1
    fileLines = open(filename).readlines()
    
    for line in fileLines:
        if line.startswith("CO "):
            if lines != []:
                if inHeader:
                    #Skip over header:
                    lines = []
                    inHeader = False
                    lines.append(line)
                    continue

                #Are we there yet?
                if not foundStart:
                    if counter == start:
                        foundStart = True
                    else:
                        lines = []
                        counter +=1
                        lines.append(line)
                        continue

                if foundStart:
                    if counter < end:
                        #CO FucusALL_rep_c1 1819 1428 94 U
                        name = lines[0].split()[1]
                        outfilename = "%s.ace" % name
                        head = "AS 1 1\n\n"
                        lines.insert(0, head)
                        fh = open(outfilename, "w")
                        fh.writelines(lines)
                        fh.close()
                        counter +=1 
                        lines = []
                    else:
                        break

            lines.append(line)
            continue
        if skipNextLine:
            skipNextLine = False
            continue
        #Delete CT and RT stanzas
        elif line.startswith("RT{"):
            inRT = True
            continue
        elif line.startswith("CT{"):
            inCT = True
        elif line.startswith("}"):
            #Both CT and RT finish with a C}
            inCT = inRT = False
            #Need to miss the next "\n" line
            skipNextLine = True
            continue
        elif inCT or inRT:
            continue
        #Delete the blank line after AF line
        elif line.startswith("AF"):
            isPreviousAF = True
            lines.append(line)
        elif line.startswith("\n") and isPreviousAF:
            isPreviousAF = False
            continue
        elif line.startswith("RD "):
            #delete previous "\n" line
            lines = lines[:-1]
            lines.append(line)
        elif line.startswith("DS "):
            #delete previous "\n" line
            lines = lines[:-1]
            lines.append(line)
        else:
            lines.append(line)
    
    #print the last one:
    name = lines[0].split()[1]
    outfilename = "%s.ace" % name
    head = "AS 1 1\n\n"
    lines.insert(0, head)
    fh = open(outfilename, "w")
    fh.writelines(lines)
    fh.close()
    counter +=1 
    
    if verbose:
        print "Done. Split %i Contigs." % counter

if __name__ == "__main__":

    parser = argparse.ArgumentParser(
            formatter_class=argparse.RawDescriptionHelpFormatter,
            description=textwrap.dedent(desc),
            )
    parser.add_argument("filename",
                        help="ACE assembly filename",
                        )
    parser.add_argument("-c", "--count", default=False, action='store_true',
        help="Count number of contigs in assembly only. Default=False")
    parser.add_argument("-s", "--start", default=1, type=int, dest="start",
        help="Start at record number N (integer). Default=1")
    parser.add_argument("-e", "--end", default=None, type=int, dest="end",
        help="End at record number N (integer). Default=last in file")
    parser.add_argument("-v", "--verbose", default=False, action='store_true',
        help="Verbose. Default=False")
    args = parser.parse_args()
    main(args.filename, args.count, args.start, args.end, args.verbose)