martyr.pyΒΆ

#! /usr/bin/env python

"""
A re-implementation of MARTA (http://bergelson.uchicago.edu/software/marta)

Unlike MARTA, this re-implementation works with mRNA transcripts from EST
projects and works with blastn(x) output.

"""
import os
import sys
import subprocess
from optparse import OptionParser
from BioSQL import BioSeqDatabase
from Bio.Blast import NCBIXML


def retrieve_taxonomy(ncbi_taxon_id):
    """Retrieve the taxonomic lineage using the NCBI taxon-id from a BIOSQL
    DB"""
    taxonomy = []
    #We only have the ncbi_taxon_id so use it to get the first bioSQL taxon_id
    #then iterate up through the taxonomy
    SQL = """SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id 
        FROM taxon, taxon_name
        WHERE taxon.taxon_id=taxon_name.taxon_id
        AND taxon_name.name_class='scientific name'
        AND taxon.ncbi_taxon_id = %s"""
    name, rank, parent_taxon_id = server.adaptor.execute_one(SQL % (ncbi_taxon_id,))
    taxonomy.append(name)
    taxon_id = parent_taxon_id
    while taxon_id:
        SQL = """SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id 
        FROM taxon, taxon_name
        WHERE taxon.taxon_id=taxon_name.taxon_id
        AND taxon_name.name_class='scientific name'
        AND taxon.taxon_id = %s"""

        name, rank, parent_taxon_id = server.adaptor.execute_one(SQL % (taxon_id,))
        
        if taxon_id == parent_taxon_id:
        # If the taxon table has been populated by the BioSQL script
        # load_ncbi_taxonomy.pl this is how top parent nodes are stored.
        # Personally, I would have used a NULL parent_taxon_id here.
            break
        if rank != "no rank":
        #For consistency with older versions of Biopython, we are only
        #interested in taxonomy entries with a stated rank.
        #Add this to the start of the lineage list.
            taxonomy.insert(0, name)
        taxon_id = parent_taxon_id
    return taxonomy

def get_NCBI_taxonID_from_GI_number(gi_number, bdbname="nr", 
                                    path_to_blastdbcmd="blastdbcmd",
                                    strict_annotation=True,
                                    strict_taxonomy=True,
                                    verbose=True):
    """Use blastdbcmd to obtain the NCBI taxon-id using a GI number
    """
    cmdline = "%s -db %s -entry %s -outfmt %%T" % (path_to_blastdbcmd, bdbname, gi_number)
    #print cmdline
    child = subprocess.Popen(str(cmdline),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            shell=(sys.platform!="win32"))
    output, error = child.communicate()
    #print "OUTPUT: X%sX" % output
    #The record is missing from the db 
    if "not found" in error != "":
        if strict_annotation:
            print "\n\n\tUnable to find GI %s in BLAST DB %s. This suggest " \
                    "that the BLAST analyses were not run against the %s " \
                    "BLAST DB." % (gi_number, bdbname, bdbname)
            print "\t(Note 'strict_annotation' is currently set True. Set " \
                    "False to ignore unfound GI numbers. It is possible for " \
                    "NCBI to remove or update records: see GI 109050796 for " \
                    "instance.)"
            sys.exit(1)
        else:
            sys.stderr.write("Unable to find GI %s in BLAST DB %s.\n" % \
                (gi_number, bdbname))
            sys.stderr.flush()
            return None
    elif output.rstrip("\n") == "0":
        #This maybe due to the taxonomy not being applied to the database or it
        #may be due to an odd record in e.g. "nr" that doesnt have a taxon_id
        print "Taxonomy is missing from BLAST DB %s" % bdbname
        if strict_taxonomy:
            print "Format BLAST DB with makeblastdb -taxid_map <taxon map>"
            print "To ignore missing taxon_id's use the -i option"
            print "Refusing to continue."
            sys.exit(1)
        else:
            print "WARNING - continuing anyway"
            return None
    else:
        lines = output.split("\n")# Maybe more than one taxon_id on seperate
                                  # lines
        #print "lines: %s" % lines
        ncbi_taxon_id = lines[0].strip()
        try:
            ncbi_taxon_id = int(ncbi_taxon_id)
        except ValueError:
            if verbose:
                print "\tTaxon id returned from blasdbcmd is not a number: " \
                      "%s" % output
                print "Command: %s" % cmdline
    return ncbi_taxon_id

def main(blast_record, bdbname,
                    e_value_threshold=1e-4,
                    restrict_list=None,
                    outfilename="martyr.annot",
                    strict_annotation=True,
                    strict_taxonomy=True,
                    verbose=True):
    """Main loop
    """
    
    restriction_records = []
    restriction_records_found = []
    if restrict_list:
        fh = open(restrict_list, "r")
        lines = fh.read()
        fh.close()
        #lose the last ''
        restriction_records = lines.split("\n")[:-1]
        if verbose:
            print "Read %s record identifiers from the restrict_list file: %s" \
                % (len(restriction_records), restrict_list)

    fh = open(outfilename, "w")
    fh.write("%s, %s, %s, %s, %s, %s\n" % 
                ("record",  # Name of the contig query record
                 "hits",    # Number of hits used to determine taxonomy
                 "score",   # Bit score of hit(s)
                 "evalue",  # Expect score
                 "cov",     # Coverage
                 "lineage"  # The lineage
                ))
                
    try:
        blast_results = NCBIXML.parse(open(blast_record))
    except ValueError, e:
        if "Your XML file was empty" in e.__str__():
            print "ERROR failed to obtain valid blast result"
        else:
            raise ValueError, "Could not find blast record: %s" % blast_record

    record_count = 0
    missing_gis  = []
    #May be muliple blast records:
    while 1:
        try:
            blast_rec = blast_results.next()
            record_count += 1
        except StopIteration:
            if verbose:
                print "\nNO MORE RECORDS"
            break
        #Restrict records
        if restriction_records != []:
            if blast_rec.query not in restriction_records:
                if verbose:
                    print "Skipping %s" % blast_rec.query
                    continue
            else:
                restriction_records_found.append(blast_rec.query)
                if verbose:
                    print "\n ***** Record: %s" % blast_rec.query

        #We just want the top blast record hit maybe missing in BLAST+:
        if len(blast_rec.alignments) == 0:
            #<Iteration_message>No hits found</Iteration_message>
            if verbose:
                print "No blast hits - continuing to next record"
            continue
        else:
            if verbose:
                print "Number of hits: %s" % len(blast_rec.alignments)
                print "Query length: %s" % blast_rec.query_length

        count = 1
        lineages = []
        high_bitscore = 0.0
        we_have_a_taxonomy = False
        for hit in blast_rec.alignments:
            if verbose:
                print "Hit number %s" % count
                print "Hit id: %s" % hit.hit_id
                print "Hit def: %s" % hit.hit_def
                print "Number of hsps': %s" % len(hit.hsps)

            #Get stats for first hsps
            expect = hit.hsps[0].expect
            bits = hit.hsps[0].bits
            score = hit.hsps[0].score
            if count == 1:
                high_bitscore = score
            else:
                if score < high_bitscore and we_have_a_taxonomy:
                    if verbose:
                        print "Lower bitscore (%s) breaking..." % score
                    break
            align_length = hit.hsps[0].align_length
            coverage = ((float(align_length)/float(blast_rec.query_length))*100)

            if expect > e_value_threshold:
                if verbose:
                    print "E-value too low: %s" % expect
                count +=1
                continue

            #In custom made BLAST dbs the gi number may appear in the hit_id or
            #the hit_def - search for it and throw an error if its not found
            
            #Hit_id
            ind = hit.hit_id.find("gi|")
            if ind != -1:
                gi_number = int(hit.hit_id[ind+3:].split("|")[0].strip())
            else:
                #Try hit_def
                ind = hit.hit_def.find("gi|")
                if ind == -1:
                    print "Unable to find a GI definition in either the " \
                          "hit_id or hit_def indentifiers. If the target " \
                          "BLAST database entries didn't have a GI " \
                          "identifier in the id or definition line, then " \
                          "they cannot be annotated with this script."
                    sys.exit(1)
                else:
                    gi_number = int(hit.hit_def[ind+3:].split("|")[0].strip())
            
            if verbose:
                print "\tGI: %s" % gi_number,
            ncbi_taxon_id = get_NCBI_taxonID_from_GI_number(gi_number, bdbname,
                                strict_annotation=strict_annotation,
                                strict_taxonomy=strict_taxonomy,
                                verbose=options.verbose)
            if not ncbi_taxon_id:
                missing_gis.append(gi_number)
                continue
            elif ncbi_taxon_id == 32630: #32630 "Synthetic contruct"
                count +=1
                continue
            else:
                we_have_a_taxonomy = True

            if verbose:
                print "TID: %s" % ncbi_taxon_id
            lineage = retrieve_taxonomy(ncbi_taxon_id)
            lineages.append(lineage)
            if verbose:
                print "\tHit %s: %.2f - %.2f - %.2f - %.2f%% - %s" % (count, expect,
                    bits, score, coverage, lineage)

            count +=1

        #There may only be one hit and it may be missing from BLAST DB
        if len(lineages) == 0:
            if verbose:
                print "\n\tNo lineages recovered. Aborting"
            continue

        #Find highest common taxon if lineages > 1
        if len(lineages) > 1:
            ranks = []
            for rank in zip(*lineages):
                same = True
                for i in rank[1:]:
                    if i == rank[0]:
                        continue
                    else:
                        same = False
                if same:
                    ranks.append(rank[0])
            if len(ranks) == 0:
                the_lineage = "Different domains!"
            else:
                the_lineage = " ".join(ranks)
        else:
            the_lineage = " ".join(lineages[0])

        # Because we are only considering the top scoring hits
        # the score and coverage will be the same even if there
        # are several top-scoring hits with different lineages
        # record, hits, score, coverage, lineage
        fh.write("%s, %s, %4.i, %s, %2.i, %s\n" %
                    (blast_rec.query,
                     len(lineages),
                     score,
                     expect,
                     coverage,
                     the_lineage))

    print "Number of BLAST records read: %s" % record_count
    print "Number of missing GI numbers in BLAST DB: %i" % len(missing_gis)
    for g in missing_gis:
        print "\t%s" % g
    if restriction_records != []:
        missing = [r for r in restriction_records if r not in
                restriction_records_found]
        print "Number of records in restrict_list not found in BLAST XML: %s" \
            % len(missing)
        for rec in missing:
            print "\t%s" % rec
    print "Done."



if __name__ == "__main__":
    usage   = """\n\t%prog [options] <blast record filename>

    A Python version of MARTA (http://bergelson.uchicago.edu/software/marta). It
    basically annotates BLAST (x or n) XML output with the NCBI taxonomy.

    Requires: Biopython, blastdbcmd, NCBI taxonomy dump and the target BLAST DB
    in $BLASTDB, and a BioSQL database in PostGreSQL with the NCBI taxonomy
    loaded.

    Note that only top scoring hits are considered - no 'slippage' is
    implemented. Also there is strict lineage scoring. If there are two or more
    best hits (same scores) which have different lineages then the record is
    annotated with the highest taxonomic rank in common.

    Version 1.0
    CJC Fri 20 Aug 2010
              """

    parser=OptionParser(usage)
    #BIOSQL DB NAME
    parser.add_option("-d", "--dbname", dest="dbname", help="BIOSQL DB " +\
            "name (Must have NCBI taxonomy loaded). Default: 'biosqldb'",
            default="biosqldb", type="string")
    #BIOSQL DB USER NAME
    parser.add_option("-u", "--dbusername", dest="dbusername", help="BIOSQL " +\
            "DB username. Default: 'root'", default="root", type="string"),
    #BIOSQL BIODATABASE NAME
    parser.add_option("-b", "--biodbname", dest="biodbname", help="BIOSQL " +\
            "Biodatabase name (Need not have bioentries). Default: 'taxonomy'",
            default="taxonomy", type="string")
    #BLAST DB NAME
    parser.add_option("-n", "--blastdbname", dest="bdbname", help="BLAST DB " +\
            "name Default: 'nr'", default="nr", type="string"),
    #E-VALUE THRESHOLD
    parser.add_option("-e", "--e-value", dest="evalue", help="E-value " +\
            "threshold above which hits will not be considered Default: 1e-4",
            default=1e-4, type="float")
    #CSV OUTFILE NAME
    parser.add_option("-o", "--oufilename", dest="outfile", help="Name of " +\
            "CSV output file. Default: martyr_annotations.csv",
            default="martyr_annotations.csv", type="string")
    #RESTRICT LIST
    parser.add_option("-r", "--restict_list", dest="restrict_list",
            help="Restrict the annotations to only those identifiers/record " +\
            "names listed in this file. One record name per line.", default=None)
    #Strict annotation
    parser.add_option("-s", "--strict_annotation", dest="strict_annotation",
            help="Abort if GI's missing from the BLAST DB. Default: True",
            default=True, action="store_false")
    #Strict taxonomy
    parser.add_option("-i", "--strict_taxonomy", dest="strict_taxonomy",
            help="Abort if taxon ID's missing from the BLAST DB. Default: True",
            default=True, action="store_false")
    parser.add_option("-v", "--verbose", dest="verbose", help="Lots of " +\
            "debugging output to stdout.", default=False, action="store_true")

    (options,args) = parser.parse_args(sys.argv[1:])
    if len(args) != 1:
        print "\n\tThe input XML BLAST record is needed as an argument\n"
        sys.exit(1)
    else:
        if not os.path.exists(args[0]):
            print "\n\tFile %s does not exist\n" % args[0]
            sys.exit(1)

    if options.restrict_list:
        if not os.path.exists(options.restrict_list):
            print "\n\tRestict list file %s does not exist\n" % args[0]
            sys.exit(1)

    #Are BLAST db and TAXONOMY DB available
    child = subprocess.Popen("blastdbcmd -db %s -info" % options.bdbname,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=(sys.platform!="win32"))
    output, error = child.communicate()
    if "not found" in error:
        print "\n\t'blastdbcmd' must be in your $PATH\n"
        sys.exit(1)
    elif "BLAST Database error" in error:
        print "\n\tUnable to locate BLAST DB '%s' at $BLASTDB" % \
              options.bdbname
        sys.exit(1)

    #Try connecting to the database
    try:
        server = BioSeqDatabase.open_database(driver="psycopg2",
            user=options.dbusername, passwd="",
            host = "localhost", db=options.dbname)
    except:
        print "\n\tUnable to connect to PostGreSQL database:\n"
        print "\tserver = BioSeqDatabase.open_database(driver='psycopg2', "\
              "user='%s', passwd='', host='localhost', " \
              "db='%s')\n" % (options.dbusername, options.dbname)
        sys.exit(1)

    #Try accessing the biodatabase
    try:
        db = server[options.biodbname]
    except KeyError:
        print "\n\tCould not open BioSQL.biodatabase '%s' in DB '%s'" % \
                (options.biodbname, options.dbname)
        print "\n\tPerhaps you need to create it?"
        print "\n\tdb = server.new_database('taxonomy', " \
              "description='A blank taxonomy db')\n"
        sys.exit(1)
    #Is the NCBI taxonomy install in DB
    SQL = """SELECT count(*) 
        FROM taxon_name
        WHERE taxon_name.name='Homo sapiens'"""
    count = server.adaptor.execute_one(SQL)
    if count[0] != 1:
        print "\n\tBioSQL database needs NCBI taxomony installed."
        print "\tUse load_ncbi_taxonomy.pl\n"
        sys.exit(1)

    #Check output file destination
    if os.path.exists(options.outfile):
        print "Output results file %s already exists. Refusing to overwrite" % \
            options.outfile
        sys.exit(1)

    main(args[0], options.bdbname, options.evalue,
            restrict_list=options.restrict_list,
            outfilename=options.outfile,
            strict_annotation=options.strict_annotation,
            strict_taxonomy=options.strict_taxonomy,
            verbose=options.verbose)