Source code for physcraper.ids

"""Link together NCBI and Open Tree identifiers and names, with Gen Bank information for updated sequences"""
import sys
import os
import time
from urllib.error import HTTPError
from Bio import Entrez
from physcraper import ncbi_data_parser, ConfigObj  # is the ncbi data parser class and associated functions
from physcraper.helpers import debug

_DEBUG = 0




[docs]class IdDicts():
    """Class contains different taxonomic identifiers and helps to find the corresponding ids between ncbi and OToL

        To build the class the following is needed:

          * **config_obj**: Object of class config (see above)
          * **workdir**: the path to the assigned working directory

        During the initializing process the following self objects are generated:

          * **self.workdir**: contains path of working directory
          * **self.config**: contains the Config class object
          * **self.ott_to_ncbi**: dictionary

              * key: OToL taxon identifier
              * value: ncbi taxon identifier
          * **self.ncbi_to_ott**: dictionary

              * key: OToL taxon identifier
              * value: ncbi taxon identifier
          * **self.ott_to_name**: dictionary

              * key: OToL taxon identifier
              * value: OToL taxon name
          * **self.acc_ncbi_dict**: dictionary

              * key: Genbank identifier
              * value: ncbi taxon identifier
          * **self.spn_to_ncbiid**: dictionary

              * key: OToL taxon name
              * value: ncbi taxon identifier
          * **self.ncbiid_to_spn**: dictionary

              * key: ncbi taxon identifier
              * value: ncbi taxon name

          user defined list of mrca OTT-ID's #TODO this is flipped form the dat aobj .ott_mrca. On purpose?
         #reomved mrca's from ida, and put them into scrape object

          * **Optional**:

            * depending on blasting method:
            * self.ncbi_parser: for local blast,
               initializes the ncbi_parser class, that contains information about rank and identifiers
    """

    def __init__(self, configfile=None):
        """Generates a series of name disambiguation dicts"""
        if configfile is None:
            self.config = ConfigObj()
        elif isinstance(configfile, ConfigObj):
            self.config = configfile
        elif os.path.exists(configfile):
            self.config = ConfigObj(configfile)
        else:
            sys.stderr.write("Error reading config file {}\n".format(configfile))
            sys.exit()
        assert self.config
        self.ott_to_ncbi = {}
        self.ncbi_to_ott = {}  # used to get ott_id for new Genbank query taxa
        self.ott_to_name = {}  # used in add_otu to get name from otuId
        self.acc_ncbi_dict = {}  # filled by ncbi_parser (by subprocess in earlier versions of the code).
        self.spn_to_ncbiid = {}  # spn to ncbi_id, it's only fed by the ncbi_data_parser, but makes it faster
        self.ncbiid_to_spn = {}
        fi = open(self.config.ott_ncbi)
        # This is in the taxonomy folder of the repo, needs to be updated by devs when OpenTree taxonomy changes.
        # by downloading teh taxonomy and then using
        # grep ncbi: ottVERSION/taxonomy.tsv | sed -r -e
        # "s/([0-9]+).+?\|.+?\|(.+?)\|.+?\|.*ncbi:([0-9]+).*/\\1,\\3,\\2/" > physcraper/taxonomy/ott_ncbi
        for lin in fi:
            lii = lin.split(",")
            self.ott_to_ncbi[int(lii[0])] = int(lii[1])
            self.ncbi_to_ott[int(lii[1])] = int(lii[0])
            self.ott_to_name[int(lii[0])] = lii[2].strip()  # todo merge into ott_to_ncbi?
        fi.close()
        assert len(self.ott_to_ncbi) > 0
        assert len(self.ott_to_name) > 0
        assert len(self.ncbi_to_ott) > 1000
        if self.config.blast_loc == 'remote':
            debug("Config remote {}".format(self.config.blast_loc))
        else:  # ncbi parser contains information about spn, tax_id, and ranks
            debug("Config not remote {}".format(self.config.blast_loc))
            self.ncbi_parser = ncbi_data_parser.Parser(names_file=self.config.ncbi_names,
                                                       nodes_file=self.config.ncbi_nodes)
        self.acc_tax_seq_dict = {}
        self.full_seq_path = "{}/downloaded_ncbi_sequences".format(self.config.taxonomy_dir)


[docs]    def get_ncbiid_from_acc(self, acc):
        '''checks local dicts, and then runs eftech to get ncbi id for accession'''
        gb_id = acc
        if gb_id in self.acc_ncbi_dict:
            ncbi_id = self.acc_ncbi_dict[gb_id]
        elif gb_id in self.acc_tax_seq_dict:
            ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
        else:
            # Disabling unused variable bc not used locally but used by other functions later on
            taxid, taxname, seq = self.get_tax_seq_acc(gb_id) # pylint: disable=unused-variable
            ncbi_id = taxid
        return ncbi_id




 #removed function find_tax_id because it wasn't being used
[docs]    def get_tax_seq_acc(self, acc):
        """Pulls the taxon ID and the full sequences from NCBI"""
        if not os.path.exists(self.full_seq_path):
            os.makedirs(self.full_seq_path)
        gb_id = acc
        if len(gb_id.split(".")) == 1:
            debug("accession number {} not recognized".format(gb_id))
            return None, None, None
        seq_path = "{}/{}_remote.fasta".format(self.full_seq_path, gb_id)
        ncbi_id = tax_name = seq = None
        if gb_id in self.acc_tax_seq_dict:
            tax_name = self.acc_tax_seq_dict[gb_id]["taxname"]
            ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"]
            seq = self.acc_tax_seq_dict[gb_id]["seq"]
        elif os.path.exists(seq_path):
            fi = open(seq_path)
            header = fi.readline().strip('>')
            assert header.split()[1].startswith('taxname:')
            tax_name = header.split()[1].strip('taxname:')
            ncbi_id = header.split()[2].strip('ncbi:')
            self.ncbiid_to_spn[ncbi_id] = tax_name
            self.acc_ncbi_dict[gb_id] = ncbi_id
            seq = "".join(fi.readlines())
        if seq is None:
            read_handle = self.entrez_efetch(gb_id)
            tax_name = ncbi_data_parser.get_ncbi_tax_name(read_handle)
            ncbi_id = ncbi_data_parser.get_ncbi_tax_id(read_handle)
            seq = read_handle[0][u'GBSeq_sequence']
            tax_name = tax_name.replace(" ", "_") #TODO check that searches are using names without spaces
            self.ncbiid_to_spn[ncbi_id] = tax_name
            self.acc_ncbi_dict[gb_id] = ncbi_id
            self.acc_tax_seq_dict[gb_id] = {'taxname':tax_name, "^ncbi:taxon":ncbi_id, 'seq':seq}#ToDo memory hog
            with open(seq_path, 'w') as fi:
                fi.write("> {} taxname:{} ncbi:{}\n".format(gb_id, tax_name, ncbi_id))
                fi.write(self.acc_tax_seq_dict[gb_id]['seq'])
        assert ncbi_id is not None
        return ncbi_id, tax_name, seq



[docs]    def entrez_efetch(self, gb_id):
        """ Wrapper function around efetch from ncbi to get taxonomic information if everything else is failing.
            Also used when the local blast files have redundant information to access the taxon info of those sequences.
        It adds information to various id_dicts.

        :param gb_id: Genbank identifier
        :return: read_handle
        """
        tries = 10
        Entrez.email = self.config.email
        if self.config.api_key:
            Entrez.api_key = self.config.api_key
        handle = None

        # method needs delay because of ncbi settings
        for i in range(tries):
            try:
                # print("try")
                delay = 1.0
                previous = time.time()
                while True:
                    current = time.time()
                    wait = previous + delay - current
                    if wait > 0:
                        # print("if", wait)
                        time.sleep(wait)
                        previous = current + wait
                    else:
                        # print("else", wait)
                        previous = current
                    if delay + .5 * delay <= 5:
                        # print("if2", delay)
                        delay += .5 * delay
                    else:
                        # print("else2",  delay)
                        delay = 5
                    # print("read handle")
                    handle = Entrez.efetch(db="nucleotide", id=gb_id, retmode="xml")
                    assert handle is not None, ("your handle file to access data from efetch does not exist. "
                                                "Likely an issue with the internet connection of ncbi. Try rerun...")
                    read_handle = Entrez.read(handle)
                    handle.close()

                    return read_handle
            except (IndexError, HTTPError) as e:
                if i < tries - 1:  # i is zero indexed
                    continue
                else:
                    raise e
            # break
        assert handle is not None, ("your handle file to access data from efetch does not exist. "
                                    "Likely an issue with the internet connection of ncbi. Try rerun...")
        read_handle = Entrez.read(handle)
        handle.close()
        return read_handle
Source code for physcraper.ids

Physcraper

Navigation

Related Topics