# Source code for physcraper.ids

"""Link together NCBI and Open Tree identifiers and names, with Gen Bank information for updated sequences"""
import sys
import os
import time
from urllib.error import HTTPError
from Bio import Entrez
from physcraper import ncbi_data_parser, ConfigObj  # is the ncbi data parser class and associated functions
from physcraper.helpers import debug

_DEBUG = 0




class IdDicts():
    """Class contains different taxonomic identifiers and helps to find the
    corresponding ids between ncbi and OToL.

    To build the class the following is needed:

      * **config_obj**: Object of class config (see above)
      * **workdir**: the path to the assigned working directory

    During the initializing process the following self objects are generated:

      * **self.config**: contains the Config class object
      * **self.ott_to_ncbi**: dictionary
          * key: OToL taxon identifier
          * value: ncbi taxon identifier
      * **self.ncbi_to_ott**: dictionary
          * key: ncbi taxon identifier
          * value: OToL taxon identifier
      * **self.ott_to_name**: dictionary
          * key: OToL taxon identifier
          * value: OToL taxon name
      * **self.acc_ncbi_dict**: dictionary
          * key: Genbank identifier
          * value: ncbi taxon identifier
      * **self.spn_to_ncbiid**: dictionary
          * key: taxon name
          * value: ncbi taxon identifier
      * **self.ncbiid_to_spn**: dictionary
          * key: ncbi taxon identifier
          * value: ncbi taxon name

      NOTE(review): user-defined mrca OTT-IDs were removed from IdDicts and
      live on the scrape object instead.

      * **Optional**:
          * depending on blasting method:
              * self.ncbi_parser: for local blast, initializes the ncbi_parser
                class, that contains information about rank and identifiers
    """

    def __init__(self, configfile=None):
        """Generates a series of name disambiguation dicts.

        :param configfile: one of None (build a default ConfigObj), an
            existing ConfigObj instance, or a path to a config file.
            Exits with an error message for a path that does not exist.
        """
        if configfile is None:
            self.config = ConfigObj()
        elif isinstance(configfile, ConfigObj):
            self.config = configfile
        elif os.path.exists(configfile):
            self.config = ConfigObj(configfile)
        else:
            sys.stderr.write("Error reading config file {}\n".format(configfile))
            sys.exit()
        assert self.config
        self.ott_to_ncbi = {}
        self.ncbi_to_ott = {}  # used to get ott_id for new Genbank query taxa
        self.ott_to_name = {}  # used in add_otu to get name from otuId
        self.acc_ncbi_dict = {}  # filled by ncbi_parser (by subprocess in earlier versions of the code).
        self.spn_to_ncbiid = {}  # spn to ncbi_id, it's only fed by the ncbi_data_parser, but makes it faster
        self.ncbiid_to_spn = {}
        # self.config.ott_ncbi is in the taxonomy folder of the repo and needs to be
        # updated by devs when the OpenTree taxonomy changes, by downloading the
        # taxonomy and then running:
        # grep ncbi: ottVERSION/taxonomy.tsv | sed -r -e
        # "s/([0-9]+).+?\|.+?\|(.+?)\|.+?\|.*ncbi:([0-9]+).*/\\1,\\3,\\2/" > physcraper/taxonomy/ott_ncbi
        # Use a context manager so the mapping file is always closed.
        with open(self.config.ott_ncbi) as fi:
            for lin in fi:
                lii = lin.split(",")
                self.ott_to_ncbi[int(lii[0])] = int(lii[1])
                self.ncbi_to_ott[int(lii[1])] = int(lii[0])
                self.ott_to_name[int(lii[0])] = lii[2].strip()  # todo merge into ott_to_ncbi?
        assert len(self.ott_to_ncbi) > 0
        assert len(self.ott_to_name) > 0
        assert len(self.ncbi_to_ott) > 1000
        if self.config.blast_loc == 'remote':
            debug("Config remote {}".format(self.config.blast_loc))
        else:
            # ncbi parser contains information about spn, tax_id, and ranks
            debug("Config not remote {}".format(self.config.blast_loc))
            self.ncbi_parser = ncbi_data_parser.Parser(names_file=self.config.ncbi_names,
                                                       nodes_file=self.config.ncbi_nodes)
        self.acc_tax_seq_dict = {}
        self.full_seq_path = "{}/downloaded_ncbi_sequences".format(self.config.taxonomy_dir)
[docs] def get_ncbiid_from_acc(self, acc): '''checks local dicts, and then runs eftech to get ncbi id for accession''' gb_id = acc if gb_id in self.acc_ncbi_dict: ncbi_id = self.acc_ncbi_dict[gb_id] elif gb_id in self.acc_tax_seq_dict: ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"] else: # Disabling unused variable bc not used locally but used by other functions later on taxid, taxname, seq = self.get_tax_seq_acc(gb_id) # pylint: disable=unused-variable ncbi_id = taxid return ncbi_id
    # Removed function find_tax_id because it was not being used.
[docs] def get_tax_seq_acc(self, acc): """Pulls the taxon ID and the full sequences from NCBI""" if not os.path.exists(self.full_seq_path): os.makedirs(self.full_seq_path) gb_id = acc if len(gb_id.split(".")) == 1: debug("accession number {} not recognized".format(gb_id)) return None, None, None seq_path = "{}/{}_remote.fasta".format(self.full_seq_path, gb_id) ncbi_id = tax_name = seq = None if gb_id in self.acc_tax_seq_dict: tax_name = self.acc_tax_seq_dict[gb_id]["taxname"] ncbi_id = self.acc_tax_seq_dict[gb_id]["^ncbi:taxon"] seq = self.acc_tax_seq_dict[gb_id]["seq"] elif os.path.exists(seq_path): fi = open(seq_path) header = fi.readline().strip('>') assert header.split()[1].startswith('taxname:') tax_name = header.split()[1].strip('taxname:') ncbi_id = header.split()[2].strip('ncbi:') self.ncbiid_to_spn[ncbi_id] = tax_name self.acc_ncbi_dict[gb_id] = ncbi_id seq = "".join(fi.readlines()) if seq is None: read_handle = self.entrez_efetch(gb_id) tax_name = ncbi_data_parser.get_ncbi_tax_name(read_handle) ncbi_id = ncbi_data_parser.get_ncbi_tax_id(read_handle) seq = read_handle[0][u'GBSeq_sequence'] tax_name = tax_name.replace(" ", "_") #TODO check that searches are using names without spaces self.ncbiid_to_spn[ncbi_id] = tax_name self.acc_ncbi_dict[gb_id] = ncbi_id self.acc_tax_seq_dict[gb_id] = {'taxname':tax_name, "^ncbi:taxon":ncbi_id, 'seq':seq}#ToDo memory hog with open(seq_path, 'w') as fi: fi.write("> {} taxname:{} ncbi:{}\n".format(gb_id, tax_name, ncbi_id)) fi.write(self.acc_tax_seq_dict[gb_id]['seq']) assert ncbi_id is not None return ncbi_id, tax_name, seq
[docs] def entrez_efetch(self, gb_id): """ Wrapper function around efetch from ncbi to get taxonomic information if everything else is failing. Also used when the local blast files have redundant information to access the taxon info of those sequences. It adds information to various id_dicts. :param gb_id: Genbank identifier :return: read_handle """ tries = 10 Entrez.email = self.config.email if self.config.api_key: Entrez.api_key = self.config.api_key handle = None # method needs delay because of ncbi settings for i in range(tries): try: # print("try") delay = 1.0 previous = time.time() while True: current = time.time() wait = previous + delay - current if wait > 0: # print("if", wait) time.sleep(wait) previous = current + wait else: # print("else", wait) previous = current if delay + .5 * delay <= 5: # print("if2", delay) delay += .5 * delay else: # print("else2", delay) delay = 5 # print("read handle") handle = Entrez.efetch(db="nucleotide", id=gb_id, retmode="xml") assert handle is not None, ("your handle file to access data from efetch does not exist. " "Likely an issue with the internet connection of ncbi. Try rerun...") read_handle = Entrez.read(handle) handle.close() return read_handle except (IndexError, HTTPError) as e: if i < tries - 1: # i is zero indexed continue else: raise e # break assert handle is not None, ("your handle file to access data from efetch does not exist. " "Likely an issue with the internet connection of ncbi. Try rerun...") read_handle = Entrez.read(handle) handle.close() return read_handle