Source code for physcraper.opentree_helpers

"""Linker Functions to get data from OpenTree"""

import json
import copy
import sys
import os
import xml
from urllib.error import HTTPError
import requests
import dendropy
from dendropy import DataSet, datamodel
from opentree import OT, object_conversion
from nexson.syntax import get_subtree_otus


import physcraper
from physcraper.helpers import standardize_label, to_string



_VERBOSE = 0

[docs]def set_verbose(): """Set output verbosity """ global _VERBOSE # pylint: disable=global-statement _VERBOSE = 1
_DEBUG = 1
[docs]def debug(msg): """short debugging command """ if _DEBUG == 1: print(msg)
physcraper_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) phylesystemref = "McTavish EJ, Hinchliff CE, Allman JF, Brown JW, Cranston KA, Holder MT,\ Phylesystem: a gitbased data store for community curated phylogenetic estimates. \ Bioinformatics. 2015 31 2794-800. doi: 10.1093/bioinformatics/btv276\n" synthref = "Redelings BD, Holder MT. A supertree pipeline for summarizing phylogenetic and \ taxonomic information for millions of species. PeerJ. 2017;5:e3058. https://doi.org/10.7717/peerj.3058 \n"
[docs]def root_tree_from_synth(tree, otu_dict, base='ott'): # Disable all the too many locals and branches for this function # pylint: disable=too-many-locals, too-many-branches """Uses information from OpenTree of Life to suggest a root. :param tree: dendropy :class:`Tree :param otu_dict: a dictionary of tip label metadata, inculding an '^ot:ottId'attribute 'param base: either `synth` or `ott.` If `synth` will use OpenTree synthetic tree relationships to root input tree, if `ott` will use OpenTree taxonomy. """ leaves = [leaf.taxon.label for leaf in tree.leaf_nodes()] spp = {otu_dict[otu]['^ot:ottId'] for otu in leaves} if None in spp: spp.remove(None) assert(base in ['synth', 'ott']) if base == 'synth': ## ONLY included SPP with phylo information. synth_ids = ottids_in_synth() synth_spp = set() for sp in spp: if sp in synth_ids: synth_spp.add(int(sp)) if len(synth_spp) <= 3: sys.stdout.write("Didn't find enough taxon matches in synth tree to root. Tree is unrooted\n") return tree resp = OT.synth_induced_tree(ott_ids=synth_spp) induced_tree_of_taxa = resp.tree base = "OpenTree synthetic tree" elif base == 'ott': tax_mrca = OT.taxon_mrca(spp).response_dict['mrca']['ott_id'] resp = OT.taxon_subtree(tax_mrca) taxleaves = [leaf.taxon.label for leaf in resp.tree.leaf_nodes()] matches = [label for label in taxleaves if int(label.split()[-1].strip('ott')) in spp] induced_tree_of_taxa = resp.tree.extract_tree_with_taxa_labels(matches) base = "OpenTree taxonomy" node = None for node in induced_tree_of_taxa: if node.parent_node is None: break # stop when node has no parents, i.e., it is the root node children = node.child_nodes() representative_taxa = [] for child in children: if len(child.child_nodes()) > 1: subtre = child.extract_subtree() tip = subtre.leaf_nodes()[0] representative_taxa.append(tip.taxon.label) else: representative_taxa.append(child.leaf_nodes()[0].taxon.label) sys.stdout.write("Rooting tree based on taxon relationships in {}. \ \nRoot will be MRCA of {}\n".format(base, ", ".join(representative_taxa))) ## Get tips for those taxa: ## TODO: make following code a function of its own # def get_tips(tree, otu_dict, leaves): phyloref = set() for tax in representative_taxa: ott_id = int(tax.split()[-1].strip('ott_id')) phyloref.add(ott_id) tips_for_root = set() for otu in otu_dict: if otu_dict[otu]['^ot:ottId'] in phyloref: if otu in leaves: tips_for_root.add(otu) continue mrca = tree.mrca(taxon_labels=tips_for_root) tree.reroot_at_node(mrca) return tree
[docs]def ottids_in_synth(synthfile=None): """Checks if OTT ids are present in current synthetic tree, using a file listing all current OTT ids in synth (v12.3) :param synthfile: defaults to taxonomy/ottids_in_synth.txt """ if synthfile is None: synthfile = open("{}/taxonomy/ottids_in_synth.txt".format(physcraper_dir)) synth_ottids = set() for lin in synthfile: ottid = lin.lstrip('ott').strip() if len(ottid) >= 1: synth_ottids.add(int(ottid)) return synth_ottids
[docs]def check_if_ottid_in_synth(ottid): """Web call to check if OTT id in synthetic tree. NOT USED. """ url = 'https://api.opentreeoflife.org/v3/tree_of_life/node_info' payload = json.dumps({"ott_id":int(ottid)}) headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'} try: r = requests.post(url, data=payload, headers=headers) if r.status_code == 200: return 1 elif r.status_code == 400: return 0 elif r.status_code == 502: sys.stderr.write("Bad OpenTree taxon ID: {}".format(ottid)) return 0 sys.stderr.write("unexpected status code from node_info call: {}".format(r.status_code)) return 0 except requests.ConnectionError: sys.stderr.write("Connection Error - coud not get taxon information from OpenTree\n")
[docs]def get_ottid_from_gbifid(gbif_id): """Returns a dictionary mapping GBIF ids to OTT ids. ott_id is set to 'None' if the GBIF id is not found in the Open Tree Taxanomy """ url = 'https://api.opentreeoflife.org/v3/taxonomy/taxon_info' headers = {'content-type':'application/json'} tax = int(gbif_id) payload = json.dumps(dict(source_id='gbif:{}'.format(tax))) res = requests.post(url, data=payload, headers=headers) if res.status_code == 200: ott_id = int(res.json()['ott_id']) return ott_id sys.stderr.write("error getting OTT id for GBIF id {}, {}, {}".format(tax, res.status_code, res.reason)) return None
[docs]def bulk_tnrs_load(filename): """Read in outputs from OpenTree Bulk TNRS, translates to a Physcraper OTU dictionary. :param filename: input json file """ otu_dict = {} with open(filename) as data_file: input_dict = json.load(data_file) if "names" in input_dict.keys(): for name in input_dict["names"]: i = 1 otu = "otu" + name['id'].strip('name') while otu in otu_dict.keys(): otu = "{}_{}".format(otu, i) i += 1 otu_dict[otu] = {"^ot:originalLabel":name["originalLabel"]} if name.get("ottTaxonName"): otu_dict[otu]["^ot:ottTaxonName"] = name["ottTaxonName"] if name.get("ottId"): otu_dict[otu]["^ot:ottId"] = name["ottId"] for source in name.get("taxonomicSources", []): #debug(source) if source: taxsrc = source.split(":") assert len(taxsrc) == 2, taxsrc otu_dict[otu]["^{}:taxon".format(taxsrc[0])] = taxsrc[1] for otu in otu_dict: otu_dict[otu]["^physcraper:status"] = "original" otu_dict[otu]["^physcraper:last_blasted"] = None otu_dict[otu]["^physcraper:ingroup"] = "unknown" else: for otu in input_dict: assert input_dict[otu]["^physcraper:status"], otu_dict[otu] otu_dict = input_dict return otu_dict
#def get_cite_for_study(study_id): # to complement the function we should do also def get_ott_id_data(ott_id): # following function assumes that study_id is an object from # url = 'https://api.opentreeoflife.org/v3/tree_of_life/induced_subtree' # headers = {'content-type':'application/json'} # payload = json.dumps(dict(ott_ids=ott_ids, label_format = label_format)) # res = requests.post(url, data=payload, headers=headers)
[docs]def get_citations_from_json(synth_response, citations_file): """Get ciattions for studies in an induced synthetic tree repsonse. :param synth_response: Web service call record :param citations_file: Output file """ assert isinstance(citations_file, str) f = open(citations_file, "w+") sys.stdout.write("Gathering citations ...") assert 'supporting_studies' in synth_response.keys(), synth_response.keys() for study in synth_response['supporting_studies']: study = study.split('@')[0] index_url = 'https://api.opentreeoflife.org/v3/studies/find_studies' headers = {'content-type':'application/json'} payload = json.dumps({"property":"ot:studyId", "value":study, "verbose":"true"}) res_cites = requests.post(index_url, data=payload, headers=headers) new_cite = res_cites.json()['matched_studies'] # debug(new_cite) sys.stdout.write('.') if new_cite: f.write(to_string(new_cite[0].get('ot:studyPublicationReference', '')) + '\n' + new_cite[0].get('ot:studyPublication', '') + '\n') f.close() sys.stdout.write("Citations printed to {}\n".format(citations_file))
# another way to do it is calling each id # get_citation_for_study(study_id) # use append
[docs]def conflict_tree(inputtree, otu_dict): """Write out a tree with labels that work for the OpenTree Conflict API """ tmp_tree = copy.deepcopy(inputtree) i = 1 for node in tmp_tree: i += 1 if node.taxon: otu = otu_dict[node.taxon.label] ottid = otu['^ot:ottId'] new_label = "_nd{}_ott{}".format(i, ottid) node.taxon.label = new_label else: node.label = "_nd{}_".format(i) return tmp_tree
[docs]def get_tree_from_synth(ott_ids, label_format="name", citation="cites.txt"): """Wrapper for OT.synth_induced_tree that also pulls citations""" synth_json = OT.synth_induced_tree(ott_ids=ott_ids, label_format=label_format) get_citations_from_json(synth_json.response_dict, citation) return synth_json.tree
[docs]def get_tree_from_study(study_id, tree_id, label_format="ot:originallabel"): """Create a dendropy Tree object from OpenTree data. :param study_id: OpenTree Study Id :param tree_id: OpenTree tree id :param label_format: One of 'id', 'name', "ot:originallabel", "ot:ottid", "ot:otttaxonname". defaults to "ot:originallabel" """ assert label_format in ['id', 'name', "ot:originallabel", "ot:ottid", "ot:otttaxonname"] study = OT.get_study(study_id) study_nexson = study.response_dict['data'] DC = object_conversion.DendropyConvert() tree_obj = DC.tree_from_nexson(study_nexson, tree_id, label_format) cites = study_nexson['nexml']['^ot:studyPublicationReference'] return tree_obj, cites
# ATT is a dumb acronym for Alignment Tree Taxa object
[docs]def generate_ATT_from_phylesystem(alnfile, aln_schema, workdir, configfile, study_id, tree_id, search_taxon=None, tip_label='^ot:originalLabel'): # Disable all the too many locals, branches and statements for this function # pylint: disable=too-many-locals, too-many-branches, too-many-statements """ Gathers together tree, alignment, and study info; forces names to OTT ids. Study and tree ID's can be obtained by using python ./scripts/find_trees.py LINEAGE_NAME Spaces vs underscores kept being an issue, so all spaces are coerced to underscores when data are read in. :param aln: dendropy :class:`DnaCharacterMatrix <dendropy.datamodel.charmatrixmodel.DnaCharacterMatrix>` alignment object. :param workdir: Path to working directory. :param config_obj: Config class containing the settings. :param study_id: OpenTree study id of the phylogeny to update. :param tree_id: OpenTree tree id of the phylogeny to update, some studies have several phylogenies. :param phylesystem_loc: Access the GitHub version of the OpenTree data store, or a local clone. :param search_taxon: optional. OTT id of the MRCA of the clade that shall be updated. :return: Object of class ATT. """ assert(tip_label in ['^ot:originalLabel', 'otu', "^ot:ottTaxonName", "^ot:ottId"]) try: study = OT.get_study(study_id) except AttributeError: sys.stderr.write("Failure getting study {} from OpenTree phylesystem.".format(study_id)) sys.exit() try: study_nexson = study.response_dict['data'] DC = object_conversion.DendropyConvert() tree_obj = DC.tree_from_nexson(study_nexson, tree_id) except KeyError: sys.stderr.write("Failure getting tree {} from study {} from OpenTree phylesystem.".format(tree_id, study_id)) sys.exit() # this gets the taxa that are in the subtree with all of their info - ott_id, original name, otu_dict = {tn.taxon.otu:{} for tn in tree_obj.leaf_node_iter()} orig_lab_to_otu = {} treed_taxa = {} ingroup_otus = get_subtree_otus(study_nexson, tree_id=tree_id, subtree_id="ingroup", return_format="otu_id") if ingroup_otus is None: sys.stdout.write("No ingroup annotation found in tree; using all taxa.\n \ Please update tree annotation through OpenTree curation app.\n") for leaf in tree_obj.leaf_node_iter(): tn = leaf.taxon otu_id = tn.otu otu_dict[otu_id]["^ot:ottId"] = tn.ott_id otu_dict[otu_id]["^ot:ottTaxonName"] = tn.ott_taxon_name otu_dict[otu_id]["^ot:originalLabel"] = tn.original_label.replace(" ", "_") otu_dict[otu_id]["^physcraper:status"] = "original" otu_dict[otu_id]["^physcraper:last_blasted"] = None if ingroup_otus: if otu_id in set(ingroup_otus): otu_dict[otu_id]["^physcraper:ingroup"] = True else: otu_dict[otu_id]["^physcraper:ingroup"] = False else: otu_dict[otu_id]["^physcraper:ingroup"] = 'unknown' orig = otu_dict[otu_id].get(u"^ot:originalLabel").replace(" ", "_") orig_lab_to_otu[orig] = otu_id if tip_label == 'otu': tn.label = otu_id else: tn.label = otu_dict[otu_id].get(tip_label) treed_taxa[orig] = otu_dict[otu_id].get(u"^ot:ottId") # need to prune tree to seqs and seqs to tree... ott_mrca = None if search_taxon: if isinstance(search_taxon, list): ott_ids = set(search_taxon) ott_mrca = get_mrca_ott(ott_ids) else: ott_mrca = int(search_taxon) if ott_mrca is None: ingroup_ott_ids = set() for otu_id in otu_dict: if ingroup_otus: if otu_dict[otu_id]["^physcraper:ingroup"]: ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId")) else: ingroup_ott_ids.add(otu_dict[otu_id].get(u"^ot:ottId")) if None in ingroup_ott_ids: ingroup_ott_ids.remove(None) assert len(ingroup_ott_ids) >= 1 ott_mrca = get_mrca_ott(ingroup_ott_ids) otu_newick = tree_obj.as_string(schema="newick") return physcraper.aligntreetax.AlignTreeTax(tree=otu_newick, otu_dict=otu_dict, alignment=alnfile, aln_schema=aln_schema, search_taxon=ott_mrca, workdir=workdir, configfile=configfile)
# newick should be bare, but alignment should be DNACharacterMatrix
[docs]def get_dataset_from_treebase(study_id): """ Given a phylogeny in OpenTree with mapped tip labels, this function gets an alignment from the corresponding study on TreeBASE, if available. By default, it first tries getting the alignment from the supertreebase repository at https://github.com/TreeBASE/supertreebase. If that fials, it tries getting the alignment directly form TreeBASE at https://treebase.org If both fail, it exits with a message. """ try: study = OT.get_study(study_id) nexson = study.response_dict['data'] except HTTPError as err: sys.stderr.write(err) sys.stderr.write("Could not find study id {} in OpenTree phylesystem.\n".format(study_id)) treebase_url = nexson['nexml'][u'^ot:dataDeposit'][u'@href'] if 'treebase' not in nexson['nexml'][u'^ot:dataDeposit'][u'@href']: sys.stderr.write("No treeBASE record associated with study ") sys.exit(-2) else: tb_id = treebase_url.split(':S')[1] url = "https://raw.githubusercontent.com/TreeBASE/supertreebase/master/data/treebase/S{}.xml".format(tb_id) try: dna = DataSet.get(url=url, schema="nexml") except (xml.etree.ElementTree.ParseError, dendropy.utility.error.DataParseError) as error1a: sys.stderr.write("\nERROR: Problems reading the input data from supertreebase:\n") message = str(error1a) sys.stderr.write("DENDROPY ERROR:'"+ message + "'" +'\n') if "protein" or "amino" in message: sys.stderr.write("\nIt appears this may not be a DNA alignment - physcraper can only use DNA currently.\n") sys.stderr.write("Investigate, and potentialy download alignment directly at \n{}\n".format(url)) sys.exit() except (requests.HTTPError) as error1b: # dendropy.utility.error.DataParseError occurs when a dataset has alternative states defined with symbol '{' sys.stdout.write("\nRetrieving dataset from supertreebase ({}) failed.\n".format(url) + "Trying TreeBASE...\n") url = "https://treebase.org/treebase-web/search/downloadAStudy.html?id={}&format=nexml".format(tb_id) try: dna = DataSet.get(url=url, schema="nexml") except (requests.HTTPError, dendropy.utility.error.DataParseError) as error2: url = "https://treebase.org/treebase-web/search/matrices.html?id={}".format(tb_id) sys.stderr.write("\nAutomatically retrieving a dataset from TreeBASE and supertreebase failed.\n" + "You can manually download a dataset for this study from TreeBASE {}.\n".format(url)) sys.exit() except Exception as ex: sys.stderr.write("\nERROR: Problems reading the input data from treebase:\n") message = str(ex) sys.stderr.write("DENDROPY ERROR:'"+ message + "'" +'\n') if "protein" or "amino" in message: sys.stderr.write("\nIt appears this may not be a DNA alignment - physcraper can only use DNA currently.\n") sys.stderr.write("Investigate, and potentialy download alignment directly at \n{}\n".format(url)) sys.exit() if _DEBUG: sys.stderr.write(url + "\n") return dna
[docs]def count_match_tree_to_aln(tree, dataset): """Assess how many taxa match between multiple genes in an alignment data set and input tree.""" aln_match = {} i = 0 leaves = [leaf.taxon.label for leaf in tree.leaf_node_iter()] for mat in dataset.char_matrices: aln_match[i] = 0 if isinstance(mat, datamodel.charmatrixmodel.DnaCharacterMatrix): for tax in mat: if tax.label in leaves: aln_match[i] += 1 i += 1 return aln_match
[docs]def get_max_match_aln(tree, dataset, min_match=3): """Select an alignment from a DNA dataset """ aln_match = count_match_tree_to_aln(tree, dataset) max_val = min_match max_match = None for aln in aln_match: if aln_match[aln] > max_val: max_match = aln max_val = aln_match[aln] if max_match is not None: assert isinstance(dataset.char_matrices[max_match], datamodel.charmatrixmodel.DnaCharacterMatrix) return dataset.char_matrices[max_match] return None
[docs]def deconcatenate_aln(aln_obj, filename, direc): """Split out separate concatended alignments. NOT TESTED """ #dna1 = dendropy.DnaCharacterMatrix.get(file=open("treebase_alns/M4358.nex"), schema="nexus") for label in aln_obj.character_subsets.keys(): sys.stdout.write("deconcatenating {}".format(label)) submat = aln_obj.export_character_subset(label) submat.write(path="{}/{}_{}.fasta".format(direc, filename, label), schema="fasta")
[docs]def scraper_from_opentree(study_id, tree_id, alnfile, workdir, aln_schema, configfile=None): """Pull tree from OpenTree to create a physcraper object. """ # Read in the configuration information data_obj = generate_ATT_from_phylesystem(alnfile=alnfile, aln_schema=aln_schema, workdir=workdir, configfile=configfile, study_id=study_id, tree_id=tree_id) ids = physcraper.IdDicts(data_obj.config) scraper = physcraper.PhyscraperScrape(data_obj, ids) return scraper
[docs]def OtuJsonDict(id_to_spn, id_dict): # Disable all the too many locals for this function # pylint: disable=too-many-locals """Makes an OTU json dictionary, which is also produced within the openTreeLife-query. This function is used, if files that shall be updated are not part of the OpenTreeofLife project. It reads in the file that contains the tip names and the corresponding species names. It then tries to get the unique identifier from the OpenTree project or from NCBI. Reads input file into the var sp_info_dict, translates using an IdDict object using web to call OpenTree, then NCBI if not found. :param id_to_spn: User file, that contains tip name and corresponding sp name for input files. :param id_dict: Uses the id_dict generated earlier :return: dictionary with key: "otu_tiplabel" and value is another dict with the keys '^ncbi:taxon', '^ot:ottTaxonName', '^ot:ottId', '^ot:originalLabel', '^user:TaxonName', '^physcraper:status', '^physcraper:last_blasted' """ sys.stdout.write("Set up OtuJsonDict \n") sp_info_dict = {} nosp = [] with open(id_to_spn, mode="r") as infile: for lin in infile: ottid, ottname, ncbiid = None, None, None tipname, species = lin.strip().split(",") clean_lab = standardize_label(tipname) assert clean_lab not in sp_info_dict, ("Standardized label ('{}') of \ `{}` already exists".format(clean_lab, tipname)) otu_id = clean_lab spn = species.replace("_", " ") info = get_ott_taxon_info(spn) if info: ottid, ottname, ncbiid = info if not info: sys.stderr.write("Match to taxon {} not found in the OpenTree taxonomy or NCBI." " Proceeding without taxon info\n".format(spn)) nosp.append(spn) ncbi_spn = None if ncbiid is not None: ncbi_spn = spn else: ncbi_spn = id_dict.ott_to_ncbi[ottid] sp_info_dict[otu_id] = { "^ncbi:taxon": int(ncbiid), "^ot:ottTaxonName": ottname, "^ot:ottId": ottid, "^ot:originalLabel": tipname, "^user:TaxonName": species, "^physcraper:status": "original", "^physcraper:last_blasted": None, } if ncbi_spn is not None: sp_info_dict[otu_id]["^physcraper:TaxonName"] = ncbi_spn sp_info_dict[otu_id]["^ncbi:TaxonName"] = ncbi_spn elif ottname is not None: sp_info_dict[otu_id]["^physcraper:TaxonName"] = ottname elif sp_info_dict[otu_id]['^user:TaxonName']: sp_info_dict[otu_id]["^physcraper:TaxonName"] = sp_info_dict[otu_id]['^user:TaxonName'] assert sp_info_dict[otu_id]["^physcraper:TaxonName"] # is not None return sp_info_dict
#####################################
[docs]def get_nexson(study_id): """Grabs nexson from phylesystem.""" study = OT.get_study(study_id) nexson = study.response_dict['data'] return nexson
[docs]def get_mrca_ott(ott_ids): """Finds the MRCA of taxa in the ingroup of the original tree. The BLAST search later is limited to descendants of this MRCA according to the NCBI taxonomy. Only used in the functions that generate the ATT object. :param ott_ids: List of all OTT ids for tip labels in phylogeny :return: OTT id of most recent common ancestor """ debug("get_mrca_ott") mrca_node = OT.synth_mrca(ott_ids=ott_ids).response_dict if u'nearest_taxon' in mrca_node.keys(): tax_id = mrca_node[u'nearest_taxon'].get(u'ott_id') if _VERBOSE: sys.stdout.write('(v3) MRCA of sampled taxa is {}\n'.format(mrca_node[u'nearest_taxon'][u'name'])) elif u'taxon' in mrca_node['mrca'].keys(): tax_id = mrca_node['mrca'][u'taxon'][u'ott_id'] if _VERBOSE: sys.stdout.write('(v3) MRCA of sampled taxa is {}\n'.format(mrca_node['mrca'][u'taxon'][u'name'])) else: sys.stderr.write('(v3) MRCA of sampled taxa not found. Please find and input an ' 'appropriate OTT id as ingroup MRCA in generate_ATT_from_files') sys.exit(-4) return tax_id
[docs]def get_ott_taxon_info(spp_name): """Get OTT id, taxon name, and NCBI id (if present) from the OpenTree Taxonomy. Only works with version 3 of OpenTree APIs :param spp_name: Species name :return: """ #This is only used to write out the opentree info file. Could use NCBI id's instead of name, and likely be quicker. # debug(spp_name) try: call = OT.tnrs_match([spp_name], do_approximate_matching=True) res = call.response_dict['results'][0] except IndexError: sys.stderr.write("Match to taxon {} not found in open tree taxonomy\n".format(spp_name)) return None, None, None if res['matches'][0]['is_approximate_match'] == 1: sys.stderr.write("""Exact match to taxon {} not found in OpenTree Taxonomy. Check spelling. Maybe {}?\n""".format(spp_name, res['matches'][0][u'ot:ottTaxonName'])) return None, None, None if res["matches"][0]["is_approximate_match"] == 0: ottid = res["matches"][0]["taxon"][u"ott_id"] ottname = res["matches"][0]["taxon"][u"unique_name"] ncbi_id = None for source in res["matches"][0]["taxon"][u"tax_sources"]: if source.startswith("ncbi"): ncbi_id = source.split(":")[1] return ottid, ottname, ncbi_id sys.stderr.write("Match to taxon {} not found in OpenTree Taxonomy.".format(spp_name)) return None, None, None