"""
Physcraper run Configuration object generator
"""
# Disabling attributes defined outside init bc they are defined in functions called in init
# pylint: disable=attribute-defined-outside-init
import sys
import os
import datetime
import configparser
import shutil
import wget
# Debug switch: any truthy value makes ConfigObj write progress messages to stdout.
_DEBUG = 0
# Parent of the directory that contains this file (symlinks resolved).
# check_taxonomy() looks for a `taxonomy` directory directly underneath it.
PHYSCRAPER_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
#sys.stdout.write(PHYSCRAPER_DIR)
def is_number(inputstr):
    """Test if a value can be coerced to float.

    * **inputstr**: the value to test; usually a string read from a config file,
      but any object is accepted.

    Returns True when ``float(inputstr)`` succeeds, False otherwise.
    """
    try:
        float(inputstr)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: an object float() cannot handle at all (None, list, ...);
        # that is still "not a number", so answer False rather than crash.
        return False
    return True
class ConfigObj():
    """Run configuration for Physcraper.

    Values start from the defaults in ``set_defaults()`` and are overridden by
    ``read_config()`` when a configuration file is supplied.

    To build the class the following is needed:

      * **configfile**: optional path to a configuration file with a ``[blast]`` and a
        ``[physcraper]`` section. Without it only the defaults are used.
      * **run**: stored as ``self.run``. When false, ``set_local()`` returns immediately
        without checking any local database files.

    During the initializing process the following self objects are generated:

      * **self.e_value_thresh**: the defined threshold for the e-value during Blast searches,
        check out: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ
        (a float by default; kept as the raw string when read from a config file)
      * **self.hitlist_size**: the maximum number of sequences retrieved by a single blast search
      * **self.minlen**: value from 0 to 1. Defines how much shorter new seq can be compared to input
      * **self.maxlen**: max length for values to add to aln; must be larger than 1
      * **self.spp_threshold**: integer read from the ``spp_threshold`` option
      * **self.taxonomy_dir**: directory holding the taxonomy files
      * **self.ott_ncbi**: path to the ``ott_ncbi`` file inside ``self.taxonomy_dir``
      * **self.email**: email address used for blast queries
      * **self.api_key**: value of the ``Entrez.api_key`` option, or None
      * **self.blast_loc**: defines which blasting method to use:

          * either web-query (=remote)
          * from a local blast database (=local)
      * **self.num_threads**: number of cores to be used during a run
      * **self.url_base**:

          * if blastloc == remote: it defines the url for the blast queries.
          * if blastloc == local: url_base = None
      * **self.delay**: defines when to reblast sequences in days
      * **optional self.objects**:

          * if blastloc == local:

              * self.blastdb: this defines the path to the local blast database
              * self.ncbi_nodes: path to the 'nodes.dmp' file in the taxonomy directory
              * self.ncbi_names: path to the 'names.dmp' file in the taxonomy directory
    """

    # Installation-docs section quoted by every local-database diagnostic.
    _LOCAL_DB_DOCS = "'https://physcraper.readthedocs.io/en/main/install.html#local-databases'"

    def __init__(self, configfile=None, run=True):
        if _DEBUG:
            sys.stdout.write("Building config object\n")
        self.run = run
        if not configfile:
            sys.stdout.write("No config file, using defaults\n")
        # Defaults are always applied first, so read_config() only has to override.
        self.set_defaults()
        if configfile:
            self.read_config(configfile)

    def set_defaults(self):
        """Set every configuration attribute to its default value.

        Also calls ``check_taxonomy()``, which locates (or creates) the taxonomy
        directory and may download the ``ott_ncbi`` file.
        """
        self.email = None
        self.e_value_thresh = 0.00001
        self.hitlist_size = 10
        self.blast_loc = 'remote'
        self.url_base = None
        self.blastdb = None
        self.num_threads = 4
        self.delay = 90
        self.spp_threshold = 5
        self.minlen = 0.8
        self.api_key = None
        self.maxlen = 1.2
        self.taxonomy_dir = ''
        self.check_taxonomy()

    def config_str(self):
        """Return the current config values as config-file text.

        DOES NOT INCLUDE SOME HIDDEN CONFIGURABLE ATTRIBUTES
        (for example the api key, url_base and the taxonomy path).
        """
        config_text = '''[blast]
Entrez.email = {email}
e_value_thresh = {e_val}
hitlist_size = {hls}
location = {bl}
localblastdb = {db}
num_threads = {nt}
delay = {delay}
[physcraper]
spp_threshold = {sppt}
min_length = {perc}
max_length = {maxlen}
'''.format(
            email=self.email,
            e_val=self.e_value_thresh,
            hls=self.hitlist_size,
            bl=self.blast_loc,
            db=self.blastdb,
            nt=self.num_threads,
            delay=self.delay,
            sppt=self.spp_threshold,
            perc=self.minlen,
            maxlen=self.maxlen)
        return config_text

    def check_taxonomy(self):
        """Locate a taxonomy directory, creating one if none is available.

        Search order: the current ``self.taxonomy_dir`` if it exists, then the
        ``taxonomy`` directory of the physcraper repo, and finally a ``taxonomy``
        directory in the current working directory (created when missing, which is
        often needed when the module was pip installed).

        Sets ``self.ott_ncbi`` and downloads that file when it is not present.
        Raises AssertionError if the file is still missing afterwards.
        """
        repo_taxonomy = "{}/taxonomy".format(PHYSCRAPER_DIR)
        if os.path.exists(self.taxonomy_dir):
            pass
        elif os.path.exists(repo_taxonomy):
            self.taxonomy_dir = repo_taxonomy
        else:
            if not os.path.exists('taxonomy'):
                os.mkdir("taxonomy")
            self.taxonomy_dir = os.path.abspath('taxonomy')
            sys.stdout.write("Using {} as taxonomy dir.".format(self.taxonomy_dir))
        self.ott_ncbi = "{}/ott_ncbi".format(self.taxonomy_dir)
        if not os.path.exists(self.ott_ncbi):
            ott_url = 'https://raw.githubusercontent.com/McTavishLab/physcraper/main/taxonomy/ott_ncbi'
            sys.stdout.write("downloading taxonomy from {}".format(ott_url))
            wget.download(ott_url, out=self.taxonomy_dir)
        assert os.path.isfile(self.ott_ncbi), (
            "file `%s` does not exist" % self.ott_ncbi
        )

    def write_file(self, direc, filename="run.config"):
        """Write the config params to a file.

        * **direc**: path to write file
        * **filename**: filename to use. Default = run.config
        """
        config_text = self.config_str()
        # Context manager closes the handle even if the write fails.
        with open("{}/{}".format(direc, filename), "w") as fi:
            fi.write(config_text)

    def read_config(self, configfi):
        """Read a config file and set the configuration params.

        Optional params that are not listed keep the values from ``set_defaults()``.

        * **configfi**: path to input file.

        Raises AssertionError when the file is missing or a value fails validation,
        KeyError when a required option is absent, and ValueError when a numeric
        option cannot be converted.
        """
        assert os.path.isfile(configfi), "file `%s` does not exist" % configfi
        config = configparser.ConfigParser()
        self.configfi = configfi
        with open(configfi) as handle:
            config.read_file(handle)

        # read in blast settings
        blast = config["blast"]
        self.email = blast.get("Entrez.email")
        api_key = blast.get("Entrez.api_key")
        # A missing, empty or literal 'None' key all mean "no api key".
        self.api_key = api_key if api_key and api_key != 'None' else None
        # Kept as the string from the file; only checked for being numeric.
        self.e_value_thresh = blast["e_value_thresh"]
        assert is_number(self.e_value_thresh), (
            "value `%s` is not a number" % self.e_value_thresh
        )
        # int() raises ValueError itself for non-numeric text.
        self.hitlist_size = int(blast["hitlist_size"])

        # read in settings for internal Physcraper processes
        physcraper = config["physcraper"]
        if "taxonomy_path" in physcraper:
            self.taxonomy_dir = physcraper["taxonomy_path"]
        # Re-validate the taxonomy directory and the ott_ncbi file.
        self.check_taxonomy()

        self.blast_loc = blast["location"]
        assert self.blast_loc in ["local", "remote"], (
            "your blast location `%s` is not remote or local" % self.blast_loc
        )
        if self.blast_loc == "local":
            self.blastdb = blast["localblastdb"]
            self.set_local()
        else:
            self.url_base = blast.get("url_base")
            if self.url_base == 'None':
                self.url_base = None
        if _DEBUG:
            sys.stdout.write("{}\n".format(self.email))
            sys.stdout.write("{}\n".format(self.blast_loc))
            if self.blast_loc == "local":
                sys.stdout.write("local blast db {}\n".format(self.blastdb))

        # Keep the default when the option is absent instead of storing None.
        self.num_threads = blast.get("num_threads", fallback=self.num_threads)
        # A SLURM allocation overrides whatever the file says.
        slurm_cpus = os.environ.get('SLURM_JOB_CPUS_PER_NODE')
        if slurm_cpus:
            self.num_threads = int(slurm_cpus)
        self.delay = int(blast["delay"])

        # read in physcraper settings
        self.minlen = float(physcraper["min_length"])
        assert 0 < self.minlen <= 1, (
            "value `%s` is not between 0 and 1" % self.minlen
        )
        self.spp_threshold = int(physcraper["spp_threshold"])
        self.maxlen = float(physcraper["max_length"])
        assert self.maxlen > 1, (
            "value `%s` is not larger than 1" % self.maxlen
        )

    @staticmethod
    def _days_since_modified(path):
        """Return the whole number of days since *path* was last modified."""
        modified = datetime.datetime.fromtimestamp(os.path.getmtime(path))
        return (datetime.datetime.now() - modified).days

    def set_local(self):
        """Check that all appropriate files etc are in place for a local blast db.

        Does nothing when ``self.run`` is false. Otherwise switches ``self.blast_loc``
        to "local", sets ``self.ncbi_nodes`` / ``self.ncbi_names`` and clears
        ``self.url_base``.

        Exits the program with status 1 (SystemExit) when the blast database
        directory, its ``nt.23.nhr`` file or ``nodes.dmp`` is missing. Files that are
        90 or more days old only produce a warning on stderr. Raises AssertionError
        when no database path is set or ``blastn`` is not on the path.
        """
        if not self.run:
            return
        self.blast_loc = "local"
        self.ncbi_nodes = "{}/nodes.dmp".format(self.taxonomy_dir)
        self.ncbi_names = "{}/names.dmp".format(self.taxonomy_dir)
        assert self.blastdb, "No blast db location set"

        if not os.path.isdir(self.blastdb):
            sys.stderr.write((
                "Local Blast DB not found at {}, "
                "please use a remote search, or update as described "
                "in {}\n").format(self.blastdb, self._LOCAL_DB_DOCS))
            # Non-zero status: this is a failure, not a clean exit.
            sys.exit(1)

        db_marker = "{}/nt.23.nhr".format(self.blastdb)
        if not os.path.exists(db_marker):
            sys.stderr.write((
                "Errors with local Blast DB at {}, "
                "may be incomplete. please use a remote search, "
                "or update as described in "
                "{}\n").format(self.blastdb, self._LOCAL_DB_DOCS))
            sys.exit(1)
        time_passed = self._days_since_modified(db_marker)
        if time_passed >= 90:
            sys.stderr.write((
                "Your databases might not be up to date anymore. "
                "You downloaded them {} days ago. "
                "Continuing, but perhaps use a remote search, "
                "or update as decribed in "
                "{}\n").format(time_passed, self._LOCAL_DB_DOCS))

        if not os.path.exists(self.ncbi_nodes):
            sys.stderr.write((
                "NCBI taxonomy not found at {} - "
                "To perform a local blast search, please update nodes and names.dmp, "
                "as described in "
                "{}\n").format(self.ncbi_nodes, self._LOCAL_DB_DOCS))
            sys.exit(1)
        time_passed = self._days_since_modified(self.ncbi_nodes)
        if time_passed >= 90:
            sys.stderr.write((
                "Your taxonomy databases from NCBI were dowloaded {} days ago. "
                "To perform a local blast search, please update nodes and names.dmp, "
                "as described in "
                "{}\n").format(time_passed, self._LOCAL_DB_DOCS))

        assert shutil.which("blastn"), "blastn not found in path"
        self.url_base = None