import os
import cobra
import subprocess
import multiprocessing
from pathlib import Path
from .utils import (
split_list,
run_until_done,
file_exists_and_nonzero,
get_tool_location,
mtg_logger,
)
_logger_ = mtg_logger(__name__)
_MSGENRE_SCRIPT_ = Path(os.path.abspath(__file__)).parent.joinpath("msgenre.py")
[docs]
class GEMSReconstruction:
"""
Class for Genome Scale Metabolic Network Reconstruction (GENRE).
It makes use of either ModelSEEDpy or CarvMe, two well established methods for building GENRES.
Args:
config: Instance of the :class:`Config` class.
- `threads`
- `bin_filenames`
- `bins_path`
- `reconstructions`
- `sc_input_type`
- `for_reconstructions`
- `genres`
- `gapfill_model`
- `gapfill_media`
Note:
CarveMe:
Machado D, Andrejev S, Tramontano M, Patil KR.
Fast automated reconstruction of genome-scale metabolic models for microbial species and communities.
Nucleic acids research. 2018 Sep 6;46(15):7542-53. DOI: https://doi.org/10.1093/nar/gky537
ModelSEEDpy:
Faria JP, Liu F, Edirisinghe JN, Gupta N, Seaver SM, Freiburger AP, Zhang Q, Weisenhorn P, Conrad N, Zarecki R, Song HS.
ModelSEED v2: High-throughput genome-scale metabolic model reconstruction with enhanced energy biosynthesis pathway prediction.
bioRxiv. 2023 Oct 6:2023-10. DOI: https://doi.org/10.1101/2023.10.04.556561
Henry CS, DeJongh M, Best AA, Frybarger PM, Linsay B, Stevens RL.
High-throughput generation, optimization and analysis of genome-scale metabolic models.
Nature biotechnology. 2010 Sep;28(9):977-82. DOI: https://doi.org/10.1038/nbt.1672
"""
def __init__(self, config):
self.config = config
[docs]
def rast_annotate_genomes(self):
"""
Pool for running rast annotations for a list of bins
"""
if len(self.config.bin_filenames) > self.config.threads:
chunk_size = self.config.threads
bin_chunks = split_list(self.config.bin_filenames, chunk_size)
else:
chunk_size = len(self.config.bin_filenames)
bin_chunks = [self.config.bin_filenames]
self.chunk_size = chunk_size
counter = 0
for chunk in bin_chunks:
pool = multiprocessing.Pool(self.config.threads)
pool.map(self.rast_annotate_a_genome, chunk)
pool.close()
pool.join()
counter += chunk_size
_logger_.info(
"We now have annotated %s genomes out of the %s"
% (counter, len(self.config.bin_filenames))
)
[docs]
def rast_annotate_a_genome(self, bin_filename):
"""
RAST annotate a user's bin based on:
https://www.bv-brc.org/docs/cli_tutorial/rasttk_getting_started.html#the-concept-of-the-genome-typed-object
"""
bin_file = os.path.join(self.config.bins_path, bin_filename)
name, _ = os.path.splitext(bin_filename)
genome = name
gto_filename = os.path.join(
self.config.reconstructions, "".join([name, ".gto"])
)
# rast_create_genome_command: create a GTO for the bin's contig
rast_create_genome_command = " ".join(
[
"rast-create-genome",
"--scientific-name",
name,
"--genetic-code",
"11",
"--domain",
"Bacteria",
"--contigs",
bin_file,
"--genome-id",
genome,
">",
gto_filename,
]
)
if not file_exists_and_nonzero(gto_filename):
rast_create_genome_command = "".join(
["\\:" if char == ":" else char for char in rast_create_genome_command]
)
_logger_.info("rast_create_genome_command: %s", rast_create_genome_command)
run_until_done(rast_create_genome_command)
# rast-process-genome: run the default RASTtk pipeline tool
gto_filename_2 = "".join([gto_filename, "_2"])
rast_process_genome_command = " ".join(
["rast-process-genome", "<", gto_filename, ">", gto_filename_2]
)
if not file_exists_and_nonzero(gto_filename_2):
rast_process_genome_command = "".join(
["\\:" if char == ":" else char for char in rast_process_genome_command]
)
_logger_.info("rast_process_genome_command: %s", rast_process_genome_command)
run_until_done(rast_process_genome_command)
# rast-export-genome protein_fasta: export the genome in a desired format
faa_filename = "".join([gto_filename[:-4], ".faa"])
rast_export_genome_command = " ".join(
[
"rast-export-genome",
"protein_fasta",
"<",
gto_filename_2,
">",
faa_filename,
]
)
if not file_exists_and_nonzero(faa_filename):
rast_export_genome_command = "".join(
["\\:" if char == ":" else char for char in rast_export_genome_command]
)
_logger_.info("rast_export_genome_command: %s", rast_export_genome_command)
run_until_done(rast_export_genome_command)
[docs]
def modelseed_reconstructions(self):
"""
Pool for running GENREs reconstruction using the final .faa files from the
rast_annotate_genomes() function
Note:
Currently is not being as the RAST server seems not that stable to have several queries.
"""
# Make sure of the scikit version being used
if self.config.sc_input_type == "proteins_faa":
faa_files = [
os.path.join(self.config.for_reconstructions, file)
for file in os.listdir(self.config.for_reconstructions)
if file.endswith(".faa")
]
else:
faa_files = [
os.path.join(self.config.reconstructions, file)
for file in os.listdir(self.config.reconstructions)
if file.endswith(".faa")
]
_logger_.info(faa_files)
for faa_file in faa_files:
draft_genre = ms_reconstruct(faa_file, self.config.genres)
if self.config.gapfill_model:
dnngior_gapfill(
draft_genre,
outdir = self.config.genres,
medium = self.config.gapfill_media
)
[docs]
def carve_reconstructions(self):
"""
Reconstruct a GENRE using CarveMe and a .faa as input.
You can get such a file after running RAST annotation or after any gene prediction tool such as Prodigal, FragenScan etc.
"""
if self.config.sc_input_type == "bins_fasta":
gene_predictor_path = (
self.config.prodigal
if self.config.gene_predictor == "prodigal"
else self.config.reconstructions
)
faa_files = [
os.path.join(gene_predictor_path, tfile)
for tfile in os.listdir(gene_predictor_path)
if tfile.endswith(".faa")
]
run_carve(
faa_files,
output_dir=self.config.genres
)
elif self.config.sc_input_type in ["coding_regions", "proteins_faa"]:
faa_files = [
os.path.join(self.config.for_reconstructions, file)
for file in os.listdir(self.config.for_reconstructions)
]
run_carve(
faa_files,
output_dir=self.config.genres,
dna=self.config.sc_input_type == "coding_regions"
)
[docs]
def fgs_annotate_genomes(self):
"""
Use FragGeneScan to get .faa files.
"""
cwd = os.getcwd()
if self.config.mount is not None:
FGS = "/opt/FragGeneScan/FragGeneScan"
else:
FGS = get_tool_location("FragGeneScan")
COMPLETE = "complete"
for bin_filename in self.config.bin_filenames:
bin_file = os.path.join(self.config.bins_path, bin_filename)
bin_id, _ = os.path.splitext(bin_filename)
faa = os.path.join(self.config.reconstructions, bin_id)
if not file_exists_and_nonzero(faa):
_logger_.info(f"Bin {bin_filename} is being annotated using FGS.")
fgs_params = [
FGS,
"-s",
bin_file,
"-o",
faa,
"-w",
"1",
"-p",
str(self.config.threads),
"-t",
COMPLETE,
]
try:
_ = subprocess.run(
fgs_params, capture_output=True, text=True, check=True
)
except subprocess.CalledProcessError as e:
_logger_.error(
f"FGS annotation failed for {bin_filename}. Error:\n{e.stderr}"
)
os.chdir(cwd)
[docs]
def ms_reconstruct(faa, outdir = None):
"""
Running ModelSEEDpy on its own conda environment
"""
faa_path = Path(faa)
outdir = Path(outdir) if outdir else faa_path.parent
modelId = faa_path.stem
modelname = ".".join([modelId, "draft.xml"])
modelfile = os.path.join(outdir, modelname)
if not os.path.exists(modelfile):
subprocess.run([
"conda",
"run",
"-n", "mtg-modelseed",
"python", _MSGENRE_SCRIPT_,
"--reconstruct",
"--faa", faa,
"--outdir", outdir
])
return modelfile
[docs]
def dnngior_gapfill(draft_model, medium=None, outdir=None):
"""
draft_model: Path to draft .xml
medium: path to tab-separated file .tsv
"""
if not os.path.exists(str(draft_model).replace(".draft", "")):
# _MSGENRE_SCRIPT_ is a Path object; this would
command = [
"conda",
"run",
"-n", "mtg-dnngior",
"python", str(_MSGENRE_SCRIPT_),
"--gapfill",
"--draft-model", draft_model,
"--outdir", outdir,
]
if medium is not None:
command += ["--medium", medium]
subprocess.run(command)
[docs]
def run_carve(faa_files, output_dir, dna=False):
"""
Build a GEM using carveme for a list of
"""
for faa in faa_files:
bin_id = os.path.splitext(os.path.basename(faa))[0]
xml = os.path.join(output_dir, f"{bin_id}.xml")
carve_params = ["carve", "--solver", "gurobi", "-o", xml]
if os.path.exists(xml) and os.path.getsize(xml) > 0:
_logger_.info(
f"""A GEM (.xml) based on {faa} is already available to be used for seed complementarities; carve step will be skiped."""
)
continue
if dna:
carve_params.append("--dna")
carve_params.append(faa)
carve_command = " ".join(carve_params)
os.system(carve_command)