Source code for microbetag.genres

import os
import cobra

import subprocess
import multiprocessing
from pathlib import Path


from .utils import (
    split_list,
    run_until_done,
    file_exists_and_nonzero,
    get_tool_location,
    mtg_logger,
)

_logger_ = mtg_logger(__name__)

_MSGENRE_SCRIPT_ = Path(os.path.abspath(__file__)).parent.joinpath("msgenre.py")

[docs] class GEMSReconstruction: """ Class for Genome Scale Metabolic Network Reconstruction (GENRE). It makes use of either ModelSEEDpy or CarvMe, two well established methods for building GENRES. Args: config: Instance of the :class:`Config` class. - `threads` - `bin_filenames` - `bins_path` - `reconstructions` - `sc_input_type` - `for_reconstructions` - `genres` - `gapfill_model` - `gapfill_media` Note: CarveMe: Machado D, Andrejev S, Tramontano M, Patil KR. Fast automated reconstruction of genome-scale metabolic models for microbial species and communities. Nucleic acids research. 2018 Sep 6;46(15):7542-53. DOI: https://doi.org/10.1093/nar/gky537 ModelSEEDpy: Faria JP, Liu F, Edirisinghe JN, Gupta N, Seaver SM, Freiburger AP, Zhang Q, Weisenhorn P, Conrad N, Zarecki R, Song HS. ModelSEED v2: High-throughput genome-scale metabolic model reconstruction with enhanced energy biosynthesis pathway prediction. bioRxiv. 2023 Oct 6:2023-10. DOI: https://doi.org/10.1101/2023.10.04.556561 Henry CS, DeJongh M, Best AA, Frybarger PM, Linsay B, Stevens RL. High-throughput generation, optimization and analysis of genome-scale metabolic models. Nature biotechnology. 2010 Sep;28(9):977-82. DOI: https://doi.org/10.1038/nbt.1672 """ def __init__(self, config): self.config = config
[docs] def rast_annotate_genomes(self): """ Pool for running rast annotations for a list of bins """ if len(self.config.bin_filenames) > self.config.threads: chunk_size = self.config.threads bin_chunks = split_list(self.config.bin_filenames, chunk_size) else: chunk_size = len(self.config.bin_filenames) bin_chunks = [self.config.bin_filenames] self.chunk_size = chunk_size counter = 0 for chunk in bin_chunks: pool = multiprocessing.Pool(self.config.threads) pool.map(self.rast_annotate_a_genome, chunk) pool.close() pool.join() counter += chunk_size _logger_.info( "We now have annotated %s genomes out of the %s" % (counter, len(self.config.bin_filenames)) )
[docs] def rast_annotate_a_genome(self, bin_filename): """ RAST annotate a user's bin based on: https://www.bv-brc.org/docs/cli_tutorial/rasttk_getting_started.html#the-concept-of-the-genome-typed-object """ bin_file = os.path.join(self.config.bins_path, bin_filename) name, _ = os.path.splitext(bin_filename) genome = name gto_filename = os.path.join( self.config.reconstructions, "".join([name, ".gto"]) ) # rast_create_genome_command: create a GTO for the bin's contig rast_create_genome_command = " ".join( [ "rast-create-genome", "--scientific-name", name, "--genetic-code", "11", "--domain", "Bacteria", "--contigs", bin_file, "--genome-id", genome, ">", gto_filename, ] ) if not file_exists_and_nonzero(gto_filename): rast_create_genome_command = "".join( ["\\:" if char == ":" else char for char in rast_create_genome_command] ) _logger_.info("rast_create_genome_command: %s", rast_create_genome_command) run_until_done(rast_create_genome_command) # rast-process-genome: run the default RASTtk pipeline tool gto_filename_2 = "".join([gto_filename, "_2"]) rast_process_genome_command = " ".join( ["rast-process-genome", "<", gto_filename, ">", gto_filename_2] ) if not file_exists_and_nonzero(gto_filename_2): rast_process_genome_command = "".join( ["\\:" if char == ":" else char for char in rast_process_genome_command] ) _logger_.info("rast_process_genome_command: %s", rast_process_genome_command) run_until_done(rast_process_genome_command) # rast-export-genome protein_fasta: export the genome in a desired format faa_filename = "".join([gto_filename[:-4], ".faa"]) rast_export_genome_command = " ".join( [ "rast-export-genome", "protein_fasta", "<", gto_filename_2, ">", faa_filename, ] ) if not file_exists_and_nonzero(faa_filename): rast_export_genome_command = "".join( ["\\:" if char == ":" else char for char in rast_export_genome_command] ) _logger_.info("rast_export_genome_command: %s", rast_export_genome_command) run_until_done(rast_export_genome_command)
[docs] def modelseed_reconstructions(self): """ Pool for running GENREs reconstruction using the final .faa files from the rast_annotate_genomes() function Note: Currently is not being as the RAST server seems not that stable to have several queries. """ # Make sure of the scikit version being used if self.config.sc_input_type == "proteins_faa": faa_files = [ os.path.join(self.config.for_reconstructions, file) for file in os.listdir(self.config.for_reconstructions) if file.endswith(".faa") ] else: faa_files = [ os.path.join(self.config.reconstructions, file) for file in os.listdir(self.config.reconstructions) if file.endswith(".faa") ] _logger_.info(faa_files) for faa_file in faa_files: draft_genre = ms_reconstruct(faa_file, self.config.genres) if self.config.gapfill_model: dnngior_gapfill( draft_genre, outdir = self.config.genres, medium = self.config.gapfill_media )
[docs] def carve_reconstructions(self): """ Reconstruct a GENRE using CarveMe and a .faa as input. You can get such a file after running RAST annotation or after any gene prediction tool such as Prodigal, FragenScan etc. """ if self.config.sc_input_type == "bins_fasta": gene_predictor_path = ( self.config.prodigal if self.config.gene_predictor == "prodigal" else self.config.reconstructions ) faa_files = [ os.path.join(gene_predictor_path, tfile) for tfile in os.listdir(gene_predictor_path) if tfile.endswith(".faa") ] run_carve( faa_files, output_dir=self.config.genres ) elif self.config.sc_input_type in ["coding_regions", "proteins_faa"]: faa_files = [ os.path.join(self.config.for_reconstructions, file) for file in os.listdir(self.config.for_reconstructions) ] run_carve( faa_files, output_dir=self.config.genres, dna=self.config.sc_input_type == "coding_regions" )
[docs] def fgs_annotate_genomes(self): """ Use FragGeneScan to get .faa files. """ cwd = os.getcwd() if self.config.mount is not None: FGS = "/opt/FragGeneScan/FragGeneScan" else: FGS = get_tool_location("FragGeneScan") COMPLETE = "complete" for bin_filename in self.config.bin_filenames: bin_file = os.path.join(self.config.bins_path, bin_filename) bin_id, _ = os.path.splitext(bin_filename) faa = os.path.join(self.config.reconstructions, bin_id) if not file_exists_and_nonzero(faa): _logger_.info(f"Bin {bin_filename} is being annotated using FGS.") fgs_params = [ FGS, "-s", bin_file, "-o", faa, "-w", "1", "-p", str(self.config.threads), "-t", COMPLETE, ] try: _ = subprocess.run( fgs_params, capture_output=True, text=True, check=True ) except subprocess.CalledProcessError as e: _logger_.error( f"FGS annotation failed for {bin_filename}. Error:\n{e.stderr}" ) os.chdir(cwd)
[docs] def ms_reconstruct(faa, outdir = None): """ Running ModelSEEDpy on its own conda environment """ faa_path = Path(faa) outdir = Path(outdir) if outdir else faa_path.parent modelId = faa_path.stem modelname = ".".join([modelId, "draft.xml"]) modelfile = os.path.join(outdir, modelname) if not os.path.exists(modelfile): subprocess.run([ "conda", "run", "-n", "mtg-modelseed", "python", _MSGENRE_SCRIPT_, "--reconstruct", "--faa", faa, "--outdir", outdir ]) return modelfile
[docs] def dnngior_gapfill(draft_model, medium=None, outdir=None): """ draft_model: Path to draft .xml medium: path to tab-separated file .tsv """ if not os.path.exists(str(draft_model).replace(".draft", "")): # _MSGENRE_SCRIPT_ is a Path object; this would command = [ "conda", "run", "-n", "mtg-dnngior", "python", str(_MSGENRE_SCRIPT_), "--gapfill", "--draft-model", draft_model, "--outdir", outdir, ] if medium is not None: command += ["--medium", medium] subprocess.run(command)
[docs] def run_carve(faa_files, output_dir, dna=False): """ Build a GEM using carveme for a list of """ for faa in faa_files: bin_id = os.path.splitext(os.path.basename(faa))[0] xml = os.path.join(output_dir, f"{bin_id}.xml") carve_params = ["carve", "--solver", "gurobi", "-o", xml] if os.path.exists(xml) and os.path.getsize(xml) > 0: _logger_.info( f"""A GEM (.xml) based on {faa} is already available to be used for seed complementarities; carve step will be skiped.""" ) continue if dna: carve_params.append("--dna") carve_params.append(faa) carve_command = " ".join(carve_params) os.system(carve_command)