Source code for microbetag.genres

import os
import time
import cobra
import random
import logging
import subprocess
import multiprocessing

from .utils import split_list, run_until_done, file_exists_and_nonzero, get_library_version, get_tool_location



[docs]
class GEMSReconstruction():
    """
    Class to first RAST annotate and the build Genome Scale Metabolic Reconstructions
    using modelseedpy
    """
    def __init__(self, config):
        self.config = config


[docs]
    def rast_annotate_genomes(self):
        """
        Pool for running rast annotations for a list of bins
        """
        if  len(self.config.bin_filenames) > self.config.threads:
            chunk_size = self.config.threads
            bin_chunks = split_list(self.config.bin_filenames, chunk_size)
        else:
            chunk_size = len(self.config.bin_filenames)
            bin_chunks = [self.config.bin_filenames]

        self.chunk_size = chunk_size

        counter = 0
        for chunk in bin_chunks:
            pool = multiprocessing.Pool(self.config.threads)
            pool.map(self.rast_annotate_a_genome, chunk)
            pool.close()
            pool.join()
            counter += chunk_size
            logging.info(
                "We now have annotated %s genomes out of the %s" % (counter, len(self.config.bin_filenames))
            )



[docs]
    def rast_annotate_a_genome(self, bin_filename):
        """
        RAST annotate a user's bin based on:
        https://www.bv-brc.org/docs/cli_tutorial/rasttk_getting_started.html#the-concept-of-the-genome-typed-object
        """
        bin_file = os.path.join(self.config.bins_path, bin_filename)
        name, _ = os.path.splitext(bin_filename)
        genome = name
        gto_filename = os.path.join(self.config.reconstructions, "".join([name, ".gto"]))

        # rast_create_genome_command: create a GTO for the bin's contig
        rast_create_genome_command = " ".join([
            "rast-create-genome", "--scientific-name", name,
            "--genetic-code", "11", "--domain", "Bacteria",
            "--contigs", bin_file,
            "--genome-id", genome, ">", gto_filename
            ])
        if not file_exists_and_nonzero(gto_filename):
            rast_create_genome_command = ''.join(['\\:' if char == ':' else char for char in rast_create_genome_command])
            logging.info("rast_create_genome_command: %s", rast_create_genome_command)
            run_until_done(rast_create_genome_command)

        # rast-process-genome: run the default RASTtk pipeline tool
        gto_filename_2 = "".join([gto_filename, "_2"])
        rast_process_genome_command = " ".join([
            "rast-process-genome", "<", gto_filename, ">", gto_filename_2
            ])
        if not file_exists_and_nonzero(gto_filename_2):
            rast_process_genome_command = ''.join(['\\:' if char == ':' else char for char in rast_process_genome_command])
            logging.info("rast_process_genome_command: %s", rast_process_genome_command)
            run_until_done(rast_process_genome_command)

        # rast-export-genome protein_fasta: export the genome in a desired format
        faa_filename = "".join([gto_filename[:-4], ".faa"])
        rast_export_genome_command = " ".join([
            "rast-export-genome",
            "protein_fasta",
            "<", gto_filename_2, ">",
            faa_filename
            ])
        if not file_exists_and_nonzero(faa_filename):
            rast_export_genome_command = ''.join(['\\:' if char == ':' else char for char in rast_export_genome_command])
            logging.info("rast_export_genome_command: %s", rast_export_genome_command)
            run_until_done(rast_export_genome_command)



[docs]
    def modelseed_reconstructions(self):
        """
        Pool for running GENREs reconstruction using the final .faa files from the
        rast_annotate_genomes() function
        [NOTE] Not to be used for now as the RAST server seems not that stable to have several queries..
        """
        # Make sure of the scikit version being used
        if self.config.input_for_recon_type == "proteins_faa":
            faa_files = [
                os.path.join(self.config.for_reconstructions, file)
                for file in os.listdir(self.config.for_reconstructions)
            ]
        else:
            faa_files = [file
                        for file in os.listdir(self.config.reconstructions)
                        if file.endswith(".faa")
                        ]
        if get_library_version("scikit-learn") != "0.24.2":
            os.system("python3 -m pip install scikit-learn==0.24.2")
        for faa_file in faa_files:
            self.reconstruct_a_model(faa_file)



[docs]
    def reconstruct_a_model(self, annotation_faa):
        """
        Build a Genome Scale reconstruction using ModelSEEDpy and the BIN annotations
        """
        # ModelSEED using the default b.f and complete medium, gapfill with the default algo
        model_id, _ = os.path.splitext(annotation_faa.split("/")[-1])
        annotation_faa_path = os.path.join(self.config.reconstructions, annotation_faa)
        model_filename =  os.path.join(self.config.genres, "".join([model_id, ".xml"]))
        if os.path.exists(model_filename):
            return 1
        logging.info("Model to be reconstructed: %s", model_id)
        model = self.recursive_build(model_id, annotation_faa_path)
        cobra.io.write_sbml_model(cobra_model = model, filename = model_filename)



[docs]
    def recursive_build(self, model_id, annotation_faa_path, counter=0):
        """
        RAST server usually has issues that lead to fail attempts.
        This recursive function allows the build_metabolic_model() to be performed until the server responses.

        [NOTE] We have observed that when MSGenome is initiated in the same function with the MSBuilder, they behave much better!
        """

        from modelseedpy import MSBuilder, MSGenome
        if counter >= 20:

            # Run the script again with the given arguments   -- TODO:   THIS IS A BIT EXTREME...
            fire_microbetag(yaml_file=self.config.yaml_file)

        counter += 1
        try:
            msgenome = MSGenome.from_fasta(annotation_faa_path, split=' ')
            model = MSBuilder.build_metabolic_model(
                model_id = model_id,
                genome = msgenome,
                index = "0",
                gapfill_model = self.config.gapfill_model,
                gapfill_media = None,
                annotate_with_rast = True,
                allow_all_non_grp_reactions = True
            )
            return model
        except:
            time.sleep(random.randint(1, 20))
            logging.info("Recursive run for model_id: %s", model_id)
            return self.recursive_build(model_id, annotation_faa_path, counter)



[docs]
    def carve_reconstructions(self):
        """
        Reconstruct a GENRE using CarveMe and a .faa as input.
        You can get such a file after running RAST annotation or after any gene prediction tool such as Prodigal, FragenScan etc.
        """
        if self.config.input_for_recon_type == "bins_fasta":

            gene_predictor_path = self.config.prodigal if self.config.gene_predictor == "prodigal" else self.config.reconstructions
            print(gene_predictor_path)
            faa_files = [
                os.path.join(gene_predictor_path, tfile)
                for tfile in os.listdir(gene_predictor_path)
                if tfile.endswith(".faa")
            ]
            logging.info(faa_files)
            self.run_carve(faa_files)

        elif self.config.input_for_recon_type in ["coding_regions", "proteins_faa"]:
            faa_files = [
                os.path.join(self.config.for_reconstructions, file)
                for file in os.listdir(self.config.for_reconstructions)
            ]
            self.run_carve(faa_files, dna=self.config.input_for_recon_type == "coding_regions")



[docs]
    def run_carve(self, faa_files, dna=False):
        """
        Build a GEM using carveme for a list of
        """
        for faa in faa_files:
            bin_id = os.path.splitext(os.path.basename(faa))[0]
            xml = os.path.join(self.config.genres, f"{bin_id}.xml")
            carve_params = ["carve", "--solver", "gurobi", "-o", xml]
            if os.path.exists(xml) and os.path.getsize(xml) > 0:
                logging.info("""
                An .xml file for this bin is already available.
                This will be used for seed complementarities and carve step will be skiped."""
                )
                continue
            if dna:
                carve_params.append("--dna")
            carve_params.append(faa)
            carve_command = " ".join(carve_params)

            os.system(carve_command)



[docs]
    def fgs_annotate_genomes(self):
        """
        Use FragGeneScan to get .faa files.
        """
        cwd = os.getcwd()
        if self.config.mount is not None:
            FGS = "/opt/FragGeneScan/FragGeneScan"
        else:
            FGS = get_tool_location("FragGeneScan")
        COMPLETE = "complete"
        for bin_filename in self.config.bin_filenames:
            bin_file = os.path.join(self.config.bins_path, bin_filename)
            bin_id, _ = os.path.splitext(bin_filename)
            faa = os.path.join(self.config.reconstructions, bin_id)
            if not file_exists_and_nonzero(faa):
                logging.info(f"Bin {bin_filename} is being annotated using FGS.")
                fgs_params = [
                    FGS,
                    "-s",  bin_file,
                    " -o", faa,
                    "-w  1",
                    "-p", str(self.config.threads),
                    "-t", COMPLETE
                ]
                fgs_command = " ".join(fgs_params)
                os.system(fgs_command)
        os.chdir(cwd)





[docs]
def fire_microbetag(yaml_file):
    import sys
    logging.warning(
        """
        \n\n******
        microbetag kept calling the recursive function for building modelseedpy GEM.
        This function tries to establish a connection with the RAST server that at the moment does not allow it.
        microbetag will exit and restart its execution with the exact same settings.
        Since previous steps are alredy complete, they will be skipped.
        ******\n\n
        """
    )
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    script_path = os.path.join(root_dir, "microbetag.py")
    subprocess.run([sys.executable, script_path, yaml_file])