# Source code for microbetag.pathway_complementarity

import os, json
import ast  #  process trees of the Python abstract syntax grammar.
import logging
import itertools
import pyshorteners
import pandas as pd
from tqdm import tqdm

from .utils import SetEncoder, flatten



# [docs]
def build_kegg_url(kegg_map, clean_path, missing_kos, shortener=None):
    """
    Build the URL of a colored KEGG map for a module.

    KO terms of ``clean_path`` that are not in ``missing_kos`` (i.e. present on
    the beneficiary) are colored pink (``#EAD1DC``); the ones listed in
    ``missing_kos`` (i.e. obtained from the donor) are colored green
    (``#00A898``). In the URL, all beneficiary KOs come first, followed by all
    complement KOs.

    Args:
        kegg_map (str | None): id of the KEGG map related to the module. In
            rare cases a module has no related map and this is ``None``.
        clean_path (Iterable[str]): KO terms making up the path.
        missing_kos (Container[str]): KO terms of the path the beneficiary lacks.
        shortener: optional object exposing ``tinyurl.short(url)``; when given,
            the URL is shortened through it.

    Returns:
        str: the (optionally shortened) URL, or ``"N/A"`` when there is no map
        or the shortening failed.
    """
    color_mapp_base_url = "https://www.kegg.jp/kegg-bin/show_pathway?"
    present_kos_color = "%09%23EAD1DC/"
    complement_kos_color = "%09%2300A898/"

    # Without a map id there is nothing to point at.
    if not isinstance(kegg_map, str):
        return "N/A"

    # Collect the URL fragments and join once, instead of re-joining the
    # growing string for every KO term.
    beneficiarys_kos = []
    complements_kos = []
    for ko_term in clean_path:
        if ko_term in missing_kos:
            complements_kos += [ko_term, complement_kos_color]
        else:
            beneficiarys_kos += [ko_term, present_kos_color]

    url_ko_map_colored = "".join(
        [color_mapp_base_url, kegg_map, "/", *beneficiarys_kos, *complements_kos]
    )

    if shortener is None:
        return url_ko_map_colored

    logging.info("Shortening the URL.")
    try:
        return shortener.tinyurl.short(url_ko_map_colored)
    except Exception:
        # Shortening is best-effort (it may involve a remote service), but
        # KeyboardInterrupt / SystemExit must still propagate, hence no bare except.
        logging.warning("Could not shorten the KEGG URL of map %s.", kegg_map, exc_info=True)
        return "N/A"




# [docs]
def all_alternatives(bin_kos_per_module, modules_definitions_json_map, alts_output_file):
    """
    Build the alts.json file: for every bin, list the alternative paths through
    which each of its incomplete modules could be completed.

    Inputs:
        bin_kos_per_module (Dict): {bin_id: {module: KOs of the module present on the bin}}
        modules_definitions_json_map (str): path to a JSON file mapping each module
            to an entry with a 'steps' mapping; every step is either a single
            term or a list of alternative terms
        alts_output_file (str): path the resulting dictionary is written to as JSON

    Returns:
        Dict: {bin_id: {module: {str(path): set of KOs of the path missing from the bin}}}
    """
    logging.info("Step 2, build alts.json file.")

    with open(modules_definitions_json_map, 'r') as f:
        mo_map = json.load(f)

    # Modules skipped altogether
    structurals = [
        "md:M00144", "md:M00149", "md:M00151",
        "md:M00152", "md:M00154", "md:M00155",
        "md:M00153", "md:M00156", "md:M00158",
        "md:M00160",
    ]

    bins_alternatives = {}
    for bin_id, kos_by_module in bin_kos_per_module.items():

        finished_modules = set()
        gaps_by_module = {}

        for module, module_kos in kos_by_module.items():
            if module in structurals:
                continue

            present = set(module_kos)
            # One list of options per step, so that the cartesian product
            # enumerates every way of walking through the module
            options_per_step = [
                step if isinstance(step, list) else [step]
                for step in mo_map[module]['steps'].values()
            ]
            for combination in itertools.product(*options_per_step):
                path = flatten(list(combination))
                if all(ko in present for ko in path):
                    finished_modules.add(module)
                    continue
                missing = set(ko for ko in set(path) if ko not in present)
                gaps_by_module.setdefault(module, {})[str(path)] = missing

        # A module with at least one complete path needs no alternatives
        for module in finished_modules:
            gaps_by_module.pop(module, None)

        # Keep, per module, only the alternatives with the fewest gaps
        for module, candidates in gaps_by_module.items():
            path_keys = list(candidates.keys())
            gap_sets = list(candidates.values())
            fewest = min(len(gaps) for gaps in gap_sets)
            # Paths whose gap set is not a strict superset of another one.
            # NOTE(review): index() returns the first match, so of several
            # paths sharing the very same gap set only the first one is kept;
            # confirm this is intended.
            minimal_paths = [
                path_keys[gap_sets.index(gaps)]
                for gaps in gap_sets
                if not any(other < gaps for other in gap_sets)
            ]
            gaps_by_module[module] = {
                path: gaps
                for path, gaps in candidates.items()
                if len(gaps) <= fewest + 1 and path in minimal_paths
            }

        bins_alternatives[bin_id] = gaps_by_module

    # Write alts.json file
    with open(alts_output_file, "w") as file:
        json.dump(bins_alternatives, file, cls=SetEncoder)

    logging.info("Step 2, the alternatives of each bin's modules were enumerated.")

    return bins_alternatives




# [docs]
def all_complements(bin_kos_per_module, bins_alternatives, module_to_map, compl_output_file, tinyurl=False):
    """
    Extract potential complementarities from other bins and write them as JSON.

    For every (beneficiary, donor) pair and every alternative path of the
    beneficiary's incomplete modules, the alternative is reported when all of
    its missing KOs are among the donor's KOs for that module.

    Inputs:
        bin_kos_per_module (Dict): {bin_id: {module: KOs of the module present on the bin}}
        bins_alternatives (Dict): {bin_id: {module: {str(path): missing KOs}}}
        module_to_map (Dict): {module: KEGG map id}
        compl_output_file (str): path the complements are written to as JSON
        tinyurl (bool): whether to shorten the KEGG URLs using pyshorteners

    Returns:
        Dict: {beneficiary_bin: {donor_bin: [[module, missing KOs, path, url], ...]}}
    """
    logging.info("Build pathCompls.json file.")

    # URLs are built once per (module, missing KOs, path) combination
    unique_url_input = {}

    # Init shortener
    shortener = pyshorteners.Shortener() if tinyurl else None

    complements = {}
    for beneficiary_bin_id, all_bin_module_alternatives in tqdm(
        bins_alternatives.items(), desc="Processing bins", unit="bin"
    ):
        complements[beneficiary_bin_id] = {}
        for donor_bin_id, donor_kos_per_module in bin_kos_per_module.items():
            donor_complements = []
            complements[beneficiary_bin_id][donor_bin_id] = donor_complements
            for module, alts in all_bin_module_alternatives.items():
                # Built once per module rather than once per alternative
                donor_kos = set(donor_kos_per_module[module])
                for alternative, missing_kos_for_alternative in alts.items():
                    if not set(missing_kos_for_alternative).issubset(donor_kos):
                        continue

                    # Keys are the str() of the path list; get the list back
                    path = ast.literal_eval(alternative)
                    pc_comb = (module, tuple(missing_kos_for_alternative), tuple(path))
                    if pc_comb not in unique_url_input:
                        try:
                            url = build_kegg_url(module_to_map[module],
                                                 list(path),
                                                 list(set(missing_kos_for_alternative)),
                                                 shortener)
                        except Exception:
                            # Best effort: a module without a map entry (or any
                            # failure building the URL) yields an empty URL, but
                            # KeyboardInterrupt / SystemExit still propagate.
                            logging.debug("No KEGG URL could be built for module %s.", module, exc_info=True)
                            url = ""
                        unique_url_input[pc_comb] = url

                    # Build list with the complete complement
                    donor_complements.append([module,
                                              missing_kos_for_alternative,
                                              path,
                                              unique_url_input[pc_comb]])

    # Write the pathCompls.json file
    with open(compl_output_file, "w") as file:
        json.dump(complements, file, cls=SetEncoder)
    logging.info("Step 3, the potential complementarities among the bins were enumerated.")

    return complements




# [docs]
def a_modules_maps(kegg_modules_to_maps):
    """Get the KEGG map in which each module takes part.

    Every non-blank line of the file is split on a tab into a module field and
    a map field. The last character of the module field and the first
    character of the map field are dropped.
    NOTE(review): presumably these are padding spaces around the tab; confirm
    against the actual file.

    Input:
        kegg_modules_to_maps (str): path to the two-column, tab-separated file

    Returns:
        module_to_map (Dict): {module: map}
    """
    module_to_map = {}
    with open(kegg_modules_to_maps, "r") as f:
        for raw_line in f:
            # Strip only the line terminator, so a final line lacking a
            # trailing newline does not lose its last real character.
            line = raw_line.rstrip("\n")
            if not line.strip():
                # Blank lines carry no mapping
                continue
            module, mmap = line.split("\t")
            module_to_map[module[:-1]] = mmap[1:]
    return module_to_map




# [docs]
def taxon_kos_per_module(bins_kos_df, ko_terms_per_module_definition):
    """Keep track of the KOs related to a module present on each bin

    Input:
        bins_kos_df (pd.DataFrame): one column per bin, indexed by KO term; a
            value of 1 marks the KO as present on the bin
        ko_terms_per_module_definition (str): path to a two-column,
            tab-separated file of (module id, KO term) pairs. It is read with
            the default header handling of ``pd.read_csv``, i.e. its first
            line is consumed as a header.
            NOTE(review): confirm the file really has a header row.

    Returns:
        bin_kos_per_module (Dict): {bin_id: {module: [KOs of the module present on the bin]}}
    """
    d = pd.read_csv(ko_terms_per_module_definition, sep="\t")
    d.columns = ["module_id", "ko_term"]
    d["presence"] = 1
    # KO terms x modules presence table
    definitions_df = d.pivot_table(index='ko_term', columns='module_id', values='presence', fill_value=0)
    definitions_df.index = definitions_df.index.str.replace('ko:', '')

    logging.info("Step 1, KOs related to a module present on each bin.")

    # KOs of each module definition; the same for every bin, so computed once
    kos_of_module = {
        module: presence[presence != 0].index
        for module, presence in definitions_df.items()
    }

    bin_kos_per_module = {}
    for bin_id in bins_kos_df.columns:
        # KOs present on the bin; the same for every module, so computed once per bin
        bins_kos = bins_kos_df[bins_kos_df[bin_id] == 1][bin_id].index
        bin_kos_per_module[bin_id] = {
            module: bins_kos.intersection(definition_kos).tolist()
            for module, definition_kos in kos_of_module.items()
        }

    return bin_kos_per_module




# [docs]
def export_pathway_complementarities(config, bins_kos_df):
    """
    Get all the KEGG pathway complements among a set of bins.

    The alternatives (``config.alts_file``) and the complements
    (``config.compl_file``) are each loaded from their JSON file when that
    file already exists; otherwise they are computed, which also writes the
    corresponding file.

    Input:
        config (Config): instance of the microbetag Config class with settings
        bins_kos_df (pd.DataFrame): KOs found in each bin, one column per bin

    Returns:
        tuple: (bins_alternatives, complements)
    """
    # KOs related to a module that are present on each bin
    bin_kos_per_module = taxon_kos_per_module(bins_kos_df, config.ko_terms_per_module_definition)

    # Alternatives: reuse alts.json when available, build it otherwise
    if os.path.exists(config.alts_file):
        with open(config.alts_file, "r") as handle:
            bins_alternatives = json.load(handle)
    else:
        bins_alternatives = all_alternatives(
            bin_kos_per_module,
            config.modules_definitions_json_map,
            config.alts_file,
        )

    # Complements: reuse compl.json when available, build it otherwise
    if os.path.exists(config.compl_file):
        with open(config.compl_file, "r") as handle:
            complements = json.load(handle)
    else:
        module_to_map = a_modules_maps(config.kegg_modules_to_maps)
        complements = all_complements(
            bin_kos_per_module,
            bins_alternatives,
            module_to_map,
            config.compl_file,
            config.tinyurl,
        )

    return bins_alternatives, complements