Source code for microbetag.pathway_complementarity

import os, json
import ast  #  process trees of the Python abstract syntax grammar.
import logging
import itertools
import pyshorteners
import pandas as pd
from tqdm import tqdm

from .utils import SetEncoder, flatten


[docs] def build_kegg_url(kegg_map, clean_path, missing_kos, shortener=None): """ Build url to colorify the related to the module kegg map based on the KO terms of the beneficiary (pink) and those it gets from the donor (green) """ # Load the dictionary with the kegg modules and their corresponding maps color_mapp_base_url = "https://www.kegg.jp/kegg-bin/show_pathway?" present_kos_color = "%09%23EAD1DC/" complemet_kos_color = "%09%2300A898/" # Make a url pointing at a colored kegg map based on what's on the beneficiary's genome # and what it gets as complement from the donor beneficiarys_kos = "" complements_kos = "" for ko_term in clean_path: if ko_term not in missing_kos: beneficiarys_kos = "".join([beneficiarys_kos, ko_term, present_kos_color]) else: complements_kos = "".join([complements_kos, ko_term, complemet_kos_color]) try: # [NOTE] In rare cases, the module might not have a related map, thus kegg_map would be of NoneType # and the join() would return an error. url_ko_map_colored = "".join([color_mapp_base_url, kegg_map, "/", beneficiarys_kos, complements_kos]) if shortener is not None: logging.info("Shortening the URL.") url_ko_map_colored = shortener.tinyurl.short(url_ko_map_colored) except: url_ko_map_colored = "N/A" return url_ko_map_colored
[docs] def all_alternatives(bin_kos_per_module, modules_definitions_json_map, alts_output_file): """ Build the alts.json file list alternatives for a bin's modules to be completed Inputs: bin_kos_per_module (Dict): modules_definitions_json_map (str): path to """ logging.info("Step 2, build alts.json file.") with open(modules_definitions_json_map, 'r') as f: mo_map = json.load(f) structurals = ["md:M00144","md:M00149","md:M00151", "md:M00152","md:M00154","md:M00155", "md:M00153", "md:M00156", "md:M00158", "md:M00160"] # Iterate through bins bins_alternatives = {} for bin_id in bin_kos_per_module: complete_modules = set() alternatives_to_gap = {} for module, kos_on_its_own in bin_kos_per_module[bin_id].items(): if module in structurals: continue # Get KOs related to the module under study that are present on the beneficiary's genome list_of_kos_present = set(kos_on_its_own) definition_under_study = mo_map[module]['steps'] definition_under_study_proc = [term if isinstance(term, list) else [term] for term in definition_under_study.values()] potential_compl_paths = [list(tup) for tup in itertools.product(*definition_under_study_proc)] flat_potent_compl_paths = [flatten(path) for path in potential_compl_paths] for path in flat_potent_compl_paths: check = all(item in list_of_kos_present for item in path) if check: if module not in complete_modules: complete_modules.add(module) else: gaps = set(x for x in set(path) if x not in set(list_of_kos_present)) if module not in alternatives_to_gap: alternatives_to_gap[module] = {} alternatives_to_gap[module][str(path)] = gaps else: alternatives_to_gap[module][str(path)] = gaps # Remove complete modules for the alternatived dict for key in complete_modules: if key in alternatives_to_gap: del alternatives_to_gap[key] # Get shortert alternative for each for module, path_gaps in alternatives_to_gap.items(): tmp = tmp2 = alternatives_to_gap[module].copy() min_val = min([len(path_gaps[ele]) for ele in path_gaps]) values = list(tmp2.values()) shortest_alternatives = [list(tmp2.keys())[values.index(s)] for s in values if not any(s.issuperset(i) and len(s) > len(i) for i in values) ] for path, gaps in alternatives_to_gap[module].items(): if len(gaps) > min_val + 1 or path not in shortest_alternatives: del tmp[path] alternatives_to_gap[module] = tmp # Assign alternatives found to be potentially filled for the bin under study bins_alternatives[bin_id] = alternatives_to_gap # Write alts.json file with open(alts_output_file, "w") as file: json.dump(bins_alternatives, file, cls=SetEncoder) logging.info("Step 2, the alternatives of each bin's modules were enumerated.") return bins_alternatives
[docs] def all_complements(bin_kos_per_module, bins_alternatives, module_to_map, compl_output_file, tinyurl=False): """ Extract potential complementarities from other bins Inputs: bin_kos_per_module bins_alternatives module_to_map """ logging.info("Build pathCompls.json file.") unique_url_input = {} # Init shortener shortener = pyshorteners.Shortener() if tinyurl else None # Parse KO annotations complements = {} for beneficiary_bin_id, all_bin_module_alternatives in tqdm( bins_alternatives.items(), desc="Processing bins", unit="bin" ): complements[beneficiary_bin_id] = {} for donor_bin_id in bin_kos_per_module: complements[beneficiary_bin_id][donor_bin_id] = [] for module, alts in all_bin_module_alternatives.items(): donors_kos_relativ_to_module = bin_kos_per_module[donor_bin_id][module] for alternative, missing_kos_for_alternative in alts.items(): is_subset = set(missing_kos_for_alternative).issubset(set(donors_kos_relativ_to_module)) if is_subset: alternative = ast.literal_eval(alternative) pc_comb = (module, tuple(missing_kos_for_alternative), tuple(alternative)) if pc_comb not in unique_url_input: try: module_map = module_to_map[module] url = build_kegg_url(module_map, list(alternative), list(set(missing_kos_for_alternative)), shortener) except: url = "" pass unique_url_input[pc_comb] = url # Build list with the complete complement pot_compl = [module, missing_kos_for_alternative, alternative, unique_url_input[pc_comb] ] complements[beneficiary_bin_id][donor_bin_id].append(pot_compl) # Write the pathCompls.json file with open(compl_output_file, "w") as file: json.dump(complements, file, cls=SetEncoder) logging.info("Step 3, the potential complementarities among the bins were enumerated.") return complements
[docs] def a_modules_maps(kegg_modules_to_maps): """Get the KEGG maps in which a module takes part in""" # maps = open(kegg_modules_to_maps, "r") with open(kegg_modules_to_maps, "r") as f: maps = f.readlines() module_to_map = {} for line in maps: module, mmap = line.split("\t") module_to_map[module[:-1]] = mmap[1:-1] return module_to_map
[docs] def taxon_kos_per_module(bins_kos_df, ko_terms_per_module_definition): """Keep track of the KOs related to a module present on each bin Input: bins_kos_df (pd.DataFrame): Returns: bin_kos_per_module (Dict): """ d = pd.read_csv(ko_terms_per_module_definition, sep="\t") d.columns =["module_id","ko_term"] d.loc[:, 'presence'] = 1 definitions_df = d.pivot_table(index='ko_term', columns='module_id', values='presence', fill_value=0) ind = definitions_df.index.str.replace('ko:', '') definitions_df.index = ind bin_kos_per_module = {} # Iterate over each column in the second dataframe logging.info("Step 1, KOs related to a module present on each bin.") for bin_id in bins_kos_df.columns: bin_kos_per_module[bin_id] = {} # Initialize inner dictionary for each bin for module, definition_ko_terms in definitions_df.items(): # Get KOs of the module definition definition_ko_terms = definition_ko_terms[definition_ko_terms != 0] # Get KOs present on the bin bins_kos = bins_kos_df[bins_kos_df[bin_id]==1][bin_id] # Get intersection and add the module: kos_present to the dict bin_kos_per_module[bin_id][module] = bins_kos.index.intersection(definition_ko_terms.index).tolist() return bin_kos_per_module
[docs] def export_pathway_complementarities(config, bins_kos_df): """ Function to get all the KEGG pathway complements among a set of bins Input: config (Config): instance of the microbetag Config class with settings bins_kos_df (pd.DataFrame): dictionary with bin id as a key and the KOs found in the bin as the value Returns: {beneficiary_bin: {donor_bin_A: {module_a: [], module_b: [],.. }}} """ # Keep track of the KOs related to a module present on each bin bin_kos_per_module = taxon_kos_per_module(bins_kos_df, config.ko_terms_per_module_definition) # If alts.json not available if not os.path.exists(config.alts_file): bins_alternatives = all_alternatives(bin_kos_per_module, config.modules_definitions_json_map, config.alts_file) else: with open(config.alts_file, "r") as h: bins_alternatives = json.load(h) # If compl.json not available if not os.path.exists(config.compl_file): module_to_map = a_modules_maps(config.kegg_modules_to_maps) complements = all_complements(bin_kos_per_module, bins_alternatives, module_to_map, config.compl_file, config.tinyurl) else: with open(config.compl_file, "r") as h: complements = json.load(h) return bins_alternatives, complements