"""
This script aims at calculating and exporting patwhay complementarities in case of
running microbetag locally, with user-provided genomes.
"""
import os
import json
import ast # process trees of the Python abstract syntax grammar.
import itertools
import pyshorteners
import pandas as pd
from tqdm import tqdm
from .utils import SetEncoder, flatten, mtg_logger
[docs]
logger = mtg_logger(__name__)
[docs]
def build_kegg_url(kegg_map, clean_path, missing_kos, shortener=None):
"""
Build url to colorify the related to the module kegg map based on the KO terms
of the beneficiary (pink) and those it gets from the donor (green)
"""
# Load the dictionary with the kegg modules and their corresponding maps
color_mapp_base_url = "https://www.kegg.jp/kegg-bin/show_pathway?"
present_kos_color = "%09%23EAD1DC/"
complemet_kos_color = "%09%2300A898/"
# Make a url pointing at a colored kegg map based on what's on the beneficiary's genome
# and what it gets as complement from the donor
beneficiarys_kos = ""
complements_kos = ""
for ko_term in clean_path:
if ko_term not in missing_kos:
beneficiarys_kos = "".join([beneficiarys_kos, ko_term, present_kos_color])
else:
complements_kos = "".join([complements_kos, ko_term, complemet_kos_color])
try:
# [NOTE] In rare cases, the module might not have a related map, thus kegg_map would be of NoneType
# and the join() would return an error.
url_ko_map_colored = "".join(
[color_mapp_base_url, kegg_map, "/", beneficiarys_kos, complements_kos]
)
if shortener is not None:
logger.info("Shortening the URL.")
url_ko_map_colored = shortener.tinyurl.short(url_ko_map_colored)
except Exception:
url_ko_map_colored = "N/A"
return url_ko_map_colored
[docs]
def all_alternatives(
bin_kos_per_module, modules_definitions_json_map, alts_output_file
):
"""
Build the alts.json file
list alternatives for a bin's modules to be completed
Inputs:
bin_kos_per_module (Dict):
modules_definitions_json_map (str): path to
"""
logger.info("Step 2, build alts.json file.")
with open(modules_definitions_json_map, "r") as f:
mo_map = json.load(f)
structurals = [
"md:M00144",
"md:M00149",
"md:M00151",
"md:M00152",
"md:M00154",
"md:M00155",
"md:M00153",
"md:M00156",
"md:M00158",
"md:M00160",
]
# Iterate through bins
bins_alternatives = {}
for bin_id in bin_kos_per_module:
complete_modules = set()
alternatives_to_gap = {}
for module, kos_on_its_own in bin_kos_per_module[bin_id].items():
if module in structurals:
continue
# Get KOs related to the module under study that are present on the beneficiary's genome
list_of_kos_present = set(kos_on_its_own)
definition_under_study = mo_map[module]["steps"]
definition_under_study_proc = [
term if isinstance(term, list) else [term]
for term in definition_under_study.values()
]
potential_compl_paths = [
list(tup) for tup in itertools.product(*definition_under_study_proc)
]
flat_potent_compl_paths = [flatten(path) for path in potential_compl_paths]
for path in flat_potent_compl_paths:
check = all(item in list_of_kos_present for item in path)
if check:
if module not in complete_modules:
complete_modules.add(module)
else:
gaps = set(
x for x in set(path) if x not in set(list_of_kos_present)
)
if module not in alternatives_to_gap:
alternatives_to_gap[module] = {}
alternatives_to_gap[module][str(path)] = gaps
else:
alternatives_to_gap[module][str(path)] = gaps
# Remove complete modules for the alternatived dict
for key in complete_modules:
if key in alternatives_to_gap:
del alternatives_to_gap[key]
# Get shortert alternative for each
for module, path_gaps in alternatives_to_gap.items():
tmp = tmp2 = alternatives_to_gap[module].copy()
min_val = min([len(path_gaps[ele]) for ele in path_gaps])
values = list(tmp2.values())
shortest_alternatives = [
list(tmp2.keys())[values.index(s)]
for s in values
if not any(s.issuperset(i) and len(s) > len(i) for i in values)
]
for path, gaps in alternatives_to_gap[module].items():
if len(gaps) > min_val + 1 or path not in shortest_alternatives:
del tmp[path]
alternatives_to_gap[module] = tmp
# Assign alternatives found to be potentially filled for the bin under study
bins_alternatives[bin_id] = alternatives_to_gap
# Write alts.json file
with open(alts_output_file, "w") as file:
json.dump(bins_alternatives, file, cls=SetEncoder)
logger.info("Step 2, the alternatives of each bin's modules were enumerated.")
return bins_alternatives
[docs]
def all_complements(
bin_kos_per_module,
bins_alternatives,
module_to_map,
compl_output_file,
tinyurl=False,
):
"""
Extract potential complementarities from other bins
Inputs:
bin_kos_per_module
bins_alternatives
module_to_map
"""
logger.info("Build pathCompls.json file.")
unique_url_input = {}
# Init shortener
shortener = pyshorteners.Shortener() if tinyurl else None
# Parse KO annotations
complements = {}
for beneficiary_bin_id, all_bin_module_alternatives in tqdm(
bins_alternatives.items(), desc="Processing bins", unit="bin"
):
complements[beneficiary_bin_id] = {}
for donor_bin_id in bin_kos_per_module:
complements[beneficiary_bin_id][donor_bin_id] = []
for module, alts in all_bin_module_alternatives.items():
donors_kos_relativ_to_module = bin_kos_per_module[donor_bin_id][module]
for alternative, missing_kos_for_alternative in alts.items():
is_subset = set(missing_kos_for_alternative).issubset(
set(donors_kos_relativ_to_module)
)
if is_subset:
alternative = ast.literal_eval(alternative)
pc_comb = (
module,
tuple(missing_kos_for_alternative),
tuple(alternative),
)
if pc_comb not in unique_url_input:
try:
module_map = module_to_map[module]
url = build_kegg_url(
module_map,
list(alternative),
list(set(missing_kos_for_alternative)),
shortener,
)
except Exception:
url = ""
pass
unique_url_input[pc_comb] = url
# Build list with the complete complement
pot_compl = [
module,
missing_kos_for_alternative,
alternative,
unique_url_input[pc_comb],
]
complements[beneficiary_bin_id][donor_bin_id].append(pot_compl)
# Write the pathCompls.json file
with open(compl_output_file, "w") as file:
json.dump(complements, file, cls=SetEncoder)
logger.info(
"Step 3, the potential complementarities among the bins were enumerated."
)
return complements
[docs]
def a_modules_maps(kegg_modules_to_maps):
"""Get the KEGG maps in which a module takes part in"""
# maps = open(kegg_modules_to_maps, "r")
with open(kegg_modules_to_maps, "r") as f:
maps = f.readlines()
module_to_map = {}
for line in maps:
module, mmap = line.split("\t")
module_to_map[module[:-1]] = mmap[1:-1]
return module_to_map
[docs]
def taxon_kos_per_module(bins_kos_df, ref_ko_per_module):
"""Keep track of the KOs related to a module present on each bin
Input:
bins_kos_df (pd.DataFrame):
Returns:
bin_kos_per_module (Dict):
"""
d = pd.read_csv(ref_ko_per_module, sep="\t")
d.columns = ["module_id", "ko_term"]
d.loc[:, "presence"] = 1
definitions_df = d.pivot_table(
index="ko_term", columns="module_id", values="presence", fill_value=0
)
ind = definitions_df.index.str.replace("ko:", "")
definitions_df.index = ind
bin_kos_per_module = {}
# Iterate over each column in the second dataframe
logger.info("Step 1, KOs related to a module present on each bin.")
for bin_id in bins_kos_df.columns:
bin_kos_per_module[bin_id] = {} # Initialize inner dictionary for each bin
for module, definition_ko_terms in definitions_df.items():
# Get KOs of the module definition
definition_ko_terms = definition_ko_terms[definition_ko_terms != 0]
# Get KOs present on the bin
bins_kos = bins_kos_df[bins_kos_df[bin_id] == 1][bin_id]
# Get intersection and add the module: kos_present to the dict
bin_kos_per_module[bin_id][module] = bins_kos.index.intersection(
definition_ko_terms.index
).tolist()
return bin_kos_per_module
[docs]
def export_pathway_complementarities(config, bins_kos_df):
"""
Function to get all the KEGG pathway complements among a set of bins
Input:
config (Config): instance of the microbetag Config class with settings
bins_kos_df (pd.DataFrame): dictionary with bin id as a key and the KOs found in the bin as the value
Returns:
{beneficiary_bin: {donor_bin_A: {module_a: [], module_b: [],.. }}}
"""
# Keep track of the KOs related to a module present on each bin
bin_kos_per_module = taxon_kos_per_module(
bins_kos_df, config.ref_ko_per_module
)
# If alts.json not available
if not os.path.exists(config.alts_file):
bins_alternatives = all_alternatives(
bin_kos_per_module, config.modules_definitions_json_map, config.alts_file
)
else:
with open(config.alts_file, "r") as h:
bins_alternatives = json.load(h)
# If compl.json not available
if not os.path.exists(config.compl_file):
module_to_map = a_modules_maps(config.kegg_modules_to_maps)
complements = all_complements(
bin_kos_per_module,
bins_alternatives,
module_to_map,
config.compl_file,
config.tinyurl,
)
else:
with open(config.compl_file, "r") as h:
complements = json.load(h)
return bins_alternatives, complements