# Snakefile_Visualizations

"""

Snakemake workflow (Snakefile) that runs the AOC application - Visualizations

Written by Alexander G Lucaci (2024)

"""

# =============================================================================
# Imports
# =============================================================================

import itertools
import os
import sys
import csv
import json
from pathlib import Path
from snakemake.utils import min_version
import glob
import numpy as np
import pandas as pd
import altair as alt
import statsmodels
import statsmodels.api
import networkx as nx
#import nx_altair as nxa
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib
matplotlib.use('agg')

# =============================================================================
# Configuration
# =============================================================================

configfile: 'config.yml'

# cluster.json is only read here for the "__default__" -> "ppn" entry below.
with open("cluster.json", "r") as fh:
    cluster = json.load(fh)
#end with

# Run label from config.yml; it names the results/ and data/ subdirectories.
Label = config["Label"]

# Every path below is resolved relative to the directory the workflow is
# launched from.
BASEDIR = os.getcwd()

print("# Visualizations! We are operating out of base directory:", BASEDIR)

# Directory that is searched for alignments, trees, clade files and result JSONs.
OUTDIR = os.path.join(BASEDIR, "results", Label)

# Directory that receives every table, figure and legend written by the rules.
OUTDIR_Viz = os.path.join(OUTDIR, "Visualizations")

print("# Output directory:", OUTDIR)

# Base names (not full paths) of the codon alignments and trees found in OUTDIR.
Recombinants = sorted(
    os.path.basename(x) for x in glob.glob(os.path.join(OUTDIR, '*.codon.fas'))
)

Recombinants_Trees = sorted(
    os.path.basename(x) for x in glob.glob(os.path.join(OUTDIR, '*.tree.nwk'))
)

DATADIR = os.path.join(BASEDIR, "data", Label)

# Full paths of the clade label files found in OUTDIR.
CladeLabels = sorted(glob.glob(os.path.join(OUTDIR, '*.clade')))

# The reference clade is the name (text before the first ".") of the first
# clade file.  None of the rules visible in this file use it, so a missing
# clade file must not abort the visualizations: fall back to None instead of
# raising IndexError.
ReferenceClade = (
    os.path.basename(CladeLabels[0]).split(".")[0] if CladeLabels else None
)

print("# We will process selection analyses in", len(Recombinants), "files")
print("# We will use the following clade labels:", CladeLabels)

# Set PPN
# NOTE(review): presumably "processors per node" for cluster jobs -- confirm.
PPN = cluster["__default__"]["ppn"]

# HyPhy settings (not referenced by the rules visible in this file)
HYPHY = "hyphy"
HYPHYMPI = "HYPHYMPI"

FITMG94 = os.path.join(BASEDIR, "hyphy-analyses", "FitMG94", "FitMG94.bf")

# =============================================================================
# Viz settings
# =============================================================================

# Cut-off applied to FDR-adjusted p-values in the FEL and MEME rules.
pvalueThreshold = 0.1
# Cut-off applied to the BGM posterior probability column in the BGM rule.
posteriorThreshold = 0.5

# =============================================================================
# Results files
# =============================================================================

"""
rule all:
    input:
        # Calculate genetic distances
        expand(os.path.join(OUTDIR, "{sample}.dst"), sample = Recombinants, tree = Recombinants_Trees),
        expand(os.path.join(OUTDIR, "{sample}.FastTree.treefile"), sample=Recombinants),
        # Selection analysis repertoire
        #expand(os.path.join(OUTDIR, "{sample}.MG94.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.FEL.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.FUBAR.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.BUSTEDS.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.MEME.json"), sample=Recombinants),
        #expand(os.path.join(OUTDIR, "{sample}.MEME-iS.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.ABSREL.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.SLAC.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.BGM.json"), sample=Recombinants),
        #expand(os.path.join(OUTDIR, "{sample}.PRIME.json"), sample=Recombinants),
        #expand(os.path.join(OUTDIR, "{sample}.ABSREL-MH.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.BUSTEDS-MH.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.FMM.json"), sample=Recombinants),
        # Labelling taxonomy
        expand(os.path.join(OUTDIR, "{tree}.labelled"), tree = Recombinants_Trees),
        expand(os.path.join(OUTDIR, "{sample}.RELAX.json"), sample=Recombinants),
        expand(os.path.join(OUTDIR, "{sample}.CFEL.json"), sample=Recombinants),
        # Executive Summary
        #expand(os.path.join(OUTDIR, "{sample}.FEL.png"), sample=Recombinants), # Start summary of results
        #expand(os.path.join(OUTDIR, "{sample}.FEL.csv"), sample=Recombinants),
        #expand(os.path.join(OUTDIR, "{sample}.FEL.FigureLegend"), sample=Recombinants),
        #expand(os.path.join(OUTDIR, "{sample}.cumulativeResults.csv"), sample=Recombinants)
        # Executive Summary
        #expand(os.path.join(OUTDIR, "{sample}.executiveSummary.csv"), sample=Recombinants)
    #end input
#end rule all

"""

# Analysis methods whose "<name>.<METHOD>.json" result files are looked up in
# OUTDIR.  The order here is the order of the summary lines printed below.
_JSON_METHODS = (
    "FEL",
    "FUBAR",
    "BUSTEDS",
    "MEME",
    "ABSREL",
    "SLAC",
    "BGM",
    "BUSTEDS-MH",
    "FMM",
    "CFEL",
    "RELAX",
)

# Method name -> sorted base names of the matching JSON files in OUTDIR.
jsonDict = {
    method: sorted(
        os.path.basename(path)
        for path in glob.glob(os.path.join(OUTDIR, f"*.{method}.json"))
    )
    for method in _JSON_METHODS
}

# Per-method aliases; the rules refer to these names directly.
felJsons = jsonDict["FEL"]
fubarJsons = jsonDict["FUBAR"]
bustedsJsons = jsonDict["BUSTEDS"]
memeJsons = jsonDict["MEME"]
absrelJsons = jsonDict["ABSREL"]
slacJsons = jsonDict["SLAC"]
bgmJsons = jsonDict["BGM"]
bsmhJsons = jsonDict["BUSTEDS-MH"]
fmmJsons = jsonDict["FMM"]
cfelJsons = jsonDict["CFEL"]
relaxJsons = jsonDict["RELAX"]

for method, found in jsonDict.items():
    print(f"We found {len(found)} {method} json files...")

print("# Plotting...")

# =============================================================================
# Helper functions
# =============================================================================

def getJsonData(jsonFile):
    """Parse the JSON file at *jsonFile* and return the decoded document."""
    with open(jsonFile, "r") as handle:
        return json.load(handle)
#end method

def getFELData(json_file):
    """Return the value stored at ``MLE -> content -> "0"`` of a FEL JSON file.

    Raises KeyError when any of those keys is missing.
    """
    with open(json_file, "r") as handle:
        document = json.load(handle)
    mle = document["MLE"]
    # NOTE(review): "0" is presumably the first partition -- confirm.
    return mle["content"]["0"]
#end method

def getFELHeaders(json_file):
    """Return the value stored at ``MLE -> headers`` of a FEL JSON file.

    Raises KeyError when either key is missing.
    """
    with open(json_file, "r") as handle:
        mle = json.load(handle)["MLE"]
    return mle["headers"]
#end method

def getMEMEData(json_file):
    """Return the value stored at ``MLE -> content -> "0"`` of a MEME JSON file.

    No existence check is made: a missing file or key surfaces as the
    exception raised by ``open`` or by the dictionary lookup.
    """
    with open(json_file, "r") as handle:
        text = handle.read()
    content = json.loads(text)["MLE"]["content"]
    return content["0"]
#end method

def getMEMEHeaders(json_file):
    """Return the value stored at ``MLE -> headers`` of a MEME JSON file.

    No existence check is made: a missing file or key surfaces as the
    exception raised by ``open`` or by the dictionary lookup.
    """
    with open(json_file, "r") as handle:
        document = json.load(handle)
    mle_section = document["MLE"]
    return mle_section["headers"]
#end method

def getBGMData(json_file):
    """Return the value stored at ``MLE -> content`` of a BGM JSON file.

    Unlike the FEL/MEME readers, the content is returned as a whole rather
    than indexed by ``"0"``.  Raises KeyError when a key is missing.
    """
    with open(json_file, "r") as handle:
        mle = json.load(handle)["MLE"]
    return mle["content"]
#end method

def getBGMHeaders(json_file):
    """Return the value stored at ``MLE -> headers`` of a BGM JSON file.

    Raises KeyError when either key is missing.
    """
    with open(json_file, "r") as handle:
        raw = handle.read()
    return json.loads(raw)["MLE"]["headers"]
#end method

def getBGMInput(json_file):
    """Return the top-level ``input`` entry of a BGM JSON file.

    Raises KeyError when the file has no ``input`` key.
    """
    with open(json_file, "r") as handle:
        document = json.load(handle)
    input_section = document["input"]
    return input_section
#end method

# =============================================================================
# Rule all
# =============================================================================

# Default target: every table, figure and legend for each result JSON that was
# discovered.  The "sample" wildcard is the JSON file's base name, so targets
# look like "<name>.FEL.json.FEL.csv".
rule all:
    input:
        # FEL: table, both figure formats and the figure legend
        expand(os.path.join(OUTDIR_Viz, "{sample}.FEL.{ext}"),
               sample=felJsons,
               ext=["csv", "png", "svg", "figureLegend"]),
        # MEME: table, both figure formats and the figure legend
        expand(os.path.join(OUTDIR_Viz, "{sample}.MEME.{ext}"),
               sample=memeJsons,
               ext=["csv", "png", "svg", "figureLegend"]),
        # BGM: table and both figure formats (no figure legend is requested)
        expand(os.path.join(OUTDIR_Viz, "{sample}.BGM.{ext}"),
               sample=bgmJsons,
               ext=["csv", "png", "svg"]),


# =============================================================================
# Rules
# =============================================================================

# PlotFEL: turn one FEL result JSON into
#   * a per-site CSV table (FEL columns + CodonSite + FDR-adjusted p-value),
#   * a dN/dS-per-site scatter plot with its LB/UB band (SVG and PNG),
#   * a one-sentence figure legend counting significant sites with dN/dS < 1.
# The "sample" wildcard is the base name of the JSON file inside OUTDIR.
rule PlotFEL:
    input:
        input = os.path.join(OUTDIR, "{sample}")
    output:
        output_csv = os.path.join(OUTDIR_Viz, "{sample}.FEL.csv"),
        output_png = os.path.join(OUTDIR_Viz, "{sample}.FEL.png"),
        output_svg = os.path.join(OUTDIR_Viz, "{sample}.FEL.svg"),
        output_figureLegend = os.path.join(OUTDIR_Viz, "{sample}.FEL.figureLegend")
    run:
        # ---- Per-site table -------------------------------------------------
        JSON_FILE = input.input
        # Each header entry is indexable; its first element is the column name.
        headers = [x[0] for x in getFELHeaders(JSON_FILE)]
        df = pd.DataFrame(getFELData(JSON_FILE), columns=headers, dtype=float)
        df.index += 1                # codon sites are numbered from 1
        df["CodonSite"] = df.index

        # fdrcorrection returns (rejected, corrected p-values); only the
        # corrected p-values are kept, and those do not depend on alpha.
        _, adjusted_pvalues = statsmodels.stats.multitest.fdrcorrection(
            df["p-value"].tolist(),
            alpha=pvalueThreshold,
            method='indep',
            is_sorted=False)
        df["adjusted_p-value"] = adjusted_pvalues
        df.to_csv(output.output_csv, index=False)

        # Sites that pass the adjusted threshold and have dN/dS below one.
        significant = df[df["adjusted_p-value"] <= pvalueThreshold]
        negative_sites = significant[significant["dN/dS MLE"] < 1.0]

        # ---- Figure ---------------------------------------------------------
        # The p-value columns are renamed because the colour condition reaches
        # them through attribute access on alt.datum, which cannot contain "-".
        source = df.dropna().rename(columns={"p-value": "p_value",
                                             "adjusted_p-value": "adjusted_p_value"})

        points = alt.Chart(source).mark_circle(
            clip=True,
            opacity=0.9,
            size=80
        ).encode(
            x=alt.X('CodonSite', title="Codon Site"),
            y=alt.Y('dN/dS MLE',
                    title="dN/dS estimate",
                    scale=alt.Scale(domain=(0, 5),
                                    clamp=True,
                                    nice=False,
                                    type="sqrt")),
            # Highlight with the same numeric threshold used for the table
            # filter (previously a hard-coded string "0.1").
            color=alt.condition(alt.datum.adjusted_p_value <= pvalueThreshold,
                                alt.value("red"),
                                alt.value("lightgray"))
        ).properties(width=800,
                     height=600)

        band = alt.Chart(source).mark_area(opacity=0.5).encode(x='CodonSite',
                                                               y='dN/dS LB',
                                                               y2='dN/dS UB')
        chart = (points + band)
        chart.save(output.output_svg)
        chart.save(output.output_png)

        # ---- Figure legend --------------------------------------------------
        total_sites = len(df)
        negative_count = len(negative_sites)
        # An empty table must still yield a legend instead of ZeroDivisionError.
        percent = round((negative_count / total_sites) * 100, 3) if total_sites else 0.0

        with open(output.output_figureLegend, 'w') as file_h:
            # The filter above uses the FDR-adjusted p-value, so the legend
            # names that statistic (same wording as the MEME legend).
            print(f"The FEL analysis of your gene of interest found {negative_count} of {total_sites} ({percent}%) sites to be statistically significant (adjusted p-value <= {pvalueThreshold}) for pervasive negative/purifying selection", file=file_h)
        # end with
    # end run
# end rule

# PlotMEME: turn one MEME result JSON into
#   * a per-site CSV table (MEME columns + CodonSite + FDR-adjusted p-value),
#   * a scatter plot of adjusted p-value per codon site (SVG and PNG),
#   * a one-sentence figure legend counting the significant sites.
# The "sample" wildcard is the base name of the JSON file inside OUTDIR.
rule PlotMEME:
    input:
        input = os.path.join(OUTDIR, "{sample}")
    output:
        output_csv = os.path.join(OUTDIR_Viz, "{sample}.MEME.csv"),
        output_png = os.path.join(OUTDIR_Viz, "{sample}.MEME.png"),
        output_svg = os.path.join(OUTDIR_Viz, "{sample}.MEME.svg"),
        output_figureLegend = os.path.join(OUTDIR_Viz, "{sample}.MEME.figureLegend")
    run:
        # ---- Per-site table -------------------------------------------------
        JSON_FILE = input.input
        # Each header entry is indexable; its first element is the column name.
        headers = [x[0] for x in getMEMEHeaders(JSON_FILE)]
        df = pd.DataFrame(getMEMEData(JSON_FILE), columns=headers, dtype=float)
        df.index += 1                # codon sites are numbered from 1
        df["CodonSite"] = df.index

        # fdrcorrection returns (rejected, corrected p-values); only the
        # corrected p-values are kept, and those do not depend on alpha.
        _, adjusted_pvalues = statsmodels.stats.multitest.fdrcorrection(
            df["p-value"].tolist(),
            alpha=pvalueThreshold,
            method='indep',
            is_sorted=False)
        df["adjusted_p-value"] = adjusted_pvalues
        df.to_csv(output.output_csv, index=False)
        df_results = df[df["adjusted_p-value"] <= pvalueThreshold]

        # ---- Figure ---------------------------------------------------------
        # Reversed "reds" scheme: smaller adjusted p-values are drawn darker.
        chart = alt.Chart(df).mark_point().encode(
            x='CodonSite',
            y='adjusted_p-value',
            color=alt.Color('adjusted_p-value', scale=alt.Scale(scheme='reds', reverse=True))
        ).properties(
            width=800,
            height=600
        )
        chart.save(output.output_svg)
        chart.save(output.output_png)

        # ---- Figure legend --------------------------------------------------
        total_sites = len(df)
        significant_sites = len(df_results)
        with open(output.output_figureLegend, 'w') as file_h:
            print(f"MEME analysis of your gene of interest found {significant_sites} of {total_sites} sites to be statistically significant (adjusted p-value <= {pvalueThreshold})", file=file_h)
        # end with
    # end run
# end rule
        
# PlotBGM: turn one BGM result JSON into
#   * a CSV table of the site pairs whose fifth column (index 4) is at least
#     posteriorThreshold,
#   * a network drawing of those pairs (PNG and SVG) in which edge width and
#     edge colour both encode the "Shared subs" column.
# The "sample" wildcard is the base name of the JSON file inside OUTDIR.
rule PlotBGM:
    input:
        input = os.path.join(OUTDIR, "{sample}")
    output:
        output_csv = os.path.join(OUTDIR_Viz, "{sample}.BGM.csv"),
        output_png = os.path.join(OUTDIR_Viz, "{sample}.BGM.png"),
        output_svg = os.path.join(OUTDIR_Viz, "{sample}.BGM.svg"),
    run:
        # ---- Table of retained site pairs -----------------------------------
        JSON_FILE = input.input
        # Each header entry is indexable; its first element is the column name.
        # The arrow headers contain an en dash; it is normalised to "-" both
        # in its mis-decoded form and in its correctly decoded form (U+2013).
        headers = [x[0].replace('â€“', "-").replace("\u2013", "-")
                   for x in getBGMHeaders(JSON_FILE)]
        df = pd.DataFrame(getBGMData(JSON_FILE), columns=headers, dtype=float)
        df.index += 1

        # NOTE(review): column index 4 is presumably the bidirectional
        # "P [Site 1 <-> Site 2]" posterior -- confirm against the BGM headers.
        source = df[df[df.columns[4]] >= posteriorThreshold].copy()
        source.to_csv(output.output_csv, index=False)

        # ---- Figure ---------------------------------------------------------
        # A dedicated figure that is always closed: re-using pyplot figure 1
        # would let a later job in the same process draw over this network.
        fig, ax = plt.subplots(figsize=(32, 8))  # Width, height
        try:
            if source.empty:
                # Nothing passed the threshold: still write both declared
                # outputs instead of failing in the layout / colour scaling.
                ax.text(0.5, 0.5,
                        f"No site pairs with posterior probability >= {posteriorThreshold}",
                        ha="center", va="center", fontsize=14,
                        transform=ax.transAxes)
            else:
                # Pin every "Site 1" node on a horizontal line, 3 units apart
                # in sorted order (a repeated site keeps its last slot); the
                # remaining nodes are placed by the spring layout.
                fixed_pos = {site: (3 * slot, 0)
                             for slot, site in enumerate(sorted(source["Site 1"].tolist()))}

                G = nx.from_pandas_edgelist(source,
                                            'Site 1',
                                            'Site 2',
                                            edge_attr=["Shared subs"])

                layout = nx.spring_layout(G,
                                          scale=2,
                                          k=1.6,
                                          seed=32,
                                          pos=fixed_pos,
                                          fixed=list(fixed_pos),
                                          iterations=300)

                # One value per edge, used for both line width and colour.
                shared_subs = [attrs['Shared subs'] for _, _, attrs in G.edges(data=True)]
                cmap = plt.cm.viridis
                vmax = np.max(shared_subs)

                nx.draw_networkx(G,
                                 layout,
                                 ax=ax,
                                 with_labels=True,
                                 node_size=500,
                                 font_size=10,
                                 font_weight='normal',
                                 edgecolors="black",
                                 width=shared_subs,
                                 node_color="#A0CBE2",
                                 edge_color=shared_subs,
                                 edge_cmap=cmap,
                                 edge_vmin=0, edge_vmax=vmax)

                # The colorbar uses the same 0..max range as the edges so the
                # bar and the drawn colours agree.
                cbar = fig.colorbar(
                    plt.cm.ScalarMappable(cmap=cmap,
                                          norm=plt.Normalize(vmin=0, vmax=vmax)),
                    ax=ax)
                cbar.ax.tick_params(labelsize=10)
                cbar.ax.set_title('Weight', fontsize=14)

            ax.axis("off")
            fig.savefig(output.output_png)
            fig.savefig(output.output_svg)
        finally:
            plt.close(fig)
        
# =============================================================================
# End of file
# =============================================================================