Source code for choppa.logoplots

import pandas as pd
import logomaker
import matplotlib.pyplot as plt
import matplotlib
import base64
import io
import numpy as np
import math
import logging

matplotlib.set_loglevel("critical")
LOGOPLOT_WITHOUT_CONF_COLORSCHEME = (
    {  # see https://github.com/jbkinney/logomaker/blob/master/logomaker/src/colors.py
        "A": "#f76ab4",
        "C": "#ff7f00",
        "D": "#e41a1c",
        "E": "#e41a1c",
        "F": "#84380b",
        "G": "#f76ab4",
        "H": "#3c58e5",
        "I": "#12ab0d",
        "K": "#3c58e5",
        "L": "#12ab0d",
        "M": "#12ab0d",
        "N": "#972aa8",
        "P": "#12ab0d",
        "Q": "#972aa8",
        "R": "#3c58e5",
        "S": "#ff7f00",
        "T": "#ff7f00",
        "V": "#12ab0d",
        "W": "#84380b",
        "Y": "#84380b",
        "X": "#000000",  # add 'X' so that LogoMaker doesn't log to stdout
        "*": "#000000",  # add '*' so that LogoMaker doesn't log to stdout
    }
)
WHITE_EMPTY_SQUARE = b"iVBORw0KGgoAAAANSUhEUgAAAJYAAACfCAIAAACUbLd9AAAACXBIWXMAAAsTAAALEwEAmpwYAAABhElEQVR4nO3RwQkAIBDAMHX/nc8hfEghmaDQPTOLsvM7gFcW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5l1WGwQ7i50I0AAAAABJRU5ErkJggg=="


[docs] def render_singleres_logoplot(res): """ Renders a simple, single-letter 'logoplot', normally for showing the wildtype residue. The plot is square and is typically rendered top center downstream. """ _, ax = plt.subplots(figsize=(4, 4)) # create Logo object logomaker.Logo( pd.DataFrame({res: 1}, index=[0]), font_name="Sans Serif", color_scheme=LOGOPLOT_WITHOUT_CONF_COLORSCHEME, flip_below=False, show_spines=True, ax=ax, ) plt.xticks([]) plt.yticks([]) # plt.savefig("debug_logoplot.png", dpi=70, bbox_inches="tight") # uncomment for testing # plt object directly to base64 string instead of tmpfile lp_bytes = io.BytesIO() plt.savefig( lp_bytes, format="png", dpi=70, # DPI 70 seems to be ~smallest we can get away with bbox_inches="tight", ) lp_bytes.seek(0) lp_base64 = base64.b64encode(lp_bytes.read()) plt.close() return lp_base64
[docs] class LogoPlot: """ Given a dict with mutants for a given residue, generate logoplots. """
[docs] def __init__(self, residue_dict, fitness_threshold): self.residue_dict = residue_dict self.fitness_threshold = fitness_threshold
[docs] def divide_fitness_types(self): """ Determines which mutants are fit/unfit given the `fitness_threshold` and returns all data required for logoplot generation in a simple dict form. """ # first collect some info on the wildtype residue wildtype = self.residue_dict["wildtype"]["aa"] wildtype_fitness = [ self.residue_dict["wildtype"]["fitness"] ] # make this a list to keep consistent typing if not math.isnan(self.residue_dict["wildtype"]["confidence"]): # if the wildtype has a confidence tag we can assume that all mutant data will also have confidence self.confidence = True wildtype_fitness.append(self.residue_dict["wildtype"]["confidence"]) else: self.confidence = False # get fit and unfit mutants according to `fitness_threshold` unfit_mutants = {} fit_mutants = {} for mutant in self.residue_dict["mutants"]: if self.confidence: # also write confidence into the dict if mutant["fitness"] < self.fitness_threshold: unfit_mutants[mutant["aa"]] = [ mutant["fitness"], mutant["confidence"], ] else: fit_mutants[mutant["aa"]] = [ mutant["fitness"], mutant["confidence"], ] elif ( not self.confidence ): # no confidences, so only write fitness to the dict if mutant["fitness"] < self.fitness_threshold: unfit_mutants[mutant["aa"]] = [ mutant["fitness"] ] # make this a list to keep consistent typing else: fit_mutants[mutant["aa"]] = [mutant["fitness"]] return {wildtype: wildtype_fitness}, unfit_mutants, fit_mutants
[docs] def render_logoplot( self, mutants, global_min_confidence=False, global_max_confidence=False, lhs=True, wildtype=False, ): """ Creates a logoplot as a base64 string. Also annotes with confidence values if present. TODO: nicer rounded ticks agnostic to array limits """ if len(mutants) == 0: # this can happen when there are no mutants in this category. Return an empty white-sqare base64 instead. return WHITE_EMPTY_SQUARE plt.switch_backend("Agg") # prevents plt from opening a figure on OS if ( wildtype ): # we want this to be a bit smaller and square because it'll always have 1 residue. _, ax = plt.subplots(figsize=(4, 4)) if list(mutants.values())[0][0] == 0.0: # in some experiments the fitness values are scaled such that wildtype is 0.0. This breaks # plotting (because of lgooplots being sized by fitness vals), so set wildtype to 1.0 instead. mutants[list(mutants.keys())[0]] = [1.0] else: _, ax = plt.subplots(figsize=(3, 10)) # if there are confidences, we well color the logoplot AA letters by confidence and # show a color bar if this is the left-hand-side logoplot. if self.confidence: if not global_min_confidence or not global_max_confidence: raise ValueError( "If confidence is provided then a global confidence limit needs to be passed to render_logoplot()" ) matplotlib.rcParams.update( {"font.size": 12} ) # instead of doing for ticks/title separately # define a 'mappable' which allows us to generate the colormap before the rest of the plot mappable = plt.cm.ScalarMappable( cmap=matplotlib.colors.LinearSegmentedColormap.from_list( "custom blue", # bit convoluted but this way we force the colormap to be continuous ["#ff6600", "#0066ff"], # between two colors N=256, ), norm=matplotlib.colors.Normalize( vmin=global_min_confidence, vmax=global_max_confidence ), ) if not lhs: # plot the colorbar plt.colorbar( mappable, ticks=np.linspace( global_min_confidence, global_max_confidence, int(len(mutants) / 4), ), ax=ax, ).ax.set_title( " Confidence", y=1.02 ) # indent to make title appear nicely centered. `ha` doesn't get us there # build a dict that has {residue : RGBA color, ..} that we can use to color the logoplot conf_color_per_AA = {k: mappable.to_rgba(v[1]) for k, v in mutants.items()} else: # just use regular coloring if there is no confidence set. conf_color_per_AA = LOGOPLOT_WITHOUT_CONF_COLORSCHEME # create Logo object logomaker.Logo( pd.DataFrame(mutants)[:1], font_name="Sans Serif", color_scheme=conf_color_per_AA, flip_below=False, show_spines=True, ax=ax, ) plt.xticks([]) plt.yticks([]) # plt.savefig("debug_logoplot.png", dpi=70, bbox_inches="tight") # uncomment for testing # plt object directly to base64 string instead of tmpfile lp_bytes = io.BytesIO() plt.savefig( lp_bytes, format="png", dpi=70, # DPI 70 seems to be ~smallest we can get away with bbox_inches="tight", ) lp_bytes.seek(0) lp_base64 = base64.b64encode(lp_bytes.read()) plt.close() return lp_base64
[docs] def build_logoplot(self, global_min_confidence=False, global_max_confidence=False): # determine the wildtype, unfit and fit mutants for this input wildtype, unfit_mutants, fit_mutants = self.divide_fitness_types() # generate the logoplot base64 for wildtype (LHS, top), fit (LHS, bottom) and unfit (RHS; with colorbar) wildtype_base64 = self.render_logoplot( wildtype, global_min_confidence=global_min_confidence, global_max_confidence=global_max_confidence, wildtype=True, ) fit_base64 = self.render_logoplot( fit_mutants, global_min_confidence=global_min_confidence, global_max_confidence=global_max_confidence, ) unfit_base64 = self.render_logoplot( unfit_mutants, global_min_confidence=global_min_confidence, global_max_confidence=global_max_confidence, lhs=False, ) return wildtype_base64, fit_base64, unfit_base64
if __name__ == "__main__": # test a fitness dict with conf values residue_dict = { "fitness_aligned_index": 164, "fitness_csv_index": 160, "wildtype": {"aa": "L", "fitness": 1.0, "confidence": 4221}, "mutants": [ {"aa": "V", "fitness": -1.98, "confidence": 2455}, {"aa": "I", "fitness": -3.3, "confidence": 434}, {"aa": "E", "fitness": -4.52, "confidence": 3706}, {"aa": "Q", "fitness": -3.78, "confidence": 3079}, {"aa": "D", "fitness": 0.56, "confidence": 3615}, {"aa": "N", "fitness": -1.05, "confidence": 3911}, {"aa": "H", "fitness": -0.59, "confidence": 4891}, {"aa": "W", "fitness": -1.88, "confidence": 2627}, {"aa": "F", "fitness": -0.56, "confidence": 2663}, {"aa": "Y", "fitness": 0.66, "confidence": 4534}, {"aa": "R", "fitness": -0.73, "confidence": 11}, {"aa": "K", "fitness": 0.89, "confidence": 3763}, {"aa": "S", "fitness": -1.77, "confidence": 2352}, {"aa": "T", "fitness": -1.16, "confidence": 3843}, {"aa": "M", "fitness": -2.11, "confidence": 4018}, {"aa": "A", "fitness": -3.33, "confidence": 4132}, {"aa": "G", "fitness": -0.44, "confidence": 3251}, {"aa": "P", "fitness": -0.0, "confidence": 3817}, {"aa": "C", "fitness": -1.5, "confidence": 3445}, {"aa": "X", "fitness": -0.37, "confidence": 3281}, ], } LogoPlot(residue_dict, fitness_threshold=0.7).build_logoplot( global_min_confidence=10, global_max_confidence=5000 )