import pandas as pd
import logomaker
import matplotlib.pyplot as plt
import matplotlib
import base64
import io
import numpy as np
import math
import logging
matplotlib.set_loglevel("critical")
# Per-residue letter colours used when the logoplot is not coloured by confidence.
# Palette: see https://github.com/jbkinney/logomaker/blob/master/logomaker/src/colors.py
# Residues that share a colour are listed together; the final sort restores the
# residue order in which the scheme is exposed.
LOGOPLOT_WITHOUT_CONF_COLORSCHEME = dict(
    sorted(
        (
            (residue, color)
            for color, residues in (
                ("#f76ab4", "AG"),
                ("#ff7f00", "CST"),
                ("#e41a1c", "DE"),
                ("#84380b", "FWY"),
                ("#3c58e5", "HKR"),
                ("#12ab0d", "ILMPV"),
                ("#972aa8", "NQ"),
                # 'X' and '*' are listed so that LogoMaker doesn't log to stdout
                ("#000000", "X*"),
            )
            for residue in residues
        ),
        key=lambda pair: "ACDEFGHIKLMNPQRSTVWYX*".index(pair[0]),
    )
)
WHITE_EMPTY_SQUARE = b"iVBORw0KGgoAAAANSUhEUgAAAJYAAACfCAIAAACUbLd9AAAACXBIWXMAAAsTAAALEwEAmpwYAAABhElEQVR4nO3RwQkAIBDAMHX/nc8hfEghmaDQPTOLsvM7gFcW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5l1WGwQ7i50I0AAAAABJRU5ErkJggg=="
def render_singleres_logoplot(res):
    """
    Render a simple, single-letter 'logoplot', normally for showing the wildtype residue.

    The plot is square and is typically rendered top center downstream.

    Args:
        res: The residue letter to draw. It becomes the single column of the
            one-row matrix handed to logomaker (with a height of 1).

    Returns:
        bytes: The plot as a base64-encoded PNG.
    """
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        # create Logo object
        logomaker.Logo(
            pd.DataFrame({res: 1}, index=[0]),
            font_name="Sans Serif",
            color_scheme=LOGOPLOT_WITHOUT_CONF_COLORSCHEME,
            flip_below=False,
            show_spines=True,
            ax=ax,
        )
        # operate on our own axes/figure rather than pyplot's "current" ones
        ax.set_xticks([])
        ax.set_yticks([])
        # fig.savefig("debug_logoplot.png", dpi=70, bbox_inches="tight")  # uncomment for testing
        # figure directly to an in-memory PNG instead of a tmpfile
        lp_bytes = io.BytesIO()
        fig.savefig(
            lp_bytes,
            format="png",
            dpi=70,  # DPI 70 seems to be ~smallest we can get away with
            bbox_inches="tight",
        )
    finally:
        # always release the figure, also when logomaker/savefig raises
        plt.close(fig)
    return base64.b64encode(lp_bytes.getvalue())
class LogoPlot:
    """
    Given a dict with mutants for a given residue, generate logoplots.

    ``residue_dict`` is read as::

        {
            "wildtype": {"aa": "L", "fitness": 1.0, "confidence": 4221},
            "mutants": [
                {"aa": "V", "fitness": -1.98, "confidence": 2455},
                ...
            ],
        }

    A wildtype confidence of NaN (or None / a missing key) means that the data
    carries no confidence values at all.
    """

    def __init__(self, residue_dict, fitness_threshold):
        """
        Args:
            residue_dict: Wildtype and mutant data for one residue (see class docstring).
            fitness_threshold: Mutants with a fitness below this value count as unfit.
        """
        self.residue_dict = residue_dict
        self.fitness_threshold = fitness_threshold
        # Properly determined by divide_fitness_types(); defaulting to False lets
        # render_logoplot() be called on its own without an AttributeError.
        self.confidence = False

    def divide_fitness_types(self):
        """
        Determines which mutants are fit/unfit given the `fitness_threshold` and returns
        all data required for logoplot generation in a simple dict form.

        Also sets ``self.confidence``: True when the wildtype carries a usable
        (non-NaN) confidence, in which case every mutant is expected to have one too.

        Returns:
            tuple: ``(wildtype, unfit_mutants, fit_mutants)``. Each is a dict mapping
            the residue letter to ``[fitness]`` or, with confidences,
            ``[fitness, confidence]``. Unfit mutants have
            ``fitness < fitness_threshold``; all others are fit.
        """
        wildtype_entry = self.residue_dict["wildtype"]
        wildtype_confidence = wildtype_entry.get("confidence")
        # if the wildtype has a confidence tag we can assume that all mutant data will also have confidence
        self.confidence = wildtype_confidence is not None and not math.isnan(
            wildtype_confidence
        )

        def as_values(entry):
            # always a list (with or without confidence) to keep consistent typing
            values = [entry["fitness"]]
            if self.confidence:
                values.append(entry["confidence"])
            return values

        # get fit and unfit mutants according to `fitness_threshold`
        unfit_mutants = {}
        fit_mutants = {}
        for mutant in self.residue_dict["mutants"]:
            if mutant["fitness"] < self.fitness_threshold:
                unfit_mutants[mutant["aa"]] = as_values(mutant)
            else:
                fit_mutants[mutant["aa"]] = as_values(mutant)
        return {wildtype_entry["aa"]: as_values(wildtype_entry)}, unfit_mutants, fit_mutants

    @staticmethod
    def _limit_missing(limit):
        """Tell whether a confidence limit was left unset; 0 is a valid limit."""
        return limit is None or limit is False

    def render_logoplot(
        self,
        mutants,
        global_min_confidence=False,
        global_max_confidence=False,
        lhs=True,
        wildtype=False,
    ):
        """
        Creates a logoplot as a base64 string. Also annotates with confidence values if present.

        Args:
            mutants: Dict of residue letter -> ``[fitness]`` or ``[fitness, confidence]``,
                as produced by divide_fitness_types(). The dict is not modified.
            global_min_confidence: Lower end of the confidence colour scale. Required
                (0 is allowed) when ``self.confidence`` is set.
            global_max_confidence: Upper end of the confidence colour scale. Required
                when ``self.confidence`` is set.
            lhs: Whether this is the left-hand-side plot. The confidence colorbar is
                only drawn when this is False.
            wildtype: Render the small square wildtype plot instead of the tall one.

        Returns:
            bytes: Base64-encoded PNG, or ``WHITE_EMPTY_SQUARE`` if `mutants` is empty.

        Raises:
            ValueError: If confidences are in use but a global limit was not passed.

        TODO: nicer rounded ticks agnostic to array limits
        """
        if not mutants:
            # this can happen when there are no mutants in this category. Return an empty white-square base64 instead.
            return WHITE_EMPTY_SQUARE
        # Validate before touching matplotlib. Explicit sentinel checks, because a
        # plain truthiness test would also reject a perfectly valid limit of 0.
        if self.confidence and (
            self._limit_missing(global_min_confidence)
            or self._limit_missing(global_max_confidence)
        ):
            raise ValueError(
                "If confidence is provided then a global confidence limit needs to be passed to render_logoplot()"
            )

        # work on a copy so that the caller's dict is never altered
        mutants = {aa: list(values) for aa, values in mutants.items()}
        if wildtype:
            wildtype_values = next(iter(mutants.values()))
            if wildtype_values[0] == 0.0:
                # in some experiments the fitness values are scaled such that wildtype is 0.0. This breaks
                # plotting (because of logoplots being sized by fitness vals), so set wildtype to 1.0 instead.
                # Only the fitness is replaced; a confidence value (if any) must survive for the colouring.
                wildtype_values[0] = 1.0

        plt.switch_backend("Agg")  # prevents plt from opening a figure on OS
        # the wildtype plot is a bit smaller and square because it'll always have 1 residue.
        fig, ax = plt.subplots(figsize=(4, 4) if wildtype else (3, 10))
        try:
            # if there are confidences, we will color the logoplot AA letters by confidence and
            # show a color bar if this is not the left-hand-side logoplot.
            if self.confidence:
                # instead of doing for ticks/title separately; note that rcParams are
                # matplotlib-wide, so this setting stays in effect after this call
                matplotlib.rcParams.update({"font.size": 12})
                # define a 'mappable' which allows us to generate the colormap before the rest of the plot
                mappable = plt.cm.ScalarMappable(
                    cmap=matplotlib.colors.LinearSegmentedColormap.from_list(
                        "custom blue",  # bit convoluted but this way we force the colormap to be continuous
                        ["#ff6600", "#0066ff"],  # between two colors
                        N=256,
                    ),
                    norm=matplotlib.colors.Normalize(
                        vmin=global_min_confidence, vmax=global_max_confidence
                    ),
                )
                if not lhs:
                    # plot the colorbar; always label at least both ends of the scale,
                    # also when there are only a handful of mutants
                    fig.colorbar(
                        mappable,
                        ticks=np.linspace(
                            global_min_confidence,
                            global_max_confidence,
                            max(2, len(mutants) // 4),
                        ),
                        ax=ax,
                    ).ax.set_title(
                        " Confidence", y=1.02
                    )  # indent to make title appear nicely centered. `ha` doesn't get us there
                # build a dict that has {residue : RGBA color, ..} that we can use to color the logoplot
                conf_color_per_AA = {
                    aa: mappable.to_rgba(values[1]) for aa, values in mutants.items()
                }
            else:
                # just use regular coloring if there is no confidence set.
                conf_color_per_AA = LOGOPLOT_WITHOUT_CONF_COLORSCHEME
            # create Logo object; only the first row (the fitness values) sizes the letters
            logomaker.Logo(
                pd.DataFrame(mutants)[:1],
                font_name="Sans Serif",
                color_scheme=conf_color_per_AA,
                flip_below=False,
                show_spines=True,
                ax=ax,
            )
            ax.set_xticks([])
            ax.set_yticks([])
            # fig.savefig("debug_logoplot.png", dpi=70, bbox_inches="tight")  # uncomment for testing
            # figure directly to an in-memory PNG instead of a tmpfile
            lp_bytes = io.BytesIO()
            fig.savefig(
                lp_bytes,
                format="png",
                dpi=70,  # DPI 70 seems to be ~smallest we can get away with
                bbox_inches="tight",
            )
        finally:
            # always release the figure, also when plotting fails half-way
            plt.close(fig)
        return base64.b64encode(lp_bytes.getvalue())

    def build_logoplot(self, global_min_confidence=False, global_max_confidence=False):
        """
        Split the residue data by fitness and render all three logoplots.

        Args:
            global_min_confidence: Passed on to render_logoplot().
            global_max_confidence: Passed on to render_logoplot().

        Returns:
            tuple: ``(wildtype_base64, fit_base64, unfit_base64)``.
        """
        # determine the wildtype, unfit and fit mutants for this input
        wildtype, unfit_mutants, fit_mutants = self.divide_fitness_types()
        # generate the logoplot base64 for wildtype (LHS, top), fit (LHS, bottom) and unfit (RHS; with colorbar)
        wildtype_base64 = self.render_logoplot(
            wildtype,
            global_min_confidence=global_min_confidence,
            global_max_confidence=global_max_confidence,
            wildtype=True,
        )
        fit_base64 = self.render_logoplot(
            fit_mutants,
            global_min_confidence=global_min_confidence,
            global_max_confidence=global_max_confidence,
        )
        unfit_base64 = self.render_logoplot(
            unfit_mutants,
            global_min_confidence=global_min_confidence,
            global_max_confidence=global_max_confidence,
            lhs=False,
        )
        return wildtype_base64, fit_base64, unfit_base64
if __name__ == "__main__":
    # smoke test: render the logoplots for a residue whose data carries confidence values
    # one row per mutant: (amino acid, fitness, confidence)
    mutant_rows = (
        ("V", -1.98, 2455),
        ("I", -3.3, 434),
        ("E", -4.52, 3706),
        ("Q", -3.78, 3079),
        ("D", 0.56, 3615),
        ("N", -1.05, 3911),
        ("H", -0.59, 4891),
        ("W", -1.88, 2627),
        ("F", -0.56, 2663),
        ("Y", 0.66, 4534),
        ("R", -0.73, 11),
        ("K", 0.89, 3763),
        ("S", -1.77, 2352),
        ("T", -1.16, 3843),
        ("M", -2.11, 4018),
        ("A", -3.33, 4132),
        ("G", -0.44, 3251),
        ("P", -0.0, 3817),
        ("C", -1.5, 3445),
        ("X", -0.37, 3281),
    )
    residue_dict = {
        "fitness_aligned_index": 164,
        "fitness_csv_index": 160,
        "wildtype": {"aa": "L", "fitness": 1.0, "confidence": 4221},
        "mutants": [
            {"aa": aa, "fitness": fitness, "confidence": confidence}
            for aa, fitness, confidence in mutant_rows
        ],
    }
    plotter = LogoPlot(residue_dict, fitness_threshold=0.7)
    plotter.build_logoplot(global_min_confidence=10, global_max_confidence=5000)