Source code for choppa.IO.input

import pandas as pd
from typing import Optional
from pathlib import Path
import numpy as np
import logging, sys
from collections import OrderedDict

# Configure the root logger to write records of level INFO and above to stdout.
# NOTE(review): this executes at import time, so importing this module touches
# the process-wide logging configuration -- confirm that this is intended.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# `getLogger()` without a name returns the root logger; every class below logs through it.
logger = logging.getLogger()


class FitnessFactory:
    """
    Base class for handling Fitness data within `choppa`.

    Reads a fitness CSV file, validates the requested columns and converts the
    data into a per-residue `OrderedDict` (the "fitness basedict").

    Parameters
    ----------
    input_fitness_csv
        Path to the CSV file holding the fitness data.
    resindex_colname, wildtype_colname, mutant_colname, fitness_colname
        Names of the CSV columns holding the residue index, the wildtype amino
        acid, the mutant amino acid and the fitness value, respectively.
    confidence_colname
        Name of the CSV column holding confidence values. When `None`, a
        `confidence` column filled with NaN is created while reading.
    """

    def __init__(
        self,
        input_fitness_csv: Path,
        resindex_colname: Optional[str] = "residue_index",
        wildtype_colname: Optional[str] = "wildtype",
        mutant_colname: Optional[str] = "mutant",
        fitness_colname: Optional[str] = "fitness",
        confidence_colname: Optional[str] = None,
    ):
        self.input_fitness_csv = input_fitness_csv

        self.resindex_colname = resindex_colname
        self.wildtype_colname = wildtype_colname
        self.mutant_colname = mutant_colname
        self.fitness_colname = fitness_colname
        self.confidence_colname = confidence_colname

        # makes it easier to track whether confidence values are provided or set to NaN by us
        self.confidence_set = confidence_colname is not None

        # populated by `check_validity` once the data has passed all checks
        self.fitness_df = None

    @staticmethod
    def _non_numeric_rows(fitness_df, colname):
        """Return the rows of `fitness_df` whose `colname` entry does not coerce to a number."""
        return fitness_df[pd.to_numeric(fitness_df[colname], errors="coerce").isnull()]

    def check_validity(self, fitness_df):
        """
        Does some quick checks to make sure the imported CSV file is valid.

        On success the validated data (requested columns only, renamed to
        `residue_index`, `wildtype`, `mutant`, `fitness` and `confidence`) is
        stored on `self.fitness_df`.

        Returns
        -------
        bool
            Always `True`; failures are reported by raising.

        Raises
        ------
        KeyError
            If any of the requested columns is absent from `fitness_df`.
        ValueError
            If a non-confidence column has missing values, or if fitness (or
            user-provided confidence) values are not numeric.
        """
        if self.confidence_colname not in fitness_df.columns:
            raise KeyError(
                f"Column {self.confidence_colname} not found in {self.input_fitness_csv}"
            )

        # quick check for columns
        non_confidence_columns = [
            self.resindex_colname,
            self.wildtype_colname,
            self.mutant_colname,
            self.fitness_colname,
        ]
        missing_columns = [
            colname
            for colname in non_confidence_columns
            if colname not in fitness_df.columns
        ]
        if missing_columns:
            raise KeyError(
                f"Column(s) {missing_columns} not found in {self.input_fitness_csv}"
            )

        # keep only the requested columns; copy so that the assignment below
        # writes to our own frame and not to a slice of the caller's
        fitness_df = fitness_df[
            non_confidence_columns + [self.confidence_colname]
        ].copy()

        # check for NaNs *before* the integer cast below: casting a NaN residue
        # index fails with an unrelated conversion error
        rows_with_missing = fitness_df[non_confidence_columns].isnull().any(axis=1)
        if rows_with_missing.any():
            raise ValueError(
                f"Found missing values in input CSV: {fitness_df[rows_with_missing]}"
            )

        # if residue_index is a float (ran into this use-case), convert to int.
        fitness_df[self.resindex_colname] = fitness_df[self.resindex_colname].astype(
            np.int64
        )

        # check that fitness (and confidence) data is scalar
        bad_fitness = self._non_numeric_rows(fitness_df, self.fitness_colname)
        if len(bad_fitness) > 0:
            raise ValueError(
                f"Found non-numeric fitness values in input CSV: {bad_fitness}"
            )

        # confidence is only checked when the user supplied it; a NaN column
        # that we created ourselves would fail this check by construction
        if self.confidence_set:
            bad_confidence = self._non_numeric_rows(
                fitness_df, self.confidence_colname
            )
            if len(bad_confidence) > 0:
                raise ValueError(
                    f"Found non-numeric confidence values in input CSV: {bad_confidence}"
                )

        # if checks reach this point then the input data should be correctly formatted.
        # Rename and adopt. The confidence column is renamed as well because
        # `get_fitness_basedict` looks it up under the fixed name "confidence".
        fitness_df = fitness_df.rename(
            columns={
                self.resindex_colname: "residue_index",
                self.wildtype_colname: "wildtype",
                self.mutant_colname: "mutant",
                self.fitness_colname: "fitness",
                self.confidence_colname: "confidence",
            }
        )
        self.fitness_df = fitness_df
        return True

    def read_fitness_csv(self):
        """
        Reads in a fitness CSV file and checks that all requested columns are present,
        complete and numeric.

        Returns
        -------
        pandas.DataFrame
            The validated fitness data (also stored on `self.fitness_df`).

        Raises
        ------
        KeyError, ValueError
            Propagated from `check_validity`.
        """
        # read in the complete fitness data. This may have many columns
        logger.info(f"Reading in fitness data from {self.input_fitness_csv}")
        fitness_df = pd.read_csv(self.input_fitness_csv)

        # if no confidence is provided, just set to NaN. Keyed on `confidence_set`
        # and not on `confidence_colname`, because the latter is overwritten
        # below and would make a second read skip this step.
        if not self.confidence_set:
            fitness_df["confidence"] = np.nan
            self.confidence_colname = "confidence"

        # check whether the CSV file is correct
        if self.check_validity(fitness_df):
            logger.info(f"Successfully read fitness data:\n{self.fitness_df}")
        return self.fitness_df

    def get_fitness_basedict(self):
        """
        Converts a `pandas` fitness dataframe (read by `FitnessFactory.read_fitness_csv`) into
        a `fitness basedict` which is essentially just an `OrderedDict`. We want the dict to have the form:

        { residue_index:
            {
                fitness_csv_index, # this is the original index in the fitness CSV for provenance
                wildtype: {AA, fitness, confidence},
                mutants: [{AA, fitness, confidence}, {AA, fitness, confidence}, etc]
            }
        }

        Note that this re-reads the CSV file on every call.
        """
        fitness_basedict = OrderedDict()
        for residx, res_df in self.read_fitness_csv().groupby(by="residue_index"):
            # first construct the dict entry for the wildtype
            wildtype = res_df["wildtype"].values[0]
            wildtype_rows = res_df[res_df["mutant"] == wildtype]

            if len(wildtype_rows) == 0:
                # if there is no wildtype mutation available, the experimentalists have
                # omitted it and set the value implicitly as 0. The confidence is
                # borrowed from the first row of this residue.
                wt_fitness = 0.0
                wt_confidence = res_df["confidence"].values[0]
            else:
                wt_fitness = wildtype_rows["fitness"].values[0]
                wt_confidence = wildtype_rows["confidence"].values[0]

            wt_dict = {
                "aa": wildtype,
                "fitness": wt_fitness,
                "confidence": wt_confidence,
            }

            # now construct the mutant list of dict entries for this residue index
            # while excluding the wildtype entry
            mutant_list = [
                {"aa": mut, "fitness": fitness, "confidence": conf}
                for mut, fitness, conf in res_df[
                    ["mutant", "fitness", "confidence"]
                ].values
                if mut != wildtype
            ]

            fitness_basedict[residx] = {
                "fitness_csv_index": residx,  # double now, but will be helpful for provenance after alignment
                "wildtype": wt_dict,
                "mutants": mutant_list,
            }

        logger.info(
            f"Created fitness dictionary as `FitnessFactory` of length {len(fitness_basedict)}"
        )
        return fitness_basedict
from Bio.PDB import PDBParser from rdkit import Chem
class ComplexFactory:
    """
    Loads a complex structure from a PDB file, either through Biopython
    (`load_pdb`) or through RDKit (`load_pdb_rdkit`).

    Parameters
    ----------
    path_to_pdb_file
        Path to the PDB file to load.
    """

    def __init__(
        self,
        path_to_pdb_file: Path,
    ):
        self.path_to_pdb_file = path_to_pdb_file

    @staticmethod
    def remove_waters(system):
        """[Placeholder] Returns a system with water entries removed"""
        # declared static: the function takes no `self`, so without the
        # decorator it could not be called on an instance
        return system

    @staticmethod
    def extract_ligands(system):
        """[Placeholder] Returns a system's ligands"""
        # static for the same reason as `remove_waters`
        return system

    def check_validity(self, complex):
        """
        [Placeholder] Does some quick checks to make sure the imported PDB structure is valid.
        We're not doing any kind of protein prep, just whether biopython _is able to_ read
        the PDB file and we try to figure out what entry names the solvent/ligands
        have (if there are any)

        Currently returns `complex` unchanged.
        """
        return complex

    def load_pdb(self):
        """
        Loads an input PDB file

        Returns the structure produced by Biopython's `PDBParser.get_structure`
        (structure id "COMPLEX") after passing it through `check_validity`.
        """
        structure = PDBParser(QUIET=False).get_structure(
            "COMPLEX", self.path_to_pdb_file
        )
        self.check_validity(structure)
        return structure

    def load_pdb_rdkit(self):
        """
        Loads an input PDB file to an RDKit object for easier string retrieval

        Sanitization is disabled (`sanitize=False`).
        """
        # hand RDKit a plain string: `path_to_pdb_file` is annotated as a `Path`
        # and the RDKit binding is documented with a string file name
        return Chem.MolFromPDBFile(str(self.path_to_pdb_file), sanitize=False)
if __name__ == "__main__":
    # Manual smoke run against the toy data shipped with `choppa`: build the
    # fitness basedict from the complete toy CSV, then load the toy complex.
    from choppa.data.toy_data.resources import (
        TOY_COMPLEX,
        TOY_FITNESS_DATA_COMPLETE,
        TOY_FITNESS_DATA_COMPLETE_NOCONF,
    )

    fitness_factory = FitnessFactory(
        TOY_FITNESS_DATA_COMPLETE,
        confidence_colname="confidence",
    )
    fitness_df = fitness_factory.get_fitness_basedict()

    complex_factory = ComplexFactory(TOY_COMPLEX)
    complex = complex_factory.load_pdb()